[What are DataFrames](https://www.youtube.com/live/OG31yhRQxPI?si=43TM_mXSlIQFqlLq)
- A single row or a single column of a DataFrame is a Series

In [91]:
import numpy as np
import pandas as pd

# Creating DataFrames

In [92]:
# using lists: every inner list is one row of the table
languages_perferences = [
    ['Python', 1992, 'Golden💛'],
    ['JS', 1995, 'WebGod🕸️'],
    ['C++', 1986, 'Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻'],
    ['C', 1980, 'Creator⚡'],
]
# one header per position inside a row (C1, C2, C3 -> column 1, 2, 3)
column_names = ['Language', 'Year', 'Feeling']
pd.DataFrame(data=languages_perferences, columns=column_names)



Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [93]:
# using dictionary
# each key becomes a column name; its list holds that column's values from top to bottom
languages_perferences = dict(
    Language=['Python', 'JS', 'C++', 'C'],
    Year=[1992, 1995, 1986, 1980],
    Feeling=['Golden💛', 'WebGod🕸️', 'Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻', 'Creator⚡'],
)

pd.DataFrame(data=languages_perferences)

Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [94]:
# Round-trip the data through a CSV file.
# The file is written first so that the read below also works on a fresh kernel or a
# fresh checkout where 'languages_perferences.csv' does not exist yet.
# index=False keeps the row index out of the file, so reading it back yields only the
# three data columns.
pd.DataFrame(languages_perferences).to_csv('languages_perferences.csv', index=False, sep=',')

langs = pd.read_csv('languages_perferences.csv')
langs

Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


# DataFrame Attributes and Methods

In [95]:
# shape -> tuple of (number of rows, number of columns)
langs.shape # here the data has 4 rows and 3 columns

(4, 3)

In [96]:
# dtypes
# gives the data type of each column (every column is a Series)
langs.dtypes

Language    object
Year         int64
Feeling     object
dtype: object

In [97]:
# index -> the row labels of the DataFrame
langs.index

RangeIndex(start=0, stop=4, step=1)

In [98]:
# columns -> the column labels of the DataFrame
langs.columns

Index(['Language', 'Year', 'Feeling'], dtype='object')

In [99]:
# values
langs.values # returns the data as a 2D NumPy array, one inner array per row

array([['Python', 1992, 'Golden💛'],
       ['JS', 1995, 'WebGod🕸️'],
       ['C++', 1986, 'Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻'],
       ['C', 1980, 'Creator⚡']], dtype=object)

In [100]:
# head and tail
langs.head(-2) # a negative n returns every row except the last 2
# tail accepts a negative n in the same way (it skips rows from the top instead)


Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️


In [101]:
langs.tail(-2) # a negative n returns every row except the first 2

Unnamed: 0,Language,Year,Feeling
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [102]:
# sample -> 2 rows picked at random
# NOTE: no random_state is passed, so a different pair of rows may come back on every run
langs.sample(2)

Unnamed: 0,Language,Year,Feeling
3,C,1980,Creator⚡
0,Python,1992,Golden💛


In [103]:
# Understand `info` -> https://www.youtube.com/live/OG31yhRQxPI?si=UVygY80TQSqfueu7&t=1471
# info prints the index range, each column's non-null count and dtype, and the memory usage
langs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Language  4 non-null      object
 1   Year      4 non-null      int64 
 2   Feeling   4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [104]:
# describe computes summary statistics (count, mean, std, min, quartiles, max);
# by default only the numeric columns are included, which is why only `Year` appears
langs.describe()

Unnamed: 0,Year
count,4.0
mean,1988.25
std,6.652067
min,1980.0
25%,1984.5
50%,1989.0
75%,1992.75
max,1995.0


In [105]:
# isnull returns a boolean DataFrame (True where a value is missing)
print(langs.isnull().sum()) # sum counts every True as 1 and adds column by column, giving the number of missing values per column
langs.isnull()

Language    0
Year        0
Feeling     0
dtype: int64


Unnamed: 0,Language,Year,Feeling
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [106]:
# duplicated
print(langs.duplicated()) # boolean Series: True if a row repeats an earlier row, else False
print(langs.duplicated().sum()) # total number of duplicate rows (counts the True values of that Series)

0    False
1    False
2    False
3    False
dtype: bool
0


In [107]:
# rename column names
# rename returns a new DataFrame; it is reassigned to `langs` instead of using
# inplace=True, which offers no benefit and prevents method chaining.

langs = langs.rename(columns={'Language':'Language Name', 'Year':'Year of Release', 'Feeling':'Tag'})
print(langs.columns)
langs

Index(['Language Name', 'Year of Release', 'Tag'], dtype='object')


Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [108]:
langs.set_index('Language Name') # returns a new DataFrame with this column as the index; langs itself is not modified

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Python,1992,Golden💛
JS,1995,WebGod🕸️
C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
C,1980,Creator⚡


# Mathematical Functions

In [109]:
# sum -> the axis argument selects the direction of the reduction

new_df = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
    dtype=np.int64,
)

column_totals = new_df.sum(axis=0)  # one total per column (axis=0 is the default)
row_totals = new_df.sum(axis=1)     # one total per row
print('Axis = 0\n', column_totals, '\n')
print('Axis = 1\n', row_totals, '\n')
new_df



Axis = 0
 A     6
B    15
C    24
dtype: int64 

Axis = 1
 0    12
1    15
2    18
dtype: int64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [110]:
# mean along each axis

column_means = new_df.mean(axis=0)  # one mean per column
row_means = new_df.mean(axis=1)     # one mean per row
print('Axis = 0\n', column_means, '\n')
print('Axis = 1\n', row_means, '\n')

new_df

Axis = 0
 A    2.0
B    5.0
C    8.0
dtype: float64 

Axis = 1
 0    4.0
1    5.0
2    6.0
dtype: float64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [111]:
# min and max; variance (new_df.var()) and standard deviation (new_df.std()) are called the same way

column_minimums = new_df.min(axis=0)  # smallest value in each column
row_maximums = new_df.max(axis=1)     # largest value in each row
print('Axis = 0\n', column_minimums, '\n')
print('Axis = 1\n', row_maximums, '\n')

new_df

Axis = 0
 A    1
B    4
C    7
dtype: int64 

Axis = 1
 0    7
1    8
2    9
dtype: int64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


Unnamed: 0,A,C
0,1,7
1,2,8
2,3,9


# Indexing and Slicing in DataFrame

In [135]:
new_df # show the whole frame before selecting parts of it

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [136]:
# fetch specific columns: indexing with a list of column names returns a DataFrame with just those columns
new_df[['A', 'C']]

Unnamed: 0,A,C
0,1,7
1,2,8
2,3,9


In [None]:
# iloc selects rows/columns by integer position (0-based), not by the index labels
# shown on the left-most side of the DataFrame.
# NumPy comparison: with a = np.arange(10).reshape(2, 5)  (2 rows, 5 columns),
# a[[0, 1], [1, 2]] pairs the two index lists element by element, so in NumPy fancy
# indexing the lists must have the same length (or be broadcastable, e.g. when one
# of them holds a single index).

# single row -> returned as a Series whose index is the column names
print(langs.iloc[0],'\n')
# single cell -> first row, last column
print(langs.iloc[0,-1],'\n')
langs

Language Name       Python
Year of Release       1992
Tag                Golden💛
Name: 0, dtype: object 

Golden💛 



Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [147]:
# multiple rows: the first 2 rows and the first 2 columns (with iloc the end position is excluded)
langs.iloc[:2,:2]

Unnamed: 0,Language Name,Year of Release
0,Python,1992
1,JS,1995


In [None]:
# fancy indexing
# pandas fancy indexing works a little differently from NumPy: in NumPy the index lists are paired element by element and must have the same length (or be broadcastable), whereas iloc treats the row list and the column list independently

print(langs.iloc[[0,-1],[0,0,-1]]) # rows: first and last; columns: first, first again, and last

  Language Name Language Name       Tag
0        Python        Python   Golden💛
3             C             C  Creator⚡


In [None]:
langs.iloc[[True, False, True, False], :] # boolean mask with iloc: keeps the rows whose position is marked True, with all columns

Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻


In [120]:
# loc selects by label (the index values shown on the left-most side of the DataFrame) instead of by integer position

newLangs = langs.set_index('Language Name') # language name as index
newLangs.loc[['Python', 'JS']]

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Python,1992,Golden💛
JS,1995,WebGod🕸️


In [121]:
# slicing using loc: both the start and the end label are included; the step of 2 takes every second row
newLangs.loc['JS':'C':2]  

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
JS,1995,WebGod🕸️
C,1980,Creator⚡


In [122]:
# fancy indexing using loc: a list of row labels plus a list of column labels
# Only the last expression of a cell is displayed automatically, so this result is
# printed explicitly; otherwise it would be computed and silently discarded.
print(newLangs.loc[['Python', 'JS'], ['Year of Release']])

# reset_index restores the default integer index; drop=True discards the old index
# ('Language Name') instead of turning it back into a column
newLangs.reset_index(drop=True)

Unnamed: 0,Year of Release,Tag
0,1992,Golden💛
1,1995,WebGod🕸️
2,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,1980,Creator⚡


In [None]:
# boolean indexing works with loc as well
print(langs['Year of Release'] > 1990) # boolean Series: True where the release year is after 1990
langs.loc[langs['Year of Release'] > 1990] # filtering data: keeps only the rows where the mask is True

0     True
1     True
2    False
3    False
Name: Year of Release, dtype: bool


Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️


In [None]:
# iloc vs loc main difference -> https://www.youtube.com/live/OG31yhRQxPI?si=5B5dVmJprCluLCG0&t=3821