[What are DataFrames](https://www.youtube.com/live/OG31yhRQxPI?si=43TM_mXSlIQFqlLq)
- A single row and column of dataFrame is a series

In [118]:
import numpy as np
import pandas as pd

# Creating DataFrames

In [119]:
# using lists
languages_perferences = [
    ['Python',1992,'Golden💛'], # first row 
    ['JS',1995,'WebGod🕸️'], # second row 
    ['C++',1986,'Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻'], # third row 
    ['C',1980,'Creator⚡'], # fourth row 
]                                            #  C1        C2        C3       (C-> column)
pd.DataFrame(languages_perferences,columns=['Language','Year','Feeling'])



Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [120]:
# using dictionary
# key will be column name, rows will be values
languages_perferences = {
    'Language': ['Python', 'JS', 'C++', 'C'],
    'Year': [1992, 1995, 1986, 1980],
    'Feeling':['Golden💛','WebGod🕸️','Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻','Creator⚡']
}

pd.DataFrame(languages_perferences)

Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [121]:
# you can also create dataframes, that only contain columns, it's kind of like first you can initialize then you can assign values (rows here)
import pandas as pd
empty_df = pd.DataFrame(columns=['Language','Year','Feeling'])
empty_df

Unnamed: 0,Language,Year,Feeling


In [122]:
# now you can assign rows
empty_df.loc[1] = ['Python',1992,'Golden💛'] # first row
empty_df

Unnamed: 0,Language,Year,Feeling
1,Python,1992,Golden💛


In [123]:
# for converting dataframe to csv file
# pd.DataFrame(languages_perferences).to_csv('languages_perferences.csv',index=False,sep=',') # for saving in csv file

langs = pd.read_csv(r'languages_perferences.csv')
langs

Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


# DataFrame Attributes and Methods

In [124]:
# shape
langs.shape # means data is divided into 4 rows and 3 columns

(4, 3)

In [125]:
# dtypes
# gives datatype of each column(series)
langs.dtypes

Language    object
Year         int64
Feeling     object
dtype: object

In [126]:
# index
langs.index

RangeIndex(start=0, stop=4, step=1)

In [127]:
# columns
langs.columns

Index(['Language', 'Year', 'Feeling'], dtype='object')

In [128]:
# values
langs.values # give values of each row

array([['Python', 1992, 'Golden💛'],
       ['JS', 1995, 'WebGod🕸️'],
       ['C++', 1986, 'Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻'],
       ['C', 1980, 'Creator⚡']], dtype=object)

In [129]:
# head and tail 
langs.head(-2) # -2 means last 2 rows will be hidden
# same can be done with tail


Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️


In [130]:
langs.tail(-2) # means 2 above rows will be hidden

Unnamed: 0,Language,Year,Feeling
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [131]:
# sample # 2 random rows
langs.sample(2)

Unnamed: 0,Language,Year,Feeling
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️


In [132]:
# count 
langs.count() # total number of non-null values in each column

Language    4
Year        4
Feeling     4
dtype: int64

In [133]:
# Understand `info` -> https://www.youtube.com/live/OG31yhRQxPI?si=UVygY80TQSqfueu7&t=1471
langs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Language  4 non-null      object
 1   Year      4 non-null      int64 
 2   Feeling   4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [134]:
# describe, is used to extract numerical columns only
langs.describe()

Unnamed: 0,Year
count,4.0
mean,1988.25
std,6.652067
min,1980.0
25%,1984.5
50%,1989.0
75%,1992.75
max,1995.0


In [135]:
# isnull, gives a boolean dataframe
print(langs.isnull().sum()) # here sum will add all true values as 1, and give us result of each column, basically sum will sum all columns one by one
langs.isnull()

Language    0
Year        0
Feeling     0
dtype: int64


Unnamed: 0,Language,Year,Feeling
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [136]:
# duplicated
print(langs.duplicated()) # will return true if a row is duplicate, else false
print(langs.duplicated().sum()) # total number of duplicate rows, basically its summation of all columns one by one

0    False
1    False
2    False
3    False
dtype: bool
0


In [137]:
# rename column names

langs.rename(columns={'Language':'Language Name', 'Year':'Year of Release', 'Feeling':'Tag'}, inplace=True)
print(langs.columns)
langs

Index(['Language Name', 'Year of Release', 'Tag'], dtype='object')


Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [138]:
langs.set_index('Language Name') # to make a column as index

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Python,1992,Golden💛
JS,1995,WebGod🕸️
C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
C,1980,Creator⚡


# Mathematical Functions

In [139]:
# sum -> axis argument

new_df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]

}, dtype=np.int64)

print('Axis = 0\n',new_df.sum(axis=0),'\n') # sum of each column, by default, axis=0
print('Axis = 1\n',new_df.sum(axis=1),'\n') # sum of each row
new_df



Axis = 0
 A     6
B    15
C    24
dtype: int64 

Axis = 1
 0    12
1    15
2    18
dtype: int64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [140]:
# mean

print('Axis = 0\n',new_df.mean(axis=0),'\n') # mean of each column
print('Axis = 1\n',new_df.mean(axis=1),'\n') # mean of each row

new_df

Axis = 0
 A    2.0
B    5.0
C    8.0
dtype: float64 

Axis = 1
 0    4.0
1    5.0
2    6.0
dtype: float64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [141]:
# min , max , variance(new_df.var()) , std (new_df.std())

print('Axis = 0\n',new_df.min(axis=0),'\n') # min of each column
print('Axis = 1\n',new_df.max(axis=1),'\n') # max of each row

new_df
# max

Axis = 0
 A    1
B    4
C    7
dtype: int64 

Axis = 1
 0    7
1    8
2    9
dtype: int64 



Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


# Indexing and Slicing in DataFrame

In [142]:
new_df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [143]:
# fetch specific column
new_df[['A', 'C']] 
# but when you do boolean indexing, it will do that on rows instead of columns

Unnamed: 0,A,C
0,1,7
1,2,8
2,3,9


In [144]:
# iloc, to get rows and columns by indexes(these indexes is internally managed by pandas, its not the indexes that we see on left most side of dataframe, or the names of columns that we see on top)
# these indexes are given by pandas automatically to rows and columns

# single row
print(langs.iloc[0],'\n')
print(langs.iloc[0,-1],'\n')
langs

Language Name       Python
Year of Release       1992
Tag                Golden💛
Name: 0, dtype: object 

Golden💛 



Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻
3,C,1980,Creator⚡


In [145]:
# multiple rows
langs.iloc[:2,:2]

Unnamed: 0,Language Name,Year of Release
0,Python,1992
1,JS,1995


In [146]:
# fancy indexing
# pandas fancy indexing work little bit differently from numpy, because in numpy all list of indexes should have same len in fancy indexing(unless the first list not contains single index)
""" a = np.arange(10).reshape(2,5) # reshape the array into 2 rows and 5 columns
a[[0,1],[1,2]] , so all list of indexes should have same len in fancy indexing(unless the first list not contains single index)"""

print(langs.iloc[[0,-1],[0,0,-1]]) 

  Language Name Language Name       Tag
0        Python        Python   Golden💛
3             C             C  Creator⚡


In [147]:
langs.iloc[[True, False, True, False], :] # fancy indexing with boolean indexing using iloc
# but Note: iloc can only do boolean indexing when we input it array or list of boolean values, not boolean series(for that use `loc`)
# Example:

langs.iloc[(langs['Year of Release']> 1990).values ,-1] # can only take boolean array or list
langs.loc[(langs['Year of Release']> 1990) ,'Tag'] # can take series,array or list of boolean values

0     Golden💛
1    WebGod🕸️
Name: Tag, dtype: object

In [148]:
# in loc, we use labels(that we see left most side of dataframe) instead of index for fetch rows
# in loc, we use names of columns(instead of indexes that automatically give by pandas) to fetch columns 

newLangs = langs.set_index('Language Name') # language name as index
newLangs.loc[['Python', 'JS']]

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Python,1992,Golden💛
JS,1995,WebGod🕸️


In [149]:
# slicing using loc, loc is inclusive of both start and end index
newLangs.loc['JS':'C':2]  

Unnamed: 0_level_0,Year of Release,Tag
Language Name,Unnamed: 1_level_1,Unnamed: 2_level_1
JS,1995,WebGod🕸️
C,1980,Creator⚡


In [150]:
# fancy indexing using loc

newLangs.loc[['Python', 'JS'], ['Year of Release']]

# newLangs.reset_index(drop=True) # again we can reset index using `.reset_index(drop=True)`

Unnamed: 0_level_0,Year of Release
Language Name,Unnamed: 1_level_1
Python,1992
JS,1995


In [151]:
# you can do boolean indexing with loc also
print(langs['Year of Release'] > 1990)
langs.loc[langs['Year of Release'] > 1990] # filtering data

0     True
1     True
2    False
3    False
Name: Year of Release, dtype: bool


Unnamed: 0,Language Name,Year of Release,Tag
0,Python,1992,Golden💛
1,JS,1995,WebGod🕸️


In [152]:
# iloc vs loc main difference -> https://www.youtube.com/live/OG31yhRQxPI?si=5B5dVmJprCluLCG0&t=3821

In [153]:
# selection both rows and columns
# we want to select `C` tag
langs.iloc[-1,-1]

'Creator⚡'

In [154]:
# same thing with loc
newLangs.loc['C','Tag']

'Creator⚡'

# Filtering a DataFrame
[See this video clip to practice](https://www.youtube.com/live/OG31yhRQxPI?si=qFR2QCyXq8KBoLU9&t=4325)


In [155]:
pd.Series(langs.iloc[:,-2])

0    1992
1    1995
2    1986
3    1980
Name: Year of Release, dtype: int64

In [156]:
# str is used to conver conver series values to string so we can perform string operations on those values

langs['Language Name'].str.contains('C',case=False)

0    False
1    False
2     True
3     True
Name: Language Name, dtype: bool

In [157]:



langs.iloc[(langs['Year of Release']> 1990).values ,-1]
langs.loc[langs['Year of Release']> 1990 ,'Tag']

0     Golden💛
1    WebGod🕸️
Name: Tag, dtype: object

In [158]:



langs.iloc[(langs['Year of Release']> 1990).values ,-1]
langs.loc[(langs['Year of Release']> 1990) ,'Tag']

0     Golden💛
1    WebGod🕸️
Name: Tag, dtype: object

## Adding a new columns

In [159]:
# completely new columns
langs['Authors'] = ['Guido van Rossum', 'Brendan Eich', 'Bjarne Stroustrup', 'Dennis Ritchie']
langs

Unnamed: 0,Language Name,Year of Release,Tag,Authors
0,Python,1992,Golden💛,Guido van Rossum
1,JS,1995,WebGod🕸️,Brendan Eich
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻,Bjarne Stroustrup
3,C,1980,Creator⚡,Dennis Ritchie


In [160]:
# create columns from  existing ones
langs.dropna(subset=['Authors']) # remove rows where Tag is NaN



Unnamed: 0,Language Name,Year of Release,Tag,Authors
0,Python,1992,Golden💛,Guido van Rossum
1,JS,1995,WebGod🕸️,Brendan Eich
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻,Bjarne Stroustrup
3,C,1980,Creator⚡,Dennis Ritchie


# Important DataFrame Methods:

In [161]:
langs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Language Name    4 non-null      object
 1   Year of Release  4 non-null      int64 
 2   Tag              4 non-null      object
 3   Authors          4 non-null      object
dtypes: int64(1), object(3)
memory usage: 260.0+ bytes


In [162]:
# agg, you can perform aggregation on DataFrame
# pass list
langs.agg(['min','max']) # gives you min and max of each column

# you can also pass dict  
# means min and max of Year of Release and count and min of Tag columns
langs.agg({'Year of Release': ['min','max'], 'Tag': ['count','min']}) # gives you min, max, count, mean of Year of Release and Tag columns

Unnamed: 0,Year of Release,Tag
min,1980.0,Creator⚡
max,1995.0,
count,,4


In [163]:
# astype, not modified original dataframe
langs.astype(str) # convert all columns to string
langs['Year of Release'].astype('Int16') # convert Year of Release to integer and replace old column with new one

# change in original dataframe

langs['Year of Release'] = langs['Year of Release'].astype('Int16')
langs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Language Name    4 non-null      object
 1   Year of Release  4 non-null      Int16 
 2   Tag              4 non-null      object
 3   Authors          4 non-null      object
dtypes: Int16(1), object(3)
memory usage: 240.0+ bytes


# Appending to dataframes

In [164]:
import pandas as pd

# Original DataFrame
df = pd.DataFrame({'Name': ['Alice', 'Bob'], 'Age': [25, 30]})

# Row to append
new_row = {'Name': 'Charlie', 'Age': 22}

# Append using concat
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(df)


      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   22


In [165]:

import pandas as pd

new_df706 = pd.DataFrame({
    'Language Name': ['Go'],
    'Year of Release': [2009],
    'Feeling': ['New']
})


new_df706

Unnamed: 0,Language Name,Year of Release,Feeling
0,Go,2009,New


In [166]:
pd.concat([langs, new_df706]) # concatenate two dataframes by concatenating their rows


Unnamed: 0,Language Name,Year of Release,Tag,Authors,Feeling
0,Python,1992,Golden💛,Guido van Rossum,
1,JS,1995,WebGod🕸️,Brendan Eich,
2,C++,1986,Pointers👈🏻👉🏻👈🏻🫵🏻👆🏻👇🏻,Bjarne Stroustrup,
3,C,1980,Creator⚡,Dennis Ritchie,
0,Go,2009,,,New
