## Data Manipulation and Analysis with Pandas

Data Manipulation and Analysis are the important tasks in Data Science and Data Analytics projects. As it provide provide range of the function for data manipulation which is necessary to clean, structure, organize and transform the raw data so as to extract the insights which will be later used for Machine Learning models trainings.

Let us cover the data manipulation and analysis techniques using Pandas

In [2]:
import pandas as pd

In [3]:
#Let us load the csv data first
df = pd.read_csv('data.csv')

#let fetch the first 5 rows 
df.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Nissan,Black,10148.0,4.0,Rs1366850
1,Toyota,White,10217.0,4.0,Rs1144150
2,Toyota,White,10247.0,4.0,Rs1628300
3,Honda,White,10953.0,4.0,Rs831800
4,Nissan,Blue,10954.0,4.0,Rs1521950


In [4]:
#lets fetch the last five rows
df.tail(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
995,Toyota,White,,4.0,Rs1706650
996,Toyota,White,,4.0,Rs1767900
997,BMW,White,,5.0,Rs1906400
998,Toyota,White,,4.0,
999,Toyota,White,,4.0,


In [5]:
#This will tell the datatype of each column
df.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price             object
dtype: object

In [6]:
#To describe the whole datasets
df.describe()

Unnamed: 0,Odometer (KM),Doors
count,950.0,950.0
mean,131253.237895,4.011579
std,69094.857187,0.382539
min,10148.0,3.0
25%,70391.25,4.0
50%,131821.0,4.0
75%,192668.5,4.0
max,249860.0,5.0


In [7]:
#Handling missing values
df.isnull() #It will check whether for each (row, col) value is missing or not and corresponding return bool values true/false

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
995,False,False,True,False,False
996,False,False,True,False,False
997,False,False,True,False,False
998,False,False,True,False,True


In [8]:
#now if we want to check whether there is any missing values in each column of dataset we use: .any() it will give a Series data structure providing bool values true if present false if not
df.isnull().any()

Make             True
Colour           True
Odometer (KM)    True
Doors            True
Price            True
dtype: bool

In [None]:
#now we want to check the missing values corresponding to each record(row) use: .any(axis=1)
df.isnull().any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996     True
997     True
998     True
999     True
Length: 1000, dtype: bool

In [10]:
#if we want to find the sum of all the missing values corresponding to each column
df.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
#if we want to find the sum of all the missing values corresponding to each record(row)
df.isnull().sum(axis=1)

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    2
999    2
Length: 1000, dtype: int64

In [12]:
df.tail(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
980,Honda,White,,4.0,Rs1015300
981,Toyota,White,,4.0,Rs1026050
982,Nissan,Red,,4.0,Rs1083950
983,Honda,Blue,,,Rs1102100
984,Honda,White,,4.0,Rs1137300
985,Honda,Blue,,4.0,Rs1192000
986,Toyota,White,,4.0,Rs1192200
987,Nissan,White,,3.0,Rs1193400
988,Honda,White,,4.0,Rs1340650
989,Honda,Green,,4.0,Rs1429950


In [13]:
#now we want to replace the missing values with 0 within the whole dataset
df_filled = df.fillna(0) #we have created the copy of the dataset with handling the missing values
df_filled.tail(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
980,Honda,White,0.0,4.0,Rs1015300
981,Toyota,White,0.0,4.0,Rs1026050
982,Nissan,Red,0.0,4.0,Rs1083950
983,Honda,Blue,0.0,0.0,Rs1102100
984,Honda,White,0.0,4.0,Rs1137300
985,Honda,Blue,0.0,4.0,Rs1192000
986,Toyota,White,0.0,4.0,Rs1192200
987,Nissan,White,0.0,3.0,Rs1193400
988,Honda,White,0.0,4.0,Rs1340650
989,Honda,Green,0.0,4.0,Rs1429950


In [14]:
#replacing the missing values with the mean of each column
df['Price_fillna'] = df['Doors'].fillna(df['Doors'].mean())
df.tail(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price,Price_fillna
980,Honda,White,,4.0,Rs1015300,4.0
981,Toyota,White,,4.0,Rs1026050,4.0
982,Nissan,Red,,4.0,Rs1083950,4.0
983,Honda,Blue,,,Rs1102100,4.011579
984,Honda,White,,4.0,Rs1137300,4.0
985,Honda,Blue,,4.0,Rs1192000,4.0
986,Toyota,White,,4.0,Rs1192200,4.0
987,Nissan,White,,3.0,Rs1193400,3.0
988,Honda,White,,4.0,Rs1340650,4.0
989,Honda,Green,,4.0,Rs1429950,4.0


In [15]:
#If we want to change the name of any column
df = df.rename(columns={'Make': 'Car Company'}) #rename should be done in the form of dictionary 
df.head()

Unnamed: 0,Car Company,Colour,Odometer (KM),Doors,Price,Price_fillna
0,Nissan,Black,10148.0,4.0,Rs1366850,4.0
1,Toyota,White,10217.0,4.0,Rs1144150,4.0
2,Toyota,White,10247.0,4.0,Rs1628300,4.0
3,Honda,White,10953.0,4.0,Rs831800,4.0
4,Nissan,Blue,10954.0,4.0,Rs1521950,4.0


In [16]:
df.dtypes

Car Company       object
Colour            object
Odometer (KM)    float64
Doors            float64
Price             object
Price_fillna     float64
dtype: object

In [17]:
#If we want to change the datatype of any column
df['New Doors'] = df['Doors'].fillna(df['Doors'].mean()).astype(int)
print(df.head())
print(df.dtypes)

  Car Company Colour  Odometer (KM)  Doors      Price  Price_fillna  New Doors
0      Nissan  Black        10148.0    4.0  Rs1366850           4.0          4
1      Toyota  White        10217.0    4.0  Rs1144150           4.0          4
2      Toyota  White        10247.0    4.0  Rs1628300           4.0          4
3       Honda  White        10953.0    4.0   Rs831800           4.0          4
4      Nissan   Blue        10954.0    4.0  Rs1521950           4.0          4
Car Company       object
Colour            object
Odometer (KM)    float64
Doors            float64
Price             object
Price_fillna     float64
New Doors          int64
dtype: object


In [18]:
#To apply any function to column
df['New Odometer'] = df['Odometer (KM)'].apply(lambda x:x*2) 
df

Unnamed: 0,Car Company,Colour,Odometer (KM),Doors,Price,Price_fillna,New Doors,New Odometer
0,Nissan,Black,10148.0,4.0,Rs1366850,4.0,4,20296.0
1,Toyota,White,10217.0,4.0,Rs1144150,4.0,4,20434.0
2,Toyota,White,10247.0,4.0,Rs1628300,4.0,4,20494.0
3,Honda,White,10953.0,4.0,Rs831800,4.0,4,21906.0
4,Nissan,Blue,10954.0,4.0,Rs1521950,4.0,4,21908.0
...,...,...,...,...,...,...,...,...
995,Toyota,White,,4.0,Rs1706650,4.0,4,
996,Toyota,White,,4.0,Rs1767900,4.0,4,
997,BMW,White,,5.0,Rs1906400,5.0,5,
998,Toyota,White,,4.0,,4.0,4,


In [19]:
#Data Aggregation and Grouping

#Data Grouping: When we want to perform some operations on the basis of categories(columns), like to find average salary per city, so average salary is operation and city(column is category) groupby

#find the average odometer color wise
grouped_odometer = df.groupby('Colour')['Odometer (KM)'].mean()
print(grouped_odometer)

Colour
Black    132735.744681
Blue     135673.448276
Green    139443.492958
Red      132375.304878
White    126049.368852
Name: Odometer (KM), dtype: float64


In [None]:
#find the sum of odometer based on color and region wise
grouped_odometer = df.groupby(['Car Company', 'Colour'])['Odometer (KM)'].sum()
print(grouped_odometer)

124690576.0


In [21]:
#performing multiple aggregate functions
grouped_odometer = df.groupby(['Car Company', 'Colour'])['Odometer (KM)'].agg(['mean', 'sum', 'count'])
print(grouped_odometer)

                             mean         sum  count
Car Company Colour                                  
BMW         Black   147689.625000   1181517.0      8
            Blue    120226.296296   3246110.0     27
            Green   130104.166667    780625.0      6
            Red     117252.833333   1407034.0     12
            White   129903.264706   4416711.0     34
Honda       Black   140506.241379   4074681.0     29
            Blue    134705.885057  11719412.0     87
            Green   123289.588235   2095923.0     17
            Red     119797.750000   2875146.0     24
            White   117174.039604  11834578.0    101
Nissan      Black   152357.642857   2133007.0     14
            Blue    134397.862745   6854291.0     51
            Green   148961.250000   2383380.0     16
            Red     146948.076923   1910325.0     13
            White   130853.625000   9421461.0     72
Toyota      Black   118822.307692   4634070.0     39
            Blue    145176.981818  15969468.0 

In [22]:
#Merging and joining DataFrames
df1 = pd.DataFrame({'Key':['A','B','C'], 'Value1':[1, 2, 3]})
df2 = pd.DataFrame({'Key':['A','B','D'], 'Value2':[4, 5, 6]})

In [23]:
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [24]:
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [25]:
#Merging: It is same like joins in SQL to merge to DataFrame on the basis of inner, outer, left and, right joins 
pd.merge(df1, df2, on="Key", how="inner") #if basically merge the two dataframe on the basis of common key values

Unnamed: 0,Key,Value1,Value2
0,A,1,4
1,B,2,5


In [26]:
pd.merge(df1, df2, on="Key", how="outer") #if basically merge the two dataframe on the basis of all the key values of both the dataframe and which values are not present write na

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [27]:
pd.merge(df1, df2, on="Key", how="left") #if basically merge the two dataframe on the basis of all the key values of left dataframe give priority to left df takes it all key values as it is and right df values which present use and not present write na

Unnamed: 0,Key,Value1,Value2
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [28]:
pd.merge(df1, df2, on="Key", how="right") #if basically merge the two dataframe on the basis of all the key values of right dataframe give priority to right df takes it all key values as it is and left df values which present use and not present write na

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4
1,B,2.0,5
2,D,,6
