## Machine Learning - Session 4 -  Data Exploration and Data Manipulation

## Part 1 - Data Exploration

In [2]:
# Set the working directory
import os
os.chdir(r"C:\Users\ASUS\Desktop\Python")

#Read the cars.csv data
import numpy as np
import pandas as pd
cars = pd.read_csv("cars.csv")

#### 1. Summaries and Aggregates

In [3]:
#A. The describe() method
cars.describe()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model
count,406.0,406.0,406.0,406.0,406.0,406.0,406.0
mean,23.051232,5.475369,194.779557,103.529557,2979.413793,15.519704,75.921182
std,8.401777,1.71216,104.922458,40.520659,847.004328,2.803359,3.748737
min,0.0,3.0,68.0,0.0,1613.0,8.0,70.0
25%,17.0,4.0,105.0,75.0,2226.5,13.7,73.0
50%,22.35,4.0,151.0,93.5,2822.5,15.5,76.0
75%,29.0,8.0,302.0,129.0,3618.25,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [4]:
cars[["MPG","Weight"]].describe()

Unnamed: 0,MPG,Weight
count,406.0,406.0
mean,23.051232,2979.413793
std,8.401777,847.004328
min,0.0,1613.0
25%,17.0,2226.5
50%,22.35,2822.5
75%,29.0,3618.25
max,46.6,5140.0


In [5]:
#A. Aggregates - groupby() method
cars["Weight"].mean()

2979.4137931034484

In [6]:
cars[["Weight","Origin"]].groupby(["Origin"]).mean()

Unnamed: 0_level_0,Weight
Origin,Unnamed: 1_level_1
Europe,2431.493151
Japan,2221.227848
US,3372.700787


In [7]:
cars[["Weight","Cylinders"]].groupby(["Cylinders"]).median()

Unnamed: 0_level_0,Weight
Cylinders,Unnamed: 1_level_1
3,2375.0
4,2234.0
5,2950.0
6,3201.5
8,4137.5


In [8]:
cars[["Weight","Origin"]].groupby(["Origin"]).describe()

Unnamed: 0_level_0,Weight,Weight,Weight,Weight,Weight,Weight,Weight,Weight
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Europe,73.0,2431.493151,490.883617,1825.0,2065.0,2246.0,2800.0,3820.0
Japan,79.0,2221.227848,320.497248,1613.0,1985.0,2155.0,2412.5,2930.0
US,254.0,3372.700787,791.695866,1800.0,2721.25,3380.5,4054.75,5140.0


#### 2. Tables (Crosstabs)

In [9]:
#A. Univariate Frequency distribution of a categorical variable (say, Origin)
tab = pd.crosstab(cars.Origin, columns="counts")
tab

col_0,counts
Origin,Unnamed: 1_level_1
Europe,73
Japan,79
US,254


In [10]:
list(tab)

['counts']

In [11]:
type(tab)

pandas.core.frame.DataFrame

In [12]:
tab.sum()

col_0
counts    406
dtype: int64

In [13]:
#B. Univariate table of proportion
tab/tab.sum()*100

col_0,counts
Origin,Unnamed: 1_level_1
Europe,17.980296
Japan,19.458128
US,62.561576


In [14]:
#B. Univariate table of proportion
round(tab/tab.sum()*100,2)

TypeError: a float is required

In [None]:
#C. Bi-variate Frequency Distribution Table
pd.crosstab(cars.Origin,columns=cars.Cylinders)

In [None]:
#Re-Naming the Rows and Columns
table = pd.crosstab(cars.Origin, columns=cars.Cylinders)
table.columns = ["Cyl3","Cyl4","Cyl5","Cyl6","Cyl8"]
table.index = ["EU","JP","US"]
table

In [None]:
table = pd.crosstab(cars.Origin, columns=cars.Cylinders)
table.columns
table.index

In [None]:
#Finding the col-sums
coltotal = table.sum(axis=0)
coltotal

In [None]:
#Finding the row sums
rowtotal = table.sum(axis=1)
rowtotal

In [None]:
#D. Table of joint proportion
round(table/sum(rowtotal),2)*100

In [None]:
#OR
table/sum(coltotal)

In [None]:
#E. Table of conditional proportion (Conditioned on rows) [P(Cyl|Origin)]
#--> Out of all the cars originated in [o], [q]% of the cars have [x] cylinders.
table.div(rowtotal, axis=0) #(Why?)

In [None]:
#F. Table of conditional proportion (conditioned on columns) [P(Origin|Cyl)]
#--> Out of all the cars having [x] cylinders, [p]% of them originate in [Origin]
table.div(coltotal, axis=1)

#### 3. Graphs

In [None]:
import matplotlib.pyplot as plt

#### 3A. Histograms

In [None]:
#histogram of MPG
cars.hist(column="MPG")
plt.show()

In [None]:
#Additional arguments: 
cars.hist(column="MPG", 
          grid=False,         #Do not include the grid
          figsize = (6,4),    #(length ,  height) of the plot in inches
         bins=10)    

plt.show()

*Documentation: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.hist.html*

In [None]:
#Labelling the axes and giving the title
cars.hist(column="MPG", 
          grid=False,         #Do not include the grid
          figsize = (6,4))    #(length ,  height) of the plot in inches

plt.xlabel("Miles per Gallon", fontsize=12)
plt.ylabel("Number of Cars", fontsize=12)
plt.title("Histogram of MPG", fontsize=16)


plt.show()

In [None]:
#Editing color and edgecolor
cars.hist(column="MPG", 
          grid=False,         
          figsize = (6,4),
          color = "lightgreen",
          edgecolor = "white")  

plt.xlabel("Miles per Gallon", fontsize=12)
plt.ylabel("Number of Cars", fontsize=12)
plt.title("Histogram of MPG", fontsize=16)


plt.show()

In [None]:
#Breaking up by the categories of a categorical variable
cars.hist(column="MPG", 
          by = "Origin",           #Note this argument
          grid=False,         
          figsize = (8,6),
          color = "lightgreen",
          edgecolor = "white")  

plt.xlabel("Miles per Gallon", fontsize=12)
plt.ylabel("Number of Cars", fontsize=12)
plt.title("Histogram of MPG", fontsize=16)


plt.show()

#### 3B. Boxplots

In [None]:
#Plotting a boxplot for the variable MPG
cars.boxplot(column="MPG")
plt.show()

In [None]:
#Additional arguments (horizontal boxplot)
cars.boxplot(column="MPG", 
             grid=False,         
             figsize = (6,4),
             vert = False)    #To plot a horizontal boxplot

plt.show()

*Documentation: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.boxplot.html*

In [None]:
#Labelling the boxplot
cars.boxplot(column="MPG", notch=True, #To get the notch
             grid=False,         
             figsize = (8,6),
             vert = False)    #To plot a horizontal boxplot

plt.xlabel("Miles per Gallon", fontsize=12)
plt.title("Boxplot of MPG", fontsize=14)

plt.show()

In [None]:
#Side by Side boxplot
cars.boxplot(column="Weight", notch = True,
             by="Origin",
             grid=False,         
             figsize = (6,4))

plt.ylabel("Weight of Car", fontsize=12)
plt.title("Boxplot of Weight", fontsize=14)

plt.show()

In [None]:
#The rotation argument
#Side by Side boxplot
cars.boxplot(column="Weight", 
             by="Origin",
             grid=False,         
             figsize = (6,4),
             rot=90)            #The angle by which you want to rotate the labels

plt.xlabel("Miles per Gallon", fontsize=12)
plt.title("Boxplot of MPG", fontsize=14)

plt.show()


#### 3C. Bar Plots

In [15]:
cars['Origin'].value_counts()

US        254
Japan      79
Europe     73
Name: Origin, dtype: int64

In [None]:
type(cars['Origin'].value_counts())

In [None]:
cars['Origin'].value_counts().plot(kind='bar', figsize=(4,4),
                                        color="coral", fontsize=13)

plt.show()

In [None]:
cars.Origin.value_counts().plot(kind='barh', figsize=(4,2),
                                        color="coral", fontsize=13)

plt.show()

Study pandas.DataFrame.plot: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html 

#### 3D. Scatter Plots

In [None]:
#Scatter plot 
cars.plot(kind="scatter",           #The type of plot to be plotted
          x="Weight",y="MPG",       #The variables in x and y axis
          color="black",            #Colour of the dots
          figsize=(5,5))            #Figure size


plt.xlabel("Weight of cars", fontsize=12)
plt.ylabel("MPG of cars", fontsize=12)
plt.title("Scatter Plot", fontsize=14)

plt.show()

*Documentation: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html*

In [None]:
#Pairwise Scatterplot
from pandas.plotting import scatter_matrix
scatter_matrix(cars.drop(["Car","Model","Origin"],axis=1), alpha=0.5, figsize=(14, 10), 
               diagonal='hist')
plt.show()


In [None]:
#Exporting Graph

from pandas.plotting import scatter_matrix
scatter_matrix(cars.drop(["Car","Origin"],axis=1), alpha=0.75, figsize=(14, 10), diagonal='kde')
plt.savefig('scatter.png')

## Part 2 - Data Manipulation

#### 1. Sorting

In [None]:
car

In [None]:
#Sort the data in ascending order of MPG
cars.sort_values("MPG").head()

In [None]:
#Sort the data in descending order of MPG
cars.sort_values("MPG", ascending=0).head()

In [None]:
#Sort the data in descending order of Origin and then by increasing order of MPG
cars.sort_values(["Origin","MPG"], ascending=[0,1])

In [None]:
#Saving in another data frame
cars2 = cars.sort_values(["Origin","MPG"], ascending=[0,1])
cars2

In [None]:
#Exporting Data


#### 2. Variable Transformation

In [None]:
# Plot a scatter plot between MPG and weight. Comment of the linearity.
plt.scatter(cars.Weight, y=cars.MPG, color="black")
plt.xlabel("Weight")
plt.ylabel("MPG")
plt.show()

In [None]:
# Re-plot the scatter plot by taking the log transformation of both the variables. Does the linearity 
# Improve?
import numpy as np

plt.scatter(np.log(cars.Weight), y=np.log(cars.MPG), color="black")
plt.xlabel("log_Weight")
plt.ylabel("log_MPG")
plt.show()

In [None]:
np.log(cars.MPG).describe()

In [None]:
# But there is a problem! Check the summary of log(MPG). What do you observe?
cars.MPG.describe()

In [None]:
np.log(cars.MPG).describe()

In [None]:
# Can you identify where we got misleaded? Probably the answer lies in the summary of MPG.
# How can we correct this?
np.log(cars.MPG+1).describe()

In [None]:
# Study the association between MPG and Horsepower. (Do Yourself)



#### The np.where() function


In [None]:
import numpy as np

In [None]:
# PROBLEM 1:
# Create a variable HP which will take only two values:
# IF Horsepower < 100 THEN "Low HP"
# IF Horsepower >= 100 THEN "High HP"

#np.where(condition, value if true, value if fase)

cars["HP"] = np.where(cars.Horsepower >= 100, "High", "Low")
cars.tail(10)

In [None]:
#np.where(condition1, value if true, condition 2, value if true, condition 3, value if true, value if flase)

In [None]:
#PROBLEM 2:
# Create a Variable MPG_Rate which will take on the values as follows:
# IF MPG < 15 THEN "Normal"
# IF MPG >= 15 AND MPG <=25 THEN "GOOD"
# IF MPG >= 25 AND MPG <=35 THEN "GREAT"
# IF MPG >= 35 THEN "AWESOME"

cars["MPG_Rate"] = np.where(cars.MPG<15,"Normal",cars.MPG>=15,"Good")
cars.tail(10)
