In [1]:
!pip install plotly
!pip install squarify



In [2]:
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import seaborn as sns

import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go

import squarify # for tree maps
%matplotlib notebook

init_notebook_mode(connected=True)

In [3]:
hr_data = pd.read_csv("ToyotaCorolla.csv")

In [4]:
hr_data.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23,46986,Diesel,90,1,0,2000,3,1165
1,13750,23,72937,Diesel,90,1,0,2000,3,1165
2,13950,24,41711,Diesel,90,1,0,2000,3,1165
3,14950,26,48000,Diesel,90,0,0,2000,3,1165
4,13750,30,38500,Diesel,90,0,0,2000,3,1170


In [5]:
hr_data.shape

(1436, 10)

In [6]:
hr_data.columns

Index(['Price', 'Age', 'KM', 'FuelType', 'HP', 'MetColor', 'Automatic', 'CC',
       'Doors', 'Weight'],
      dtype='object')

In [7]:
hr_data.dtypes

Price         int64
Age           int64
KM            int64
FuelType     object
HP            int64
MetColor      int64
Automatic     int64
CC            int64
Doors         int64
Weight        int64
dtype: object

In [8]:
hr_data.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

In [9]:
hr_data['FuelType'].nunique()

3

In [10]:
hr_data['FuelType'].unique()

array(['Diesel', 'Petrol', 'CNG'], dtype=object)

In [11]:
hr_data['MetColor'].unique()

array([1, 0], dtype=int64)

In [12]:
for i in hr_data.columns:
    print ("Number of unique values in {} column are {} \n The unique values are {}".format(i, len(hr_data[i].unique()),hr_data[i].unique()))
    print ("---------------------- \n")

Number of unique values in Price column are 236 
 The unique values are [13500 13750 13950 14950 12950 16900 18600 21500 20950 19950 19600 22500
 22000 22750 17950 16750 16950 15950 16250 17495 15750 15500 14750 19000
 15800 21950 20500 13250 15250 18950 15999 16500 18750 22250 12995 18450
 16895 14900 17250 15450 16650 17450 16450 18900 18990 18500 19450 18800
 32500 31000 31275 24950 22950 24990 17900 19250 16350 21750 15850 23000
 19900 23950 24500 17200 19500 16868 19750 20750 17650 17795 18245 23750
 18700 21125  6950  9500 11950  7750  4350  4750 11750 11900  9950 11495
 11250 10500 10450 11500 12500 10950 11450 11790 12450 11690 12750 11925
 12900 11650 10850  9940 13450 12495 12000 11480 14990 12850 11700 11895
 13875 12295 13995  9900 11990 10750 11695 11000 12400 12200 12695 14350
 10250  6500  6400  7000  8900  8500  8950  9250  9450  8250  4450  9000
  5150  7900 10900  9750 11290 10895 10995  9850  8695 10990  8750  9930
  9799  9700  9990  9475 10000 10495  9400  9650  95

In [13]:
for i in hr_data.columns:
    print(i)
    print(hr_data[i].nunique())
    print(hr_data[i].unique())

Price
236
[13500 13750 13950 14950 12950 16900 18600 21500 20950 19950 19600 22500
 22000 22750 17950 16750 16950 15950 16250 17495 15750 15500 14750 19000
 15800 21950 20500 13250 15250 18950 15999 16500 18750 22250 12995 18450
 16895 14900 17250 15450 16650 17450 16450 18900 18990 18500 19450 18800
 32500 31000 31275 24950 22950 24990 17900 19250 16350 21750 15850 23000
 19900 23950 24500 17200 19500 16868 19750 20750 17650 17795 18245 23750
 18700 21125  6950  9500 11950  7750  4350  4750 11750 11900  9950 11495
 11250 10500 10450 11500 12500 10950 11450 11790 12450 11690 12750 11925
 12900 11650 10850  9940 13450 12495 12000 11480 14990 12850 11700 11895
 13875 12295 13995  9900 11990 10750 11695 11000 12400 12200 12695 14350
 10250  6500  6400  7000  8900  8500  8950  9250  9450  8250  4450  9000
  5150  7900 10900  9750 11290 10895 10995  9850  8695 10990  8750  9930
  9799  9700  9990  9475 10000 10495  9400  9650  9550 13000 11710  9980
 12250 11930 10800 10600  7500  5950  690

In [14]:
#Columns with 15 or less levels are considered as categorical columns for the purpose of this analysis
cat_cols=[]
for i in hr_data.columns:
    if hr_data[i].dtype =='object' or len(np.unique(hr_data[i]))<=15 : # if the number of levels is less that 15 considering the column as categorial
        cat_cols.append(i)
        print("{} : {} : {} ".format(i,len(np.unique(hr_data[i])),np.unique(hr_data[i])))

FuelType : 3 : ['CNG' 'Diesel' 'Petrol'] 
HP : 12 : [ 69  71  72  73  86  90  97  98 107 110 116 192] 
MetColor : 2 : [0 1] 
Automatic : 2 : [0 1] 
CC : 12 : [1300 1332 1398 1400 1587 1598 1600 1800 1900 1975 1995 2000] 
Doors : 4 : [2 3 4 5] 


In [15]:
cat_cols

['FuelType', 'HP', 'MetColor', 'Automatic', 'CC', 'Doors']

In [16]:
hr_data[cat_cols].dtypes

FuelType     object
HP            int64
MetColor      int64
Automatic     int64
CC            int64
Doors         int64
dtype: object

In [17]:
for i in cat_cols:
    hr_data[i] = hr_data[i].astype('category')

In [18]:
hr_data[cat_cols].dtypes

FuelType     category
HP           category
MetColor     category
Automatic    category
CC           category
Doors        category
dtype: object

In [19]:
num_cols = [i for i in hr_data.columns if i not in cat_cols]

In [20]:
num_cols

['Price', 'Age', 'KM', 'Weight']

In [21]:
hr_data.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23,46986,Diesel,90,1,0,2000,3,1165
1,13750,23,72937,Diesel,90,1,0,2000,3,1165
2,13950,24,41711,Diesel,90,1,0,2000,3,1165
3,14950,26,48000,Diesel,90,0,0,2000,3,1165
4,13750,30,38500,Diesel,90,0,0,2000,3,1170


In [22]:
#Exploratory Data Analysis

In [23]:
hr_data.shape

(1436, 10)

In [24]:
#(rows,column)

In [25]:
hr_data.MetColor

0       1
1       1
2       1
3       0
4       0
       ..
1431    1
1432    0
1433    0
1434    1
1435    0
Name: MetColor, Length: 1436, dtype: category
Categories (2, int64): [0, 1]

In [26]:
temp = hr_data.MetColor.value_counts()

In [27]:
temp.index

CategoricalIndex([1, 0], categories=[0, 1], ordered=False, dtype='category')

In [28]:
temp.values

array([969, 467], dtype=int64)

In [29]:
#Metcolor_dict = {0:'No',1:'Yes'}
#Metcolor_dict.items()
#l1 = list(Metcolor_dict.items())
#l1

In [30]:
#Automatic_dict = {0:'No',1:'Yes'}
#Automatic_dict.items()
#l2 = list(Automatic_dict.items())
#l2

In [31]:
#data = data.replace({"MetColor":l1, 
 #                  "Automatic": l2})

In [32]:
hr_data['FuelType'].value_counts()

Petrol    1264
Diesel     155
CNG         17
Name: FuelType, dtype: int64

In [33]:
trace = go.Bar(x=temp.index,
               y= temp.values,
               text = temp.values,
               textposition = 'inside',
               name = 'Automatic')
data = [trace]
layout = go.Layout(
    autosize=False,
    width=600,
    height=400,title = "Automatic "
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [34]:
temp.values.sum()

1436

In [35]:
trace = go.Bar(x=temp.index,
               y= np.round(temp.values/temp.values.sum(),2),
               text = np.round(temp.values/temp.values.sum(),2),
               textposition = 'inside',
               name = 'MetColor')
data = [trace]
layout = go.Layout(
    autosize=False,
    width=400,
    height=400,title = "MetColor"
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [36]:
def generate_layout_bar(col_name):
    layout_bar = go.Layout(
        autosize=False, # auto size the graph? use False if you are specifying the height and width
        width=800, # height of the figure in pixels
        height=600, # height of the figure in pixels
        title = "Distribution of {} column".format(col_name), # title of the figure
        # more granular control on the title font 
        titlefont=dict( 
            family='Courier New, monospace', # font family
            size=14, # size of the font
            color='black' # color of the font
        ),
        # granular control on the axes objects 
        xaxis=dict( 
        tickfont=dict(
            family='Courier New, monospace', # font family
            size=14, # size of ticks displayed on the x axis
            color='black'  # color of the font
            )
        ),
        yaxis=dict(
            title='Percentage',
            titlefont=dict(
                size=14,
                color='black'
            ),
        tickfont=dict(
            family='Courier New, monospace', # font family
            size=14, # size of ticks displayed on the y axis
            color='black' # color of the font
            )
        ),
        font = dict(
            family='Courier New, monospace', # font family
            color = "white",# color of the font
            size = 12 # size of the font displayed on the bar
                )  
        )
    return layout_bar

In [37]:
def plot_bar(col_name):
    # create a table with value counts
    temp = hr_data[col_name].value_counts()
    # creating a Bar chart object of plotly
    data = [go.Bar(
            x=temp.index, # x axis values
            y=np.round(temp.values/temp.values.sum(),4)*100, # y axis values
            text = ['{}%'.format(i) for i in np.round(temp.values/temp.values.sum(),4)*100],
        # text to be displayed on the bar, we are doing this to display the '%' symbol along with the number on the bar
            textposition = 'auto', # specify at which position on the bar the text should appear
        marker = dict(color = '#0047AB'),)] # change color of the bar
    # color used here Cobalt Blue
     
    layout_bar = generate_layout_bar(col_name=col_name)

    fig = go.Figure(data=data, layout=layout_bar)
    return iplot(fig)
    

In [38]:
plot_bar('Weight')

In [39]:
plot_bar('MetColor')

In [40]:
plot_bar('HP')

In [41]:
plot_bar('Age')

In [42]:
plot_bar('Doors')

In [43]:
plot_bar('CC')

In [44]:
data = [go.Histogram(x=hr_data.Age,
       marker=dict(
        color='#CC0E1D',# Lava (#CC0E1D)
#         color = 'rgb(200,0,0)'   `
    ))]
layout = go.Layout(title = "Histogram of Age")
fig = go.Figure(data= data, layout=layout)
iplot(fig)

In [45]:
data = [go.Histogram(x=hr_data.Price,
       marker=dict(
        color='#CC0E1D',# Lava (#CC0E1D)
#         color = 'rgb(200,0,0)'   
    ))]
layout = go.Layout(title = "Histogram of Price")
fig = go.Figure(data= data, layout=layout)
iplot(fig)

In [46]:
data = [go.Histogram(x=hr_data.Weight,
       marker=dict(
        color='#CC0E1D',# Lava (#CC0E1D)
#         color = 'rgb(200,0,0)'   
    ))]
layout = go.Layout(title = "Histogram of Weight")
fig = go.Figure(data= data, layout=layout)
iplot(fig)

In [47]:
def generate_layout_bar(col_name):
    layout_bar = go.Layout(
        autosize=False, # auto size the graph? use False if you are specifying the height and width
        width=800, # height of the figure in pixels
        height=600, # height of the figure in pixels
        title = "Distribution of {} column".format(col_name), # title of the figure
        # more granular control on the title font 
        titlefont=dict( 
            family='Courier New, monospace', # font family
            size=14, # size of the font
            color='black' # color of the font
        ),
        # granular control on the axes objects 
        xaxis=dict( 
        tickfont=dict(
            family='Courier New, monospace', # font family
            size=14, # size of ticks displayed on the x axis
            color='black'  # color of the font
            )
        ),
        yaxis=dict(
            title='Percentage',
            titlefont=dict(
                size=14,
                color='black'
            ),
        tickfont=dict(
            family='Courier New, monospace', # font family
            size=14, # size of ticks displayed on the y axis
            color='black' # color of the font
            )
        ),
        font = dict(
            family='Courier New, monospace', # font family
            color = "white",# color of the font
            size = 12 # size of the font displayed on the bar
                )  
        )
    return layout_bar

In [48]:
hr_data.Price[hr_data.Automatic== 1].describe()

count       80.000000
mean     11224.625000
std       3636.914708
min       6500.000000
25%       8950.000000
50%       9950.000000
75%      11950.000000
max      20950.000000
Name: Price, dtype: float64

In [49]:
hr_data.Price[hr_data.Automatic== 0].describe()

count     1356.000000
mean     10701.691740
std       3625.620064
min       4350.000000
25%       8437.500000
50%       9900.000000
75%      11950.000000
max      32500.000000
Name: Price, dtype: float64

In [50]:
trace1 = go.Box(y = hr_data.Price[hr_data.Automatic== 1],name = "Yes",
#                 boxpoints = 'all',jitter = 1
               )
# boxpoints is used to specify the points to plot
# jitter is used to specify how far from each should the points be
trace2 = go.Box(y = hr_data.Price[hr_data.Automatic== 0],name= "No",
#                boxpoints = 'all',jitter = 1
               )
data = [trace1,trace2]
layout = go.Layout(width = 800,
                   height = 500,title = 'Price & Automation')
fig = go.Figure(data=data,layout = layout)
iplot(fig)

In [51]:
yearscurrman_jobsat = hr_data.groupby(['Weight','KM']).size().to_frame()
yearscurrman_jobsat = yearscurrman_jobsat.reset_index()
yearscurrman_jobsat.columns = ['Weight','KM','FuelType']

In [52]:
np.random.seed(0)
yearscurrman_jobsat.sample(frac =0.1)

Unnamed: 0,Weight,KM,FuelType
317,1035,95000,1
587,1055,66880,1
1156,1110,13747,1
610,1060,22648,1
1239,1119,98823,1
...,...,...,...
19,1010,104000,1
1056,1085,100458,1
411,1050,36923,1
1263,1130,5459,1


In [53]:
num_cols

['Price', 'Age', 'KM', 'Weight']

In [54]:
hr_data[['Price','Weight']].corr()

Unnamed: 0,Price,Weight
Price,1.0,0.581198
Weight,0.581198,1.0


In [55]:
trace = go.Scatter(x=hr_data.Weight ,
                  y= hr_data.Price,
                  name = 'Weight & Price',
                  mode= 'markers')
data = [trace]
layout = go.Layout(title = ' Weight & Price',
                  xaxis = dict(title = 'Weight'),
                  yaxis = dict(title = 'Price'))
fig = go.Figure(data=data,layout=layout)
iplot(fig)

In [56]:
trace = go.Scatter(x=hr_data.Age ,
                  y= hr_data.Price,
                  name = 'Age & Price',
                  mode= 'markers')
data = [trace]
layout = go.Layout(title = ' Weight & Price',
                  xaxis = dict(title = 'Age'),
                  yaxis = dict(title = 'Price'))
fig = go.Figure(data=data,layout=layout)
iplot(fig)

In [57]:
trace = go.Scatter(x=hr_data.FuelType ,
                  y= hr_data.Price,
                  name = 'Fuel Type & Price',
                  mode= 'markers')
data = [trace]
layout = go.Layout(title = ' Fuel Type & Price',
                  xaxis = dict(title = 'FuelType'),
                  yaxis = dict(title = 'Price'))
fig = go.Figure(data=data,layout=layout)
iplot(fig)

In [58]:
hr_data.KM.value_counts()

36000     9
1         8
43000     7
59000     7
75000     6
         ..
243000    1
33113     1
194765    1
48738     1
26624     1
Name: KM, Length: 1263, dtype: int64

In [59]:
data = []
for i in hr_data.KM.unique():
    data.append(i)

In [60]:
data

[46986,
 72937,
 41711,
 48000,
 38500,
 61000,
 94612,
 75889,
 19700,
 71138,
 31461,
 43610,
 32189,
 23000,
 34131,
 18739,
 34000,
 21716,
 25563,
 64359,
 67660,
 43905,
 56349,
 32220,
 25813,
 28450,
 34545,
 41415,
 44142,
 11090,
 9750,
 35199,
 29510,
 32692,
 41000,
 43000,
 25000,
 10000,
 25329,
 27500,
 49059,
 44068,
 46961,
 110404,
 100250,
 84000,
 79375,
 75048,
 72215,
 64982,
 62636,
 57086,
 56000,
 49866,
 49163,
 45725,
 43210,
 39704,
 38950,
 37400,
 37177,
 36544,
 33511,
 32809,
 32181,
 30993,
 30400,
 30000,
 29719,
 29206,
 29198,
 28817,
 28227,
 28000,
 25266,
 23489,
 22575,
 22000,
 20019,
 20000,
 17003,
 16238,
 15414,
 8537,
 7000,
 66966,
 51884,
 50005,
 48110,
 37500,
 34472,
 33329,
 31850,
 30351,
 29435,
 25948,
 24500,
 23902,
 23175,
 19200,
 18000,
 16123,
 14635,
 13748,
 11500,
 7187,
 1,
 4000,
 1500,
 13253,
 6000,
 10841,
 63000,
 57313,
 57037,
 51099,
 40010,
 39115,
 36012,
 36000,
 35000,
 33740,
 33477,
 32627,
 29797,
 29441,
 

In [61]:
data = []
for i in np.sort(hr_data.MetColor.unique()):
        data.append(go.Box(y = hr_data.Price[hr_data.Automatic==i][hr_data.MetColor==1],
                           marker = dict(
        color = '#CC0E1D',
    ),
                           name = "{}- Yes".format(str(i))))
        data.append(go.Box(y = hr_data.Price[hr_data.Automatic==i][hr_data.MetColor==0],
                           marker = dict(
        color = '#588061',
    ),
                           name = "{}- No".format(str(i))))

layout = go.Layout(
autosize=False, # auto size the graph? use False if you are specifying the height and width
width=1000, # height of the figure in pixels
height=600, # height of the figure in pixels
title = "Boxplot of {} column based on {} ".format('Automatic','MetColor'), # title of the figure
# more granular control on the title font 
    titlefont=dict( 
        family='Courier New, monospace', # font family
        size=14, # size of the font
        color='black' # color of the font
    ),
    # granular control on the axes objects 
    xaxis=dict( 
        title='MetColor ',
    tickfont=dict(
        family='Courier New, monospace', # font family
        size=10, # size of ticks displayed on the x axis
        color='black'  # color of the font
    )
),
yaxis=dict(
#         range=[0,100],
    title='Automatic',
    titlefont=dict(
        size=14,
        color='black'
    ),
    tickfont=dict(
        family='Courier New, monospace', # font family
        size=14, # size of ticks displayed on the y axis
        color='black' # color of the font
    )
),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)        

In [62]:
n = 1500
# Extracting th x, y ,z  values 
temp = hr_data.iloc[0:n,]
temp.shape

(1436, 10)

In [63]:
trace1 = go.Scatter3d(
    x=temp.Price[temp.MetColor==1],
    y=temp.KM[temp.HP==90],
    z=temp.CC[temp.Doors==3],
    mode='markers',name = "Yes",
    marker=dict(
        #size=temp.Experience[temp.Online==1]+2,
        color='#CC0E1D', # ferarri red
#         colorscale='Viridis',   # choose a colorscale
        opacity=1
    )
)

trace2 = go.Scatter3d(
    x=temp.Price[temp.MetColor==1],
    y=temp.KM[temp.HP==90],
    z=temp.CC[temp.Doors==3],
    mode='markers',name ="No",
    marker=dict(
        #size=temp.YearsInCurrentRole[temp.CreditCard==0]+2,
        color='rgb(0,255,0)', #green
#         colorscale='Viridis',   # choose a colorscale
        opacity=0.9,

    )
)
data = [trace1,trace2]
layout = go.Layout(
                    scene = dict(
                    xaxis = dict(
                        title='MetColor',
                        backgroundcolor="black",
                        showbackground=True,
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                    ),
                    yaxis = dict(
                        title='HP',
                        showbackground=True,
                        backgroundcolor="black",
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                    ),
                    zaxis = dict(
                        title='Doors',
                        backgroundcolor="black",
                        showbackground=True,
                        titlefont=dict(
                                        size=16,
                                        color='black'
                                        )
                    )
                    ),
                        width=1000, # height of the figure in pixels
                        height=800, # height of the figure in pixels
                  )
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(title= "HP, MetColor, Doors")
iplot(fig, filename='3d-scatter-colorscale')
