## Segment 3 - Generating summary statistics using pandas and scipy

In [29]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy import stats

In [30]:
# Colab-specific step: ask the user to upload file(s) through Colab's `files`
# helper and keep the call's return value in `uploaded`.
# NOTE(review): presumably this is how mtcars.csv (read in the next cell) gets
# into the runtime's working directory — confirm.
# NOTE(review): `google.colab` is presumably only importable inside Colab, so
# this cell would fail in a plain Jupyter kernel — confirm before running elsewhere.
from google.colab import files
uploaded = files.upload()

In [31]:
# Read the dataset. Column 0 of the CSV (the car names, as the preview output
# shows) becomes the row index, so every remaining column is a measurement and
# no manual column renaming is required.
cars = pd.read_csv(filepath_or_buffer='mtcars.csv', index_col=0)

# Preview the first five rows.
cars.head(n=5)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Looking at summary statistics that describe a variable's numeric values

In [32]:
# Column-wise totals: one sum per variable, taken down the rows.
cars.sum(axis=0)

Unnamed: 0,0
mpg,642.9
cyl,198.0
disp,7383.1
hp,4694.0
drat,115.09
wt,102.952
qsec,571.16
vs,14.0
am,13.0
gear,118.0


In [33]:
# Select the numeric columns by dtype instead of taking every column, so the
# name `numerical_cols` matches its contents and mean() is never handed a text
# column should the frame contain one. For an all-numeric frame this is the
# same set of columns as `cars.columns`.
numerical_cols = cars.select_dtypes(include='number').columns

# Arithmetic mean of each numeric column.
cars[numerical_cols].mean()

Unnamed: 0,0
mpg,20.090625
cyl,6.1875
disp,230.721875
hp,146.6875
drat,3.596563
wt,3.21725
qsec,17.84875
vs,0.4375
am,0.40625
gear,3.6875


In [34]:
# Median (50th percentile) of each column listed in `numerical_cols`.
cars.loc[:, numerical_cols].median(axis=0)

Unnamed: 0,0
mpg,19.2
cyl,6.0
disp,196.3
hp,123.0
drat,3.695
wt,3.325
qsec,17.71
vs,0.0
am,0.0
gear,4.0


In [35]:
# Largest value observed in each column.
cars.max(axis=0)

Unnamed: 0,0
mpg,33.9
cyl,8.0
disp,472.0
hp,335.0
drat,4.93
wt,5.424
qsec,22.9
vs,1.0
am,1.0
gear,5.0


In [36]:
# Pull out the mpg column as a Series (bracket access instead of attribute access).
mpg = cars['mpg']

# Index label (the car name) of the row holding the highest mpg value.
mpg.idxmax()


'Toyota Corolla'

In [37]:
# Index label (the car name) of the row holding the lowest mpg value;
# missing values are skipped, which is the default behaviour made explicit.
mpg.idxmin(skipna=True)

'Cadillac Fleetwood'

# Question: find the names of the cars with the maximum and minimum horsepower in our dataset.

In [38]:
# For the hp column, pair each extreme's index label (the car name) with the
# extreme value itself: first the maximum, then the minimum.
max_hp_car, max_hp = cars['hp'].idxmax(), cars['hp'].max()
min_hp_car, min_hp = cars['hp'].idxmin(), cars['hp'].min()

# Report both extremes, maximum first.
print(f"Car with MAX horsepower: {max_hp_car} ({max_hp} hp)")
print(f"Car with MIN horsepower: {min_hp_car} ({min_hp} hp)")


Car with MAX horsepower: Maserati Bora (335 hp)
Car with MIN horsepower: Honda Civic (52 hp)


### Looking at summary statistics that describe variable distribution

In [39]:
# Sample standard deviation of each column (ddof=1 divides by n - 1; this is
# the pandas default, written out here for clarity).
cars.std(axis=0, ddof=1)

Unnamed: 0,0
mpg,6.026948
cyl,1.785922
disp,123.938694
hp,68.562868
drat,0.534679
wt,0.978457
qsec,1.786943
vs,0.504016
am,0.498991
gear,0.737804


In [40]:
# Sample variance of each column (ddof=1 divides by n - 1; this is the pandas
# default, written out here for clarity).
cars.var(axis=0, ddof=1)

Unnamed: 0,0
mpg,36.324103
cyl,3.189516
disp,15360.799829
hp,4700.866935
drat,0.285881
wt,0.957379
qsec,3.193166
vs,0.254032
am,0.248992
gear,0.544355


In [41]:
# Take the gear column as a Series (equivalent to attribute access, cars.gear).
gear = cars['gear']

# Frequency table: number of rows for each distinct gear value.
gear.value_counts()

Unnamed: 0_level_0,count
gear,Unnamed: 1_level_1
3,15
4,12
5,5


In [42]:
# Summary table per column: count, mean, std, min, the three quartiles and max.
# The percentiles passed here are the defaults, spelled out explicitly.
cars.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


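In the table above, the 25% row is the 25th percentile, computed by linear interpolation between two neighbouring values of the sorted column. The position of a percentile q is (n - 1) * q; with n = 32 rows and q = 0.25 this is 7.75, so the result lies 3/4 of the way from the value at sorted index 7 (0-based) to the value at index 8: `res_25 = (value falling at 25% index) + ((value falling one after 25% index) - (value falling at 25% index)) * (3/4)`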

So for the mpg column in our case it would be 15.2 + (15.5 - 15.2) * (3/4) = 15.425, which matches the 25% value shown for mpg in the table.

Similarly, it calculates 50% (the median) as follows: `res_50 = (value falling at 50% index) + ((value falling one after 50% index) - (value falling at 50% index)) * (1/2)`

And for 75%: `res_75 = (value falling at 75% index) + ((value falling one after 75% index) - (value falling at 75% index)) * (1/4)`

