## Segment 3 - Generating summary statistics using pandas and scipy

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import scipy
from scipy import stats

In [30]:
# Colab-only helper, left disabled. Presumably it was used to upload mtcars.csv
# into a Colab session before the dataset is read — TODO confirm, or delete this cell.
# from google.colab import files
# uploaded = files.upload()

In [2]:
# Read the dataset; index_col=0 makes the first CSV column the row index.
cars = pd.read_csv('mtcars.csv', index_col=0)

# Preview the first five rows.
cars.head(n=5)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


### Looking at summary statistics that describe a variable's numeric values

In [3]:
# Total of every column (axis=0 sums down the rows of each column).
cars.sum(axis=0)

mpg      642.900
cyl      198.000
disp    7383.100
hp      4694.000
drat     115.090
wt       102.952
qsec     571.160
vs        14.000
am        13.000
gear     118.000
carb      90.000
dtype: float64

In [4]:
# Collect every column label through the DataFrame's columns attribute.
numerical_cols = cars.columns
# Mean of each of those columns.
cars.loc[:, numerical_cols].mean()

mpg      20.090625
cyl       6.187500
disp    230.721875
hp      146.687500
drat      3.596563
wt        3.217250
qsec     17.848750
vs        0.437500
am        0.406250
gear      3.687500
carb      2.812500
dtype: float64

In [5]:
# Median of each column selected by numerical_cols.
cars.loc[:, numerical_cols].median()

mpg      19.200
cyl       6.000
disp    196.300
hp      123.000
drat      3.695
wt        3.325
qsec     17.710
vs        0.000
am        0.000
gear      4.000
carb      2.000
dtype: float64

In [6]:
# Largest value in every column (axis=0 scans down each column).
cars.max(axis=0)

mpg      33.900
cyl       8.000
disp    472.000
hp      335.000
drat      4.930
wt        5.424
qsec     22.900
vs        1.000
am        1.000
gear      5.000
carb      8.000
dtype: float64

In [9]:
# Pull out the mpg column as a Series and report its largest value.
mpg = cars['mpg']
mpg.max()

33.9

In [10]:
# Index label of the row holding the maximum value
# (a car name here, because the frame is indexed by its first CSV column).
mpg.idxmax()

'Toyota Corolla'

In [11]:
# Integer position (0-based) of the maximum value, as opposed to its index label.
mpg.argmax()

np.int64(19)

In [12]:
# Index label of the row holding the minimum value.
mpg.idxmin()

'Cadillac Fleetwood'

# Question: find the names of the cars with the maximum and minimum horsepower in our dataset.

In [14]:
# Answer: report which cars have the highest and the lowest horsepower.
#
# idxmax()/idxmin() return the index *label* of the extreme row (the car name,
# since the frame is indexed by car name), while argmax()/argmin() return its
# 0-based integer *position*. The variable names and the printed labels below
# keep those two notions apart.
horsepower = cars['hp']

max_hp = horsepower.max()
max_hp_car = horsepower.idxmax()        # index label -> car name
max_hp_position = horsepower.argmax()   # integer row position

min_hp = horsepower.min()
min_hp_car = horsepower.idxmin()        # index label -> car name
min_hp_position = horsepower.argmin()   # integer row position

print(f"Car with MAX horsepower: {max_hp_car} ({max_hp} hp), row position {max_hp_position}")
print(f"Car with MIN horsepower: {min_hp_car} ({min_hp} hp), row position {min_hp_position}")


Car with MAX hoursepower: 335 is at index Maserati Bora, name: 30
Car with MIN hoursepower: 52 is at index Honda Civic, name: 18


### Looking at summary statistics that describe variable distribution

In [15]:
# Sample standard deviation of every column (ddof=1 is the pandas default).
cars.std(ddof=1)

mpg       6.026948
cyl       1.785922
disp    123.938694
hp       68.562868
drat      0.534679
wt        0.978457
qsec      1.786943
vs        0.504016
am        0.498991
gear      0.737804
carb      1.615200
dtype: float64

In [16]:
# Sample variance of every column (ddof=1 is the pandas default).
cars.var(ddof=1)

mpg        36.324103
cyl         3.189516
disp    15360.799829
hp       4700.866935
drat        0.285881
wt          0.957379
qsec        3.193166
vs          0.254032
am          0.248992
gear        0.544355
carb        2.608871
dtype: float64

In [17]:
# Select the gear column (bracket access; equivalent to cars.gear)
# and count how many cars have each gear value.
gear = cars['gear']
gear.value_counts()

gear
3    15
4    12
5     5
Name: count, dtype: int64

In [18]:
# Summary table per column: count, mean, std, min, quartiles and max.
# The percentiles listed are the defaults, written out for clarity.
cars.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


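In the table above, the 25% row is the 25th percentile, which pandas computes by linear interpolation between the sorted values. With 32 rows the target position is 0.25 × (32 − 1) = 7.75 (0-based), so: res_25 = (sorted value at position 7) + ((sorted value at position 8) − (sorted value at position 7)) × (3/4)

So for mpg it would be 15.2 + (15.5 - 15.2) * (3/4) = 15.425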

Similarly, it calculates 50% as follows. With 32 rows the target position is 0.5 × (32 − 1) = 15.5 (0-based), so: res_50 = (sorted value at position 15) + ((sorted value at position 16) − (sorted value at position 15)) × (1/2)

and for 75% the target position is 0.75 × (32 − 1) = 23.25, so: res_75 = (sorted value at position 23) + ((sorted value at position 24) − (sorted value at position 23)) × (1/4)

