In [83]:
import pandas as pd
import numpy as np

In [84]:
auto_mpg_df = pd.read_csv('autompg.csv')
# split the data frame based on country of origin
auto_mpg_us_df = auto_mpg_df[auto_mpg_df['origin'] == 1]
auto_mpg_japan_df = auto_mpg_df[auto_mpg_df['origin'] == 3]
auto_mpg_germany_df = auto_mpg_df[auto_mpg_df['origin'] == 2]

In [85]:
# split based on number of cylinders
# for US cars
auto_mpg_us_4_cylinders_df = auto_mpg_us_df[auto_mpg_us_df['cylinders'] == 4]
auto_mpg_us_6_cylinders_df = auto_mpg_us_df[auto_mpg_us_df['cylinders'] == 6]
# for Japanese cars
auto_mpg_japan_4_cylinders_df = auto_mpg_japan_df[auto_mpg_japan_df['cylinders'] == 4]
auto_mpg_japan_6_cylinders_df = auto_mpg_japan_df[auto_mpg_japan_df['cylinders'] == 6]
# for German cars
auto_mpg_germany_4_cylinders_df = auto_mpg_germany_df[auto_mpg_germany_df['cylinders'] == 4]
auto_mpg_germany_6_cylinders_df = auto_mpg_germany_df[auto_mpg_germany_df['cylinders'] == 6]

We know that Japanese cars are more fuel efficient than US and German cars now. However this data is for cars between the years of 1970 and 1982.
Let's examine if the data shows any statistical significance.

##Test Case I
###For 4 cylinder cars 
####Null hypothesis:
Mean_mpg_Japanese - Mean_mpg_German > 0

and 

Mean_mpg_German - Mean_mpg_US > 0
####Alternative hypothesis
Mean_mpg_Japanese - Mean_mpg_German <= 0

and

Mean_mpg_German - Mean_mpg_US <= 0

In [86]:
mean_mpg_Japanese_4_cyl = auto_mpg_japan_4_cylinders_df['mpg'].mean()
std_mpg_Japanese_4_cyl = auto_mpg_japan_4_cylinders_df['mpg'].std()
print mean_mpg_Japanese_4_cyl, std_mpg_Japanese_4_cyl

31.5956521739 5.43578650575


In [87]:
mean_mpg_German_4_cyl = auto_mpg_germany_4_cylinders_df['mpg'].mean()
std_mpg_German_4_cyl = auto_mpg_germany_4_cylinders_df['mpg'].std()
print mean_mpg_German_4_cyl, std_mpg_German_4_cyl

28.106557377 6.29107486451


In [88]:
mean_mpg_US_4_cyl = auto_mpg_us_4_cylinders_df['mpg'].mean()
std_mpg_US_4_cyl = auto_mpg_us_4_cylinders_df['mpg'].std()
print mean_mpg_US_4_cyl, std_mpg_US_4_cyl

28.0130434783 4.56659588249


Parameter of interest = mean1 - mean2

So SE = sqrt(SE1 x SE1/N1 + SE2 x SE2/N2)

In [89]:
se_japan_german = np.sqrt(std_mpg_Japanese_4_cyl**2/len(auto_mpg_japan_4_cylinders_df) + std_mpg_German_4_cyl**2/len(auto_mpg_germany_4_cylinders_df))
se_japan_german

1.0378063948823995

So for a 95% confidence interval of the mean

In [90]:
diff_japan_german_upper = (mean_mpg_Japanese_4_cyl - mean_mpg_German_4_cyl) + 1.96 * se_japan_german
diff_japan_german_lower = (mean_mpg_Japanese_4_cyl - mean_mpg_German_4_cyl) - 1.96 * se_japan_german
print "Confidence interval is between", diff_japan_german_upper, diff_japan_german_lower

Confidence interval is between 5.52319533083 1.45499426289


This means we are 95% confident that the difference in mean mpg between Japanese and German cars lies between 5.52 and 1.45

Lets also calculate the p value

In [91]:
z_japan_germany = ((mean_mpg_Japanese_4_cyl - mean_mpg_German_4_cyl) - 0)/se_japan_german
z_japan_germany

3.3619900725888554

We are interested in the right side of the normal curve which is found from the tables to be 0.9996

###This means we are 99.96% confident that our Null hypothesis is correct for Japanese and German cars

Now lets compare 4 cylinder German cars and US cars

In [92]:
se_german_us = np.sqrt(std_mpg_German_4_cyl**2/len(auto_mpg_germany_4_cylinders_df) + std_mpg_US_4_cyl**2/len(auto_mpg_us_4_cylinders_df))
se_german_us

0.97521405181200982

So for a 95% confidence interval of the mean

In [93]:
diff_us_german_upper = (mean_mpg_German_4_cyl - mean_mpg_US_4_cyl) + 1.96 * se_german_us
diff_us_german_lower = (mean_mpg_German_4_cyl - mean_mpg_US_4_cyl) - 1.96 * se_german_us
print "Confidence interval is between", diff_us_german_upper, diff_us_german_lower

Confidence interval is between 2.00493344034 -1.81790564276


This is by no means a conclusive answer and we definitely need to look at the p value

In [94]:
z_us_germany = ((mean_mpg_German_4_cyl - mean_mpg_US_4_cyl) - 0)/se_german_us
z_us_germany

0.095890639203308647

We are interested in the right side of the normal curve which is found from the tables to be 0.5359

###This means we are 53.59% confident that our Null hypothesis is correct for US and German cars



##Test Case II
###For 6 cylinder cars 
####Null hypothesis:
Mean_mpg_Japanese - Mean_mpg_German > 0

and 

Mean_mpg_German - Mean_mpg_US > 0
####Alternative hypothesis
Mean_mpg_Japanese - Mean_mpg_German <= 0

and

Mean_mpg_German - Mean_mpg_US <= 0

In [95]:
mean_mpg_Japanese_6_cyl = auto_mpg_japan_6_cylinders_df['mpg'].mean()
std_mpg_Japanese_6_cyl = auto_mpg_japan_6_cylinders_df['mpg'].std()
print mean_mpg_Japanese_6_cyl, std_mpg_Japanese_6_cyl

23.8833333333 4.95193564848


In [96]:
mean_mpg_German_6_cyl = auto_mpg_germany_6_cylinders_df['mpg'].mean()
std_mpg_German_6_cyl = auto_mpg_germany_6_cylinders_df['mpg'].std()
print mean_mpg_German_6_cyl, std_mpg_German_6_cyl

20.1 7.07436687391


Parameter of interest = mean1 - mean2

So SE = sqrt(SE1 x SE1/N1 + SE2 x SE2/N2)

In [97]:
se_japan_german = np.sqrt(std_mpg_Japanese_6_cyl**2/len(auto_mpg_japan_6_cylinders_df) + std_mpg_German_6_cyl**2/len(auto_mpg_germany_6_cylinders_df))
se_japan_german

4.0741393092420282

So for a 95% confidence interval of the mean

In [98]:
diff_japan_german_upper = (mean_mpg_Japanese_6_cyl - mean_mpg_German_6_cyl) + 1.96 * se_japan_german
diff_japan_german_lower = (mean_mpg_Japanese_6_cyl - mean_mpg_German_6_cyl) - 1.96 * se_japan_german
print "Confidence interval is between", diff_japan_german_upper, diff_japan_german_lower

Confidence interval is between 11.7686463794 -4.20197971278


This is by no means a conclusive answer and we definitely need to look at the p value

In [99]:
z_japan_germany = ((mean_mpg_Japanese_6_cyl - mean_mpg_German_6_cyl) - 0)/se_japan_german
z_japan_germany

0.92862149429966268

From tables p value os 0.8212

###So the Null hypothesis for Japanese and German still holds for 6 cylinders

Now lets compare 6 cylinder engines of German and US make

In [100]:
mean_mpg_US_6_cyl = auto_mpg_us_6_cylinders_df['mpg'].mean()
std_mpg_US_6_cyl = auto_mpg_us_6_cylinders_df['mpg'].std()
print mean_mpg_US_6_cyl, std_mpg_US_6_cyl

19.6452054795 3.39464615632


In [101]:
se_german_us = np.sqrt(std_mpg_German_6_cyl**2/len(auto_mpg_germany_6_cylinders_df) + std_mpg_US_6_cyl**2/len(auto_mpg_us_6_cylinders_df))
se_german_us

3.559427553623939

So for a 95% confidence interval of the mean

In [102]:
diff_us_german_upper = (mean_mpg_German_6_cyl - mean_mpg_US_6_cyl) + 1.96 * se_german_us
diff_us_german_lower = (mean_mpg_German_6_cyl - mean_mpg_US_6_cyl) - 1.96 * se_german_us
print "Confidence interval is between", diff_us_german_upper, diff_us_german_lower

Confidence interval is between 7.43127252565 -6.52168348455


This is by no means a conclusive answer and we definitely need to look at the p value

In [103]:
z_us_germany = ((mean_mpg_German_6_cyl - mean_mpg_US_6_cyl) - 0)/se_german_us
z_us_germany

0.12777181546648197

The right hand side of the curve yields a p value of 0.5478

###So again the Null hypothesis holds for 6 cylinder engines of German and US make

In [105]:
auto_mpg_us_4_cylinders_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
30,28.0,4,140.0,90,2264,15.5,71,1,chevrolet vega 2300
45,22.0,4,140.0,72,2408,19.0,71,1,chevrolet vega (sw)
48,23.0,4,122.0,86,2220,14.0,71,1,mercury capri 2000
55,26.0,4,91.0,70,1955,20.5,71,1,plymouth cricket
57,25.0,4,97.5,80,2126,17.0,72,1,dodge colt hardtop
59,20.0,4,140.0,90,2408,19.5,72,1,chevrolet vega
60,21.0,4,122.0,86,2226,16.5,72,1,ford pinto runabout
79,22.0,4,122.0,86,2395,16.0,72,1,ford pinto (sw)
82,28.0,4,98.0,80,2164,15.0,72,1,dodge colt (sw)
108,21.0,4,140.0,72,2401,19.5,73,1,chevrolet vega
