In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model

In [2]:
# Load the automobile dataset; the path is relative to the notebook's working directory.
auto = pd.read_csv(filepath_or_buffer='data/06-automobile.csv')
# Show the first five rows (head()'s default) as this cell's displayed output.
auto.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,drive-wheels,engine-location,wheel-base,length,...,horsepower,peak-rpm,city-mpg,highway-mpg,price,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
0,3,115.0,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,111.0,5000.0,21,27,13495.0,1,0,0,0,0
1,3,115.0,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,111.0,5000.0,21,27,16500.0,1,0,0,0,0
2,1,115.0,alfa-romero,gas,std,two,rwd,front,94.5,171.2,...,154.0,5000.0,19,26,16500.0,0,0,1,0,0
3,2,164.0,audi,gas,std,four,fwd,front,99.8,176.6,...,102.0,5500.0,24,30,13950.0,0,0,0,1,0
4,2,164.0,audi,gas,std,four,4wd,front,99.4,176.6,...,115.0,5500.0,18,22,17450.0,0,0,0,1,0


## Calculating the MSE and $R^2$ statistics

In [3]:
# Fit a linear regression that predicts price from engine size alone.
# The double brackets keep the single feature as a 2-D frame, which fit() expects for X.
regr = linear_model.LinearRegression()
regr.fit(X=auto[['engine-size']], y=auto['price'])

LinearRegression()

In [4]:
# Predict on the same rows the model was fitted on, then collect the squared
# residual (prediction minus observed price) for every row.
predictions = regr.predict(auto[['engine-size']])
differences = []
for predicted_price, observed_price in zip(predictions, auto['price']):
    residual = predicted_price - observed_price
    differences.append(residual ** 2)

In [5]:
# Mean squared error: the squared residuals averaged over all rows.
squared_error_total = sum(differences)
MSE = squared_error_total / len(differences)
print("MSE = %3.2f" % MSE)

MSE = 16050493.31


In [6]:
# R^2 = 1 - FVU, where FVU (fraction of variance unexplained) is the MSE
# divided by the variance of the observed prices.
price_variance = np.var(auto['price'])
FVU = MSE / price_variance  # Fraction of variance unexplained
R2 = 1 - FVU
print("R2 = %3.2f" % R2)

R2 = 0.74


## Classifier evaluation

In [7]:
# Binary target: True where the car's fuel type is 'gas', False otherwise.
y_class = (auto['fuel-type'] == 'gas').tolist()
# NOTE: `regr` is rebound here, replacing the linear regression fitted earlier
# with a logistic-regression classifier on the same single feature.
regr = linear_model.LogisticRegression()
regr.fit(X=auto[['engine-size']], y=y_class)

LogisticRegression()

In [8]:
# Classify the same rows the model was fitted on, then mark element-wise
# which predictions agree with the true labels.
predictions = regr.predict(auto[['engine-size']])
correct = np.equal(predictions, y_class)

In [9]:
# Calculate the accuracy
# (number of matching predictions divided by the number of predictions)
matching_predictions = sum(correct)
accuracy = matching_predictions / len(correct)
print("accuracy = %3.2f" % accuracy)

accuracy = 0.90


In [10]:
# Calculate true positive, etc
# Pair every prediction with its label once; each tally below counts the pairs
# that fall into one cell of the confusion matrix.
prediction_label_pairs = list(zip(predictions, y_class))

# Predicted positive, actually positive.
TP = sum((pred and label) for pred, label in prediction_label_pairs)
print("TP = %3.2f" % TP)

# Predicted positive, actually negative.
FP = sum((pred and not label) for pred, label in prediction_label_pairs)
print("FP = %3.2f" % FP)

# Predicted negative, actually negative.
TN = sum((not pred and not label) for pred, label in prediction_label_pairs)
print("TN = %3.2f" % TN)

# Predicted negative, actually positive.
FN = sum((not pred and label) for pred, label in prediction_label_pairs)
print("FN = %3.2f" % FN)

TP = 185.00
FP = 20.00
TN = 0.00
FN = 0.00


In [11]:
# Calculate the accuracy in another way
# (correct outcomes, TP + TN, over all four confusion-matrix cells)
confusion_total = TP + FP + TN + FN
acc2 = (TP + TN) / confusion_total
print("accuracy = %3.2f" % acc2)

accuracy = 0.90


In [12]:
# Precision = TP / (TP + FP): of everything predicted positive, the share that is truly positive.
# Recall    = TP / (TP + FN): of everything truly positive, the share that was predicted positive.
# Either denominator can be zero (no positive predictions, or no positive labels).
# The metric is undefined in that case, so report NaN instead of dividing by zero.
predicted_positives = TP + FP
actual_positives = TP + FN
precision = TP / predicted_positives if predicted_positives else float("nan")
recall = TP / actual_positives if actual_positives else float("nan")
print("precision = %3.2f; recall = %3.2f" % (precision, recall))

precision = 0.90; recall = 1.00


In [13]:
# F1 is the harmonic mean of precision and recall.
# If both are zero the harmonic-mean formula is 0/0; F1 is 0 in that case, since
# F1 = 2*TP / (2*TP + FP + FN) and there are no true positives.
# A NaN input (undefined precision or recall) is truthy and propagates as NaN.
precision_plus_recall = precision + recall
F1 = (2 * (precision * recall) / precision_plus_recall) if precision_plus_recall else 0.0
print("F1 = %3.2f" % F1)

F1 = 0.95
