In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the automobile dataset and preview its first rows
automobile_csv = 'data/06-automobile.csv'
auto = pd.read_csv(automobile_csv)
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,drive-wheels,engine-location,wheel-base,length,...,horsepower,peak-rpm,city-mpg,highway-mpg,price,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
0,3,115.0,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,111.0,5000.0,21,27,13495.0,1,0,0,0,0
1,3,115.0,alfa-romero,gas,std,two,rwd,front,88.6,168.8,...,111.0,5000.0,21,27,16500.0,1,0,0,0,0
2,1,115.0,alfa-romero,gas,std,two,rwd,front,94.5,171.2,...,154.0,5000.0,19,26,16500.0,0,0,1,0,0
3,2,164.0,audi,gas,std,four,fwd,front,99.8,176.6,...,102.0,5500.0,24,30,13950.0,0,0,0,1,0
4,2,164.0,audi,gas,std,four,4wd,front,99.4,176.6,...,115.0,5500.0,18,22,17450.0,0,0,0,1,0


## Classification and logistic regression

In [3]:
# Fit a logistic regression that predicts the fuel type from engine size alone
engine_size = auto[['engine-size']]
fuel_type = auto['fuel-type']
regr = linear_model.LogisticRegression()
regr.fit(engine_size, fuel_type)

LogisticRegression()

In [4]:
# Predict the fuel type of every car from its engine size
predictions = regr.predict(auto.loc[:, ['engine-size']])

In [5]:
# Element-wise flag: True where the predicted fuel type equals the true label
correctPredictions = auto['fuel-type'].eq(predictions)

In [6]:
# Fraction of correct predictions, i.e. the accuracy (not the error rate)
correctPredictions.mean()

0.9024390243902439

## Classifier evaluation

In [7]:
# Binary target: True for gas-powered cars, False for any other fuel type
y_class = auto['fuel-type'].eq('gas').tolist()
regr = linear_model.LogisticRegression()
regr.fit(auto[['engine-size']], y_class)

LogisticRegression()

In [8]:
# Predict on the rows the model was fitted on, then flag which predictions
# agree with the binary labels
predictions = regr.predict(auto[['engine-size']])
correct = np.equal(predictions, y_class)

In [9]:
# Accuracy = number of correct predictions / total number of predictions
n_correct = sum(correct)
accuracy = n_correct / len(correct)
print("accuracy = %3.2f" % accuracy)

accuracy = 0.90


In [10]:
# Tally the four confusion-matrix cells in a single pass over the
# (prediction, label) pairs
TP = FP = TN = FN = 0
for pred, label in zip(predictions, y_class):
    TP += (pred and label)          # predicted True, label True
    FP += (pred and not label)      # predicted True, label False
    TN += (not pred and not label)  # predicted False, label False
    FN += (not pred and label)      # predicted False, label True

print("TP = %3.2f" % (TP))
print("FP = %3.2f" % (FP))
print("TN = %3.2f" % (TN))
print("FN = %3.2f" % (FN))

TP = 185.00
FP = 20.00
TN = 0.00
FN = 0.00


In [11]:
# Accuracy from the confusion matrix: correct outcomes over all outcomes
n_correct_outcomes = TP + TN
n_outcomes = TP + FP + TN + FN
acc2 = n_correct_outcomes / n_outcomes
print("accuracy = %3.2f" % acc2)

accuracy = 0.90


In [12]:
# Precision: share of predicted positives that are truly positive.
# Recall: share of actual positives that the model predicted as positive.
# A denominator is zero when the model predicts no positives (precision) or
# the labels contain no positives (recall); report 0.0 instead of failing.
predicted_positives = TP + FP
actual_positives = TP + FN
precision = TP / predicted_positives if predicted_positives else 0.0
recall = TP / actual_positives if actual_positives else 0.0
print("precision = %3.2f; recall = %3.2f" % (precision, recall))

precision = 0.90; recall = 1.00


In [13]:
# F1 is the harmonic mean of precision and recall. When both are zero the
# formula would divide by zero, so fall back to 0.0 in that case.
pr_sum = precision + recall
F1 = 2 * (precision * recall) / pr_sum if pr_sum else 0.0
print("F1 = %3.2f" % (F1))

F1 = 0.95


## Classification with categorical features

In [14]:
# Load the insurance dataset and preview its first rows
insurance_csv = 'data/05-insurance.csv'
df = pd.read_csv(insurance_csv)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
# One-hot encode 'smoker': the column is replaced by one indicator column
# per category
df_with_dummy = pd.get_dummies(data=df, columns=['smoker'])
df_with_dummy.head()

Unnamed: 0,age,sex,bmi,children,region,charges,smoker_no,smoker_yes
0,19,female,27.9,0,southwest,16884.924,0,1
1,18,male,33.77,1,southeast,1725.5523,1,0
2,28,male,33.0,3,southeast,4449.462,1,0
3,33,male,22.705,0,northwest,21984.47061,1,0
4,32,male,28.88,0,northwest,3866.8552,1,0


In [16]:
# Data preparation

# Features: age, BMI and the two smoker indicator columns
feature_columns = ['age', 'bmi', 'smoker_no', 'smoker_yes']
x_class = df_with_dummy[feature_columns]

# Target: True when the charge is at or below the median charge
med = df["charges"].median()
y_class = df["charges"].le(med).tolist()

In [17]:
# Create the logistic regression model, then fit it to the data
regr = linear_model.LogisticRegression()
regr.fit(X=x_class, y=y_class)

LogisticRegression()

In [18]:
# Make predictions from the data
predictions = regr.predict(x_class)

# Check whether they match the labels
correctPredictions = predictions == y_class

# Accuracy: the fraction of predictions that match the labels
# (this is the share of correct predictions, not the error rate)
np.mean(correctPredictions)

0.905829596412556

In [19]:
# Encode the 'smoker' labels as integers with LabelEncoder.
# The encoder accepts the string column directly, so no prior cast to
# 'category' and no de-duplication before fitting is needed.
# NOTE: this overwrites df['smoker'] in place; cells that expect the original
# string labels must be run before this one.
label = LabelEncoder()
df['smoker'] = label.fit_transform(df['smoker'])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552


In [20]:
# Data preparation: age, BMI and the single integer-encoded 'smoker' column
x_class = df.loc[:, ['age', 'bmi', 'smoker']]

# Create the logistic regression model, then fit it to the data
regr = linear_model.LogisticRegression()
regr.fit(X=x_class, y=y_class)

LogisticRegression()

In [21]:
# Make predictions from the data
predictions = regr.predict(x_class)

# Check whether they match the labels
correctPredictions = predictions == y_class

# Accuracy: the fraction of predictions that match the labels
# (this is the share of correct predictions, not the error rate)
np.mean(correctPredictions)

0.905829596412556

## Multi-class Classification

In [22]:
# Data preparation: the target is now the multi-class 'region' label
y_class = df['region']

# Show how many rows fall into each region
region_counts = df.groupby('region').size()
print(region_counts)

region
northeast    324
northwest    325
southeast    364
southwest    325
dtype: int64


In [23]:
# Rescale every feature column with min-max scaling; the scaler is fitted on
# x_class and immediately applied to it
scaler = MinMaxScaler()
X_class = scaler.fit_transform(X=x_class)

In [24]:
# K-Nearest Neighbors classifier with its default settings
knn = KNeighborsClassifier().fit(X_class, y_class)

# The score below is measured on the same data the model was fitted on
train_accuracy = knn.score(X_class, y_class)
print('Accuracy of K-NN classifier: {:.2f}'.format(train_accuracy))

Accuracy of K-NN classifier: 0.53
