In [113]:
# Reload imported modules automatically before each cell runs (autoreload
# mode 2) and render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [115]:
from pandas.tools.plotting import scatter_matrix

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Class, for use in pipelines, to select certain columns from a DataFrame
# and convert them to a numpy array.
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow,
# O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as
# pandas.DataFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns from a DataFrame and returns a numpy array.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to select from the incoming DataFrame.
    dtype : optional
        Anything accepted by ``pandas.DataFrame.astype``. When given, the
        selected columns are cast before being converted to an array.
        ``None`` (the default) means "no cast".
    """

    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a numpy array."""
        X_selected = X[self.attribute_names]
        # Compare against None rather than testing truthiness: a plain
        # ``np.dtype`` instance has length 0 and is therefore falsy, so
        # ``if self.dtype:`` would silently skip the requested cast.
        if self.dtype is not None:
            return X_selected.astype(self.dtype).values
        return X_selected.values
    
    
# Class, for use in pipelines, to binarize nominal-valued features (while
# avoiding the dummy variable trap)
# By Derek Bridge, 2017
class FeatureBinarizer(BaseEstimator, TransformerMixin):
    """One-hot encode nominal-valued features, dropping one column per feature.

    Each feature is first label-encoded against its list of known values and
    then one-hot encoded. The last one-hot column of every feature is removed
    so that the remaining columns are not linearly dependent (the "dummy
    variable trap").

    Parameters
    ----------
    features_values : sequence of sequences
        ``features_values[i]`` holds every value that column ``i`` of the
        input may take.
    """

    def __init__(self, features_values):
        self.features_values = features_values
        self._build_encoders()

    def _build_encoders(self):
        """(Re)create all state that is derived from ``features_values``."""
        self.num_features = len(self.features_values)
        self.labelencodings = [LabelEncoder().fit(feature_values)
                               for feature_values in self.features_values]
        value_counts = [len(feature_values)
                        for feature_values in self.features_values]
        # NOTE(review): ``sparse`` and ``n_values`` are the keyword names the
        # original code used; newer scikit-learn releases may spell these
        # differently - confirm against the installed version before upgrading.
        self.onehotencoder = OneHotEncoder(sparse=False, n_values=value_counts)
        # Index of the LAST one-hot column of each feature. Feature i occupies
        # columns [sum(counts[:i]), sum(counts[:i+1]) - 1], so the last column
        # is cumsum(counts) - 1. (Accumulating ``count - 1`` instead is only
        # right for the first feature and, with three or more features, can
        # remove two columns from one feature and none from another.)
        self.last_indexes = np.cumsum(value_counts, dtype=int) - 1

    def _label_encode(self, X):
        """Return a new integer array holding the label codes of ``X``.

        ``X`` itself is left untouched so that the caller can pass the same
        array to ``fit`` and ``transform``, or call ``transform`` repeatedly.
        """
        X = np.asarray(X)
        encoded = np.empty((X.shape[0], self.num_features), dtype=int)
        for i in range(self.num_features):
            encoded[:, i] = self.labelencodings[i].transform(X[:, i])
        return encoded

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the label codes of ``X``; returns ``self``."""
        self.onehotencoder.fit(self._label_encode(X))
        return self

    def transform(self, X, y=None):
        """Binarize ``X`` and drop the last one-hot column of every feature."""
        onehotencoded = self.onehotencoder.transform(self._label_encode(X))
        return np.delete(onehotencoded, self.last_indexes, axis=1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its binarized form."""
        return self.fit(X).transform(X)

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict."""
        return {"features_values" : self.features_values}

    def set_params(self, **parameters):
        """Set parameters by name and return ``self``.

        When ``features_values`` changes, the encoders derived from it are
        rebuilt so they never go stale.
        """
        for parameter, value in parameters.items():
            # Built-in setattr: objects have no ``self.setattr`` method.
            setattr(self, parameter, value)
        if "features_values" in parameters:
            self._build_encoders()
        return self
    
# Class, for use in pipelines, to impute missing values but which overcomes
# a problem that scikit-learn's class has when imputing modes on
# nominal-valued features
# From https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
# Original has a casting problem

In [116]:
# Read the alcohol dataset CSV into a DataFrame (the path is relative to the
# current working directory).
alc_df= pd.read_csv('CS4618Resources/datasets/dataset_alcohol.csv')

In [117]:
# (rows, columns) of the data as loaded.
alc_df.shape

(76, 9)

In [118]:
# Summary statistics for every column, numeric and non-numeric alike.
alc_df.describe(include = 'all')

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
count,76.0,76.0,76.0,76.0,76.0,76,76,76.0,76
unique,,,,16.0,9.0,2,5,,2
top,,,,120.0,10.0,Male,Full,,No
freq,,,,15.0,61.0,60,33,,46
mean,22.657895,176.644737,71.486842,,,,,8.632895,
std,5.627439,8.453329,11.474602,,,,,5.775567,
min,18.0,157.0,47.0,,,,,0.0,
25%,19.0,172.0,63.0,,,,,4.275,
50%,21.0,177.0,72.0,,,,,8.4,
75%,23.0,182.0,79.0,,,,,12.1,


In [119]:
# Column dtypes as inferred when the CSV was read.
alc_df.dtypes

age_yrs            int64
height_cm          int64
weight_kg          int64
duration_mins     object
elapsed_mins      object
sex               object
last_meal         object
units            float64
over_limit        object
dtype: object

Build a system that will predict if a reveller is under or over the legal alcohol limit. We want a yes-or-no answer, so we'll use logistic regression.

Note that the data is unordered. Should probably still shuffle, but it's not as necessary.

In this notebook I just removed the rows with missing values; imputing the values is worth trying too, since the dataset is so small.
In fact, the two people whose values I was going to impute were not drinking at all, so their rows should simply be removed or the values should be changed to 0.
Instead of removing them, try replacing the values with '0's.

To do:

Change both mins columns to numeric values / investigate if the '?' values are corrupting the numeric data.

Note: I was so fixated on dealing with the question marks that I forgot about the one in the last_meal column.
Also: I tried to put 0s in, which was wrong. It may have been okay for duration_mins, but it didn't actually make sense for elapsed_mins.

In [120]:
#This tells you how many ? marks are in the nominal valued data.
# Count the '?' placeholders in each column that was read in with object dtype.
object_columns = ['duration_mins', 'elapsed_mins', 'sex', 'last_meal', 'over_limit']
for column_name in object_columns:
    placeholder_count = (alc_df[column_name] == '?').sum()
    print(column_name, placeholder_count)

duration_mins 2
elapsed_mins 2
sex 0
last_meal 1
over_limit 0


In [121]:
# Drop every row that has a '?' placeholder in any of these columns, then
# renumber the index from 0.
placeholder_columns = ("duration_mins", "elapsed_mins", "last_meal")
keep_row = np.ones(len(alc_df), dtype=bool)
for column_name in placeholder_columns:
    keep_row &= (alc_df[column_name] != "?").values
alc_df = alc_df[keep_row].reset_index(drop=True)

# An earlier attempt overwrote the '?' entries in duration_mins/elapsed_mins
# with '0' (via set_value on rows 1 and 32) instead of dropping the rows; it
# was abandoned. To double-check the filter, inspect
# alc_df["duration_mins"].unique() and alc_df["elapsed_mins"].unique().

alc_df.shape

(73, 9)

In [122]:
#Change duration and elapsed mins to numeric values
# With the '?' rows gone, the two minutes columns can be parsed as numbers.
for minutes_column in ("duration_mins", "elapsed_mins"):
    alc_df[minutes_column] = pd.to_numeric(alc_df[minutes_column])

In [123]:
# Re-check the summary now that the '?' rows are dropped and the minutes
# columns are numeric.
alc_df.describe(include='all')

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
count,73.0,73.0,73.0,73.0,73.0,73,73,73.0,73
unique,,,,,,2,4,,2
top,,,,,,Male,Full,,No
freq,,,,,,58,32,,43
mean,22.479452,176.821918,71.712329,157.39726,16.369863,,,8.926027,
std,5.336222,8.554308,11.558402,100.588924,25.539461,,,5.687926,
min,18.0,157.0,47.0,5.0,5.0,,,1.2,
25%,19.0,172.0,63.0,90.0,10.0,,,4.8,
50%,21.0,177.0,72.0,120.0,10.0,,,9.1,
75%,23.0,182.0,79.0,240.0,10.0,,,12.4,


In [124]:
# Shuffle the rows. The permutation comes from a seeded generator so that a
# fresh Restart & Run All always produces the same row order.
SHUFFLE_SEED = 42
alc_df = alc_df.take(np.random.RandomState(SHUFFLE_SEED).permutation(len(alc_df)))


In [125]:
# Columns handled by each branch of the preprocessing.
numeric_features = ["age_yrs", "height_cm", "weight_kg", "duration_mins", "elapsed_mins", "units"]
nominal_features = ["sex", "last_meal"]

# Numeric branch: select the numeric columns, then standardise them.
# NOTE: min-max scaling may be worth trying as an alternative.
numeric_steps = [
    ("selector", DataFrameSelector(numeric_features)),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)

# Nominal branch: select the nominal columns, then binarize them using the
# values that actually occur in the data.
# NOTE: imputing the one bad last_meal value (rather than dropping its row)
# would have been a good idea.
nominal_values = [alc_df[feature].unique() for feature in nominal_features]
nominal_steps = [
    ("selector", DataFrameSelector(nominal_features)),
    ("binarizer", FeatureBinarizer(nominal_values)),
]
nominal_pipeline = Pipeline(nominal_steps)

# Full pipeline: both branches side by side, feeding a logistic regression.
preprocessor = FeatureUnion([
    ("numeric_pipeline", numeric_pipeline),
    ("nominal_pipeline", nominal_pipeline),
])
pipeline = Pipeline([
    ("union", preprocessor),
    ("estimator", LogisticRegression()),
])

# TODO: compare against the majority classifier.
# TODO: add a confusion matrix to see where the model goes wrong.


In [126]:
# The target column holds string labels; the raw strings are not used as-is
# but encoded as the integers 0 and 1 for the estimator and the scoring.
y = alc_df["over_limit"].values
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)

In [129]:
#Holdout - repeated holdout is when you specify n_splits > 1.
ss = ShuffleSplit(n_splits=1, train_size=0.8)
np.mean(cross_val_score(pipeline, alc_df, y_encoded, scoring="neg_mean_absolute_error", cv=ss))
#Cross fold validation isn't great here.
#np.mean(cross_val_score(pipeline, alc_df, y_encoded, scoring="neg_mean_absolute_error", cv=10))



-0.25

In [128]:
#pipeline.fit(alc_df, y_encoded)