# Predicting the drinking habits of teenagers

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Load the semicolon-separated student survey; the 'sex' column is exposed as 'gender'.
student = (
    pd.read_csv("../data/student/student.csv", sep=";")
    .rename(columns={'sex': 'gender'})
)
# Weighted mean of the two alcohol scores (weights 5 and 2, normalised by 7).
# NOTE(review): presumably Dalc = workday and Walc = weekend consumption,
# i.e. 5 + 2 days of the week — confirm against the dataset description.
student['alcohol_index'] = (student['Dalc'] * 5 + student['Walc'] * 2) / 7
# Alcohol consumption level: an index of at most 2 counts as 'Low', anything above as 'High'.
is_low_consumer = student['alcohol_index'] <= 2
student['acl'] = is_low_consumer.map({True: 'Low', False: 'High'})

In [3]:
# Preview the first three rows, including the derived alcohol_index / acl columns.
student.head(n=3)

Unnamed: 0,school,gender,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,goout,Dalc,Walc,health,absences,G1,G2,G3,alcohol_index,acl
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,1,1,3,4,0,11,11,1.0,Low
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,1,1,3,2,9,11,11,1.0,Low
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,2,2,3,3,6,12,13,12,2.285714,High


In [4]:
# Column the classifiers learn to predict (alcohol consumption level).
target = 'acl'
# Predictor columns; this order is the column order of the model input matrix.
features = [
    'gender',
    'famsize',
    'age',
    'studytime',
    'famrel',
    'goout',
    'freetime',
    'G3',
]

## Important: Scikit-learn only understands numbers!

This is why we need to convert categorical columns into numbers, creating what are called "dummy" (or one-hot encoded) features.

In [5]:
# Each two-category column is replaced by a 0/1 integer code:
#   gender : 'F' (female) -> 0, 'M' (male) -> 1
#   famsize: 'LE3' (less than or equal to 3) -> 0, 'GT3' (greater than 3) -> 1
#   acl    : 'Low' -> 0, 'High' -> 1
binary_encodings = {
    'gender': {'F': 0, 'M': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'acl': {'Low': 0, 'High': 1},
}
for column, encoding in binary_encodings.items():
    # Values missing from the mapping become NaN, which makes astype(int) raise.
    student[column] = student[column].map(encoding).astype(int)

In [6]:
# Plain NumPy arrays for scikit-learn: X holds one column per entry of
# `features`, y holds the 0/1 target for the same rows.
X = student.loc[:, features].values
y = student.loc[:, target].values

### What is the simplest possible model? Just predict the most common category!

In [7]:
# Share of each class in the target; the larger share is the accuracy of the
# "always predict the most common category" baseline.
student[target].value_counts(normalize=True)

0    0.744222
1    0.255778
Name: acl, dtype: float64

## Logistic Regression Model

Logistic regression is a model that uses the features to calculate the probability that the target variable belongs to the "positive class" (i.e., that the target value equals 1).

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# C is scikit-learn's inverse regularization strength (larger C = weaker penalty).
LOGREG_C = 2
student_classifier_logreg = LogisticRegression(C=LOGREG_C)

In [10]:
# Train on every available row; fit() returns the estimator, which the cell displays.
student_classifier_logreg.fit(X=X, y=y)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Model evaluation (naively done)

In [11]:
# Predict on the same rows the model was fitted on and keep the result next
# to the true labels for comparison.
logreg_predictions = student_classifier_logreg.predict(X)
student['predictions_logreg'] = logreg_predictions

In [12]:
# Rows are the predicted class, columns the actual class.
confusion_matrix = pd.crosstab(
    index=student['predictions_logreg'],
    columns=student['acl'],
)
confusion_matrix

acl,0,1
predictions_logreg,Unnamed: 1_level_1,Unnamed: 2_level_1
0,453,105
1,30,61


### Accuracy of logistic regression

In [13]:
# Accuracy = correctly classified students / all students. Correct predictions
# sit on the diagonal of the confusion matrix (predicted 0 & actual 0,
# predicted 1 & actual 1).
# `.loc` replaces `DataFrame.ix`, which was removed in pandas 1.0; the row and
# column labels are the class values 0 and 1, so label-based lookup is exact.
correct_predictions = confusion_matrix.loc[0, 0] + confusion_matrix.loc[1, 1]
ac = correct_predictions / student.shape[0]
print("Accuracy: {}".format(ac))

Accuracy: 0.7919876733436055


## Let's try out with a more complex model

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# A random forest is stochastic: without a fixed seed every run grows different
# trees and the confusion matrix / accuracy change between executions.
# Pinning random_state makes Restart & Run All reproducible.
RANDOM_SEED = 42
student_classifier_rf = RandomForestClassifier(random_state=RANDOM_SEED)

In [16]:
# Fit the forest on all rows, then predict those same rows and store the
# result alongside the true labels.
student_classifier_rf.fit(X=X, y=y)
rf_predictions = student_classifier_rf.predict(X)
student['predictions_rf'] = rf_predictions

In [17]:
# Rows are the class predicted by the random forest, columns the actual class.
confusion_matrix = pd.crosstab(
    index=student['predictions_rf'],
    columns=student['acl'],
)
confusion_matrix

acl,0,1
predictions_rf,Unnamed: 1_level_1,Unnamed: 2_level_1
0,480,9
1,3,157


In [18]:
# Accuracy = diagonal of the confusion matrix / number of students.
# `.loc` replaces `DataFrame.ix`, which was removed in pandas 1.0; the row and
# column labels are the class values 0 and 1, so label-based lookup is exact.
# NOTE: the predictions were made on the rows the forest was trained on, so
# this figure overstates how well the model would do on unseen students.
correct_predictions = confusion_matrix.loc[0, 0] + confusion_matrix.loc[1, 1]
ac = correct_predictions / student.shape[0]
print("Accuracy: {}".format(ac))

Accuracy: 0.9815100154083205


In [20]:
# One hypothetical student; values must follow the feature order:
# ['gender', 'famsize', 'age', 'studytime', 'famrel', 'goout', 'freetime', 'G3']
new_student = np.array([[0, 1, 18, 2, 1, 5, 5, 16]])
prediction = student_classifier_rf.predict(new_student)
# predict() returns one label per input row; there is exactly one row here.
predicted_high = prediction[0] == 1
print("The model predicts that the student belongs to the:")
print("HIGH Alcohol Consumption group" if predicted_high else "LOW Alcohol Consumption group")

The model predicts that the student belongs to the:
LOW Alcohol Consumption group
