In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

## Introduction to Machine Learning

### Live demos

In [7]:
passenger_data = pd.read_csv("data/train.csv")

In [8]:
passenger_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [9]:
passenger_attributes = passenger_data.drop(columns = ["Transported"])
passenger_labels = passenger_data.Transported

In [10]:
passenger_attributes = passenger_attributes.dropna()

In [11]:
passenger_attributes = passenger_attributes.drop(columns = ["PassengerId", "Name", "Cabin"])

In [12]:
passenger_attributes

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0


In [13]:
pd.get_dummies(passenger_attributes["HomePlanet"], drop_first = True)

Unnamed: 0,Europa,Mars
0,1,0
1,0,0
2,1,0
3,1,0
4,0,0
...,...,...
8688,1,0
8689,0,0
8690,0,0
8691,1,0


In [14]:
passenger_attributes = pd.get_dummies(passenger_attributes, drop_first = True)

In [15]:
passenger_attributes = passenger_attributes.dropna()

In [16]:
passenger_labels = passenger_labels[passenger_attributes.index]

In [18]:
len(passenger_attributes), len(passenger_labels)

(6606, 6606)

In [19]:
model = LogisticRegression()

In [20]:
model.fit(passenger_attributes, passenger_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
model.score(passenger_attributes, passenger_labels)

0.7907962458371177

In [22]:
model.predict(passenger_attributes)

array([ True, False, False, ...,  True, False,  True])

In [23]:
print(classification_report(passenger_labels, model.predict(passenger_attributes)))

              precision    recall  f1-score   support

       False       0.79      0.78      0.79      3279
        True       0.79      0.80      0.79      3327

    accuracy                           0.79      6606
   macro avg       0.79      0.79      0.79      6606
weighted avg       0.79      0.79      0.79      6606



In [24]:
model.coef_

array([[-6.54828584e-03, -1.26700116e-03,  5.90966229e-04,
         6.72194683e-04, -1.77563346e-03, -1.67401736e-03,
         1.48136637e+00,  4.15939031e-01,  1.48876462e+00,
        -1.88728589e-01, -5.98022993e-01, -5.39964271e-03]])

In [25]:
list(zip(passenger_attributes.columns.tolist(), model.coef_[0,:].tolist())) 

[('Age', -0.006548285836551736),
 ('RoomService', -0.001267001162188013),
 ('FoodCourt', 0.0005909662291310543),
 ('ShoppingMall', 0.0006721946827330478),
 ('Spa', -0.001775633462676853),
 ('VRDeck', -0.0016740173554244421),
 ('HomePlanet_Europa', 1.4813663746712173),
 ('HomePlanet_Mars', 0.4159390305677746),
 ('CryoSleep_True', 1.488764620071964),
 ('Destination_PSO J318.5-22', -0.18872858897550096),
 ('Destination_TRAPPIST-1e', -0.5980229932536839),
 ('VIP_True', -0.005399642711852134)]

# Decision trees

In [26]:
tree = DecisionTreeClassifier()

In [27]:
tree.fit(passenger_attributes, passenger_labels)

In [28]:
tree.score(passenger_attributes, passenger_labels)

0.9335452618831366