In [1]:
#By: Amr Ezzat 
!pip install tpot
!pip install xgboost



In [2]:
import tpot
#Had a previous error, had to import XGboost model 
import xgboost 
from tpot import  TPOTClassifier

#importing ML Packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from __future__ import print_function

#Load EDA packages
import pandas as pd
import numpy as np



In [3]:
path = 'https://raw.githubusercontent.com/PaoloMissier/DataScience-class-demos/master/DATA/'
td = pd.read_csv(path + 'titanic.csv', header=0)
pred_feat = 'Class'
seed = 10

In [4]:
td.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [5]:
print ("Rows     : " ,td.shape[0])
print ("Columns  : " ,td.shape[1])

Rows     :  887
Columns  :  8


In [6]:
#Check the data type of variables (Only Numerical values are accepted by TPOT)
td.dtypes

Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [7]:
#Importing the packages we need to convert into numerical values
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder

In [8]:
#We will not change name, we will just get remove it later on 
categorical_columns = ['Sex']
column_trans = make_column_transformer((OrdinalEncoder(), categorical_columns))

titanic_transformed=column_trans.fit_transform(td)

In [9]:
#Copying new data back into td 
titanic_trans = td.copy()
titanic_trans = pd.DataFrame(titanic_transformed, columns=categorical_columns)
td.update(titanic_trans)

In [None]:
#Checking if the conversion has been successful
td

In [11]:
#Sex has been converted successfully but it is still seen as an object 
td.dtypes

Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [12]:
#Making sure Sex data type is changed 
td.iloc[:, 3] = pd.to_numeric(td.iloc[:, 3], errors='coerce') 

In [13]:
#Sex data type changed successfully. 
td.dtypes

Survived                     int64
Pclass                       int64
Name                        object
Sex                        float64
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [14]:
#Checking for missing data
td.isnull().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [15]:
td.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')

In [16]:
# Methods and Attributes 
dir(tpot)

['TPOTClassifier',
 'TPOTRegressor',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_version',
 'base',
 'builtins',
 'config',
 'decorators',
 'driver',
 'export_utils',
 'gp_deap',
 'gp_types',
 'main',
 'metrics',
 'operator_utils',
 'tpot']

In [17]:
#Arrange data (Remove 'Name' and 'Survived', set 'Survived' as the value we want to predict)
td_X = td.drop("Survived", axis=1)
td_X = td_X.drop("Name", axis=1)
td_y = td['Survived']

In [18]:
#Split in train and test
X_train, X_test, y_train, y_test = train_test_split(td_X, td_y, train_size=0.8, test_size=0.2)

In [19]:
#Checking the split was done properly
X_train

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
128,3,1.0,45.0,0,0,6.9750
869,3,1.0,47.0,0,0,9.0000
455,2,0.0,50.0,0,0,10.5000
685,3,1.0,18.0,0,0,7.7958
178,3,1.0,36.0,0,0,0.0000
...,...,...,...,...,...,...
15,2,0.0,55.0,0,0,16.0000
456,3,1.0,17.0,0,0,7.7500
640,3,1.0,32.0,0,0,56.4958
323,1,0.0,36.0,0,0,135.6333


In [None]:
### AUTOML TPOT INIT/Fit data/Score
tpot =  TPOTClassifier(generations=10, population_size=30, verbosity=2, random_state = 45)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=330.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8152432324443112

Generation 2 - Current best internal CV score: 0.8152432324443112


In [None]:
print(tpot.score(X_test, y_test))

In [None]:
tpot.fitted_pipeline_

In [None]:
tpot.fit(X_train, y_train)

In [None]:
exctracted_best_model = tpot.fitted_pipeline_.steps[-1][1]

In [None]:
exctracted_best_model.fit(X_train, y_train) 

In [None]:
exctracted_best_model.feature_importances_

In [None]:
import matplotlib.pyplot as plt
import matplotlib

In [None]:
positions= range(exctracted_best_model.feature_importances_.shape[0])
p1 = plt.bar(positions, exctracted_best_model.feature_importances_)
plt.show()

In [None]:
td.columns

In [None]:
tpot.export('tpot_titanic_pipeline.py')

In [None]:
!cat tpot_titanic_pipeline.py