In [1]:
# Standard library
import warnings

# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
# Training CSV location. NOTE(review): this is a machine-specific absolute
# path; the notebook will not load on another machine without editing it.
TRAIN_CSV_PATH = r'D:\Python\Titanic_comp\notebook\Data\train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Per-column count of missing values in the freshly loaded frame.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Fill the gaps in one step: missing Embarked values become the literal 'S',
# missing Age values become the mean of the observed ages.
df = df.assign(
    Embarked=df['Embarked'].fillna('S'),
    Age=df['Age'].fillna(df['Age'].mean()),
)

In [5]:
# Re-count missing values per column now that Age and Embarked were filled.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
# Feature frame: drop the target plus the columns that are not used as model inputs.
dropped_columns = ["Survived", "PassengerId", "Name", "Cabin", "Ticket"]
X = df.drop(columns=dropped_columns)
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [7]:
# Target vector: the Survived column of the loaded frame.
y = df.loc[:, 'Survived']
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [8]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

((891, 7), (891,))

In [9]:
# Per-column count of missing values remaining in the feature frame.
X.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [10]:
# Build a ColumnTransformer with two branches: one-hot encoding for the
# object-dtype columns and standard scaling for all remaining columns.
# (OneHotEncoder, StandardScaler and ColumnTransformer are imported in the
# notebook's top import cell.)
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

onehot_transformer = OneHotEncoder()
stdscaler_transformer = StandardScaler()

# The branch order determines the output column order: encoded categorical
# columns come first, followed by the scaled numeric columns.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", onehot_transformer, cat_features),
        ("StandardScaler", stdscaler_transformer, num_features),
    ]
)

In [11]:
# Show which columns were routed to each preprocessing branch
# (numeric first, then categorical).
for feature_index in (num_features, cat_features):
    print(feature_index)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Index(['Sex', 'Embarked'], dtype='object')


In [12]:
# Fit the preprocessor on the whole feature frame and replace X with the
# transformed matrix.
# NOTE(review): X is rebound here, so re-running this cell on its own feeds
# the already-transformed matrix back into the preprocessor; re-run the cell
# that builds X from df first.
# NOTE(review): the encoder/scaler are fitted before the train/test split in
# the next cell, so held-out rows contribute to the fitted statistics --
# consider fitting on the training split only.
X = preprocessor.fit_transform(X)
X.shape

(891, 10)

In [13]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((712, 10), (179, 10), (712,), (179,))

In [19]:
# Peek at the first preprocessed row (one-hot columns, then scaled numerics).
X[0, :]

array([ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.82737724, -0.5924806 ,  0.43279337, -0.47367361, -0.50244517])

In [18]:
def evaluate_model(true, predicted):
    """Return the accuracy of ``predicted`` measured against ``true``.

    Thin wrapper around ``accuracy_score``; both arguments are passed
    through unchanged and its result is returned as-is.
    """
    return accuracy_score(true, predicted)

In [21]:
# Candidate classifiers, keyed by display name, all with default
# hyperparameters.
# NOTE(review): no random_state is passed to any estimator, so the scores of
# the stochastic models may differ between runs -- confirm whether seeding
# is wanted.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

# Parallel lists consumed by the leaderboard cell: model name and its
# test-set accuracy, in training order.
model_list = []
score_list = []

# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate Train and Test dataset
    model_train_accuracy = evaluate_model(y_train, y_train_pred)
    model_test_accuracy = evaluate_model(y_test, y_test_pred)

    model_list.append(model_name)
    score_list.append(model_test_accuracy)

    print(model_name)

    print('Model performance for Training set')
    print("- Accuracy Score: {:.2f}".format(model_train_accuracy))

    print('----------------------------------')

    print('Model performance for Test set')
    # Was "{:2f}" (field width 2, default 6-digit precision); use the same
    # two-decimal format as the training score so the two lines are comparable.
    print("- Accuracy Score: {:.2f}".format(model_test_accuracy))
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy Score: 0.80
----------------------------------
Model performance for Test set
- Accuracy Score: 0.798883


K-Neighbors Classifier
Model performance for Training set
- Accuracy Score: 0.87
----------------------------------
Model performance for Test set
- Accuracy Score: 0.810056


Decision Tree
Model performance for Training set
- Accuracy Score: 0.98
----------------------------------
Model performance for Test set
- Accuracy Score: 0.782123


Random Forest Classifier
Model performance for Training set
- Accuracy Score: 0.98
----------------------------------
Model performance for Test set
- Accuracy Score: 0.826816


XGBClassifier
Model performance for Training set
- Accuracy Score: 0.97
----------------------------------
Model performance for Test set
- Accuracy Score: 0.849162


CatBoosting Classifier
Model performance for Training set
- Accuracy Score: 0.90
----------------------------------
Model performance for T

In [22]:
# Leaderboard: one row per model, highest test accuracy first.
(
    pd.DataFrame({'Model_Name': model_list, 'accuracy_score': score_list})
    .sort_values(by='accuracy_score', ascending=False)
)

Unnamed: 0,Model_Name,accuracy_score
4,XGBClassifier,0.849162
5,CatBoosting Classifier,0.837989
3,Random Forest Classifier,0.826816
6,AdaBoost Classifier,0.821229
1,K-Neighbors Classifier,0.810056
0,Logistic Regression,0.798883
2,Decision Tree,0.782123


In [23]:
# Refit a default XGBClassifier on the training split and report its
# held-out accuracy as a percentage.
xgb_model = XGBClassifier().fit(x_train, y_train)
y_pred = xgb_model.predict(x_test)
score = 100 * accuracy_score(y_test, y_pred)
print(" Accuracy of the model is %.2f" % score)

 Accuracy of the model is 84.92


In [24]:
# Side-by-side view of true vs. predicted labels; Difference is non-zero
# exactly where the prediction is wrong.
pred_df = pd.DataFrame(
    {'Actual Value': y_test, 'Predicted Value': y_pred}
).assign(
    Difference=lambda frame: frame['Actual Value'] - frame['Predicted Value']
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
495,0,0,0
648,0,0,0
278,0,0,0
31,1,1,0
255,1,0,1
...,...,...,...
780,1,1,0
837,0,0,0
215,1,1,0
833,0,0,0
