In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
data= pd.read_csv('titanic.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
data = data.drop(['Cabin'], axis=1)

In [6]:
mode_embarked = data['Embarked'].mode()
data['Embarked'] = data['Embarked'].fillna(mode_embarked[0])

In [7]:
# writing function to impute age

def age_impute(cols):
    Age= cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass==1:
            return 38.23
        elif Pclass ==2:
            return 29.87
        else:
            return 25.14
        
    else:
        return Age

In [8]:
data['Age'] = data[['Age','Pclass']].apply(age_impute, axis=1)

In [9]:
data = data.drop(['Name','Ticket'], axis=1)

In [10]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [11]:
data['Sex']=pd.get_dummies(data.Sex)['female']

In [12]:
data = pd.get_dummies(data,columns=['Embarked'])
data = data.drop(['Embarked_S'], axis=1)

In [13]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q
0,1,0,3,0,22.00,1,0,7.2500,0,0
1,2,1,1,1,38.00,1,0,71.2833,1,0
2,3,1,3,1,26.00,0,0,7.9250,0,0
3,4,1,1,1,35.00,1,0,53.1000,0,0
4,5,0,3,0,35.00,0,0,8.0500,0,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,0,27.00,0,0,13.0000,0,0
887,888,1,1,1,19.00,0,0,30.0000,0,0
888,889,0,3,1,25.14,1,2,23.4500,0,0
889,890,1,1,0,26.00,0,0,30.0000,1,0


In [14]:
y=data.pop('Survived')
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [15]:
X= data
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q
0,1,3,0,22.00,1,0,7.2500,0,0
1,2,1,1,38.00,1,0,71.2833,1,0
2,3,3,1,26.00,0,0,7.9250,0,0
3,4,1,1,35.00,1,0,53.1000,0,0
4,5,3,0,35.00,0,0,8.0500,0,0
...,...,...,...,...,...,...,...,...,...
886,887,2,0,27.00,0,0,13.0000,0,0
887,888,1,1,19.00,0,0,30.0000,0,0
888,889,3,1,25.14,1,2,23.4500,0,0
889,890,1,0,26.00,0,0,30.0000,1,0


# Accuracy score function

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y ,random_state=10)

from sklearn.metrics import accuracy_score, confusion_matrix

def print_score(clf,X_train, X_test, y_train, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        print("Train result is: \n =============")
        print(f"Accuracy score is: {accuracy_score(y_train,pred)}\n")
        print(f"Confusion Matrix is: {confusion_matrix(y_train,pred)}\n")
        
    elif train == False:
        pred = clf.predict(X_test)
        print("Test result is: \n =============")
        print(f"Accuracy score is: {accuracy_score(y_test,pred)}\n")
        print(f"Confusion Matrix is: {confusion_matrix(y_test,pred)}\n")

# Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(random_state=10)
log.fit(X_train,y_train)

print_score(log, X_train, X_test, y_train, y_test, train = True)
print_score(log, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 0.8053892215568862

Confusion Matrix is: [[352  50]
 [ 80 186]]

Test result is: 
Accuracy score is: 0.8116591928251121

Confusion Matrix is: [[133  14]
 [ 28  48]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Decision Tree

In [18]:
from sklearn.tree import DecisionTreeClassifier
tree= DecisionTreeClassifier(random_state = 42)
tree.fit(X_train,y_train)

print_score(tree, X_train, X_test, y_train, y_test, train = True)
print_score(tree, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 1.0

Confusion Matrix is: [[402   0]
 [  0 266]]

Test result is: 
Accuracy score is: 0.7713004484304933

Confusion Matrix is: [[120  27]
 [ 24  52]]



# Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

random_F_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
random_F_classifier.fit(X_train,y_train)

print_score(random_F_classifier, X_train, X_test, y_train, y_test, train = True)
print_score(random_F_classifier, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 1.0

Confusion Matrix is: [[402   0]
 [  0 266]]

Test result is: 
Accuracy score is: 0.8565022421524664

Confusion Matrix is: [[136  11]
 [ 21  55]]



# Bagging Classifier

In [20]:
from sklearn.ensemble import BaggingClassifier

bgc=BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features=1.0, n_estimators=20, random_state=123)
bgc.fit(X_train,y_train)

print_score(bgc, X_train, X_test, y_train, y_test, train = True)
print_score(bgc, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 0.9311377245508982

Confusion Matrix is: [[387  15]
 [ 31 235]]

Test result is: 
Accuracy score is: 0.8385650224215246

Confusion Matrix is: [[134  13]
 [ 23  53]]



# Gradient Boosting Classifier

In [21]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

gboost_model = GradientBoostingClassifier()
gboost_model.fit(X_train,y_train)

print_score(gboost_model, X_train, X_test, y_train, y_test, train = True)
print_score(gboost_model, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 0.8982035928143712

Confusion Matrix is: [[388  14]
 [ 54 212]]

Test result is: 
Accuracy score is: 0.8295964125560538

Confusion Matrix is: [[134  13]
 [ 25  51]]



# Adaptive Boosting Classifier(`Adaboost`)

In [22]:
ada_model = AdaBoostClassifier()
ada_model.fit(X_train,y_train)


print_score(ada_model, X_train, X_test, y_train, y_test, train = True)
print_score(ada_model, X_train, X_test, y_train, y_test, train = False)

Train result is: 
Accuracy score is: 0.8308383233532934

Confusion Matrix is: [[351  51]
 [ 62 204]]

Test result is: 
Accuracy score is: 0.8475336322869955

Confusion Matrix is: [[131  16]
 [ 18  58]]

