In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('Titanic_Survivors_test_data.csv')

In [3]:
data[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Checking for the outliers.

def find_outliers_IQR(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

# identify outliers in the Age column
outliers_B = find_outliers_IQR(data['Age'])
print(outliers_B)

33     66.0
54     65.0
96     71.0
116    70.5
280    65.0
456    65.0
493    71.0
630    80.0
672    70.0
745    70.0
851    74.0
Name: Age, dtype: float64


In [6]:
# Filling the null values of Age and Fare with the median values.

data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)

In [7]:
# Finding the appropriate Cabin values using Random Forests to fill the null values of the Cabin.

features = ['Pclass','Age','Fare']

data_missing = data[data['Cabin'].isnull()]
data_not_missing = data[~data['Cabin'].isnull()]

In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(data_not_missing[features], data_not_missing['Cabin'])

In [9]:
# Predicting the missing values of the Cabin.

data_missing['Cabin'] = model.predict(data_missing[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_missing['Cabin'] = model.predict(data_missing[features])


In [10]:
# Concating the Original and Predicted Cabin values for Null values in the datasets.

data = pd.concat([data_missing, data_not_missing], ignore_index=True)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [12]:
data[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,F G73,S
1,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,F G73,S
2,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,E10,S


In [13]:
# Removing the Features Which are not Usefull from the datasets

useless_features = ['PassengerId','Name','Ticket','Cabin']
data.drop(useless_features, axis=1, inplace=True)

In [14]:
data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [15]:
data.Pclass.unique()

array([3, 2, 1], dtype=int64)

In [16]:
# One_Hot_Encoding is done for categorical Data and added to the original datasets

one_hot_encoded_data = pd.get_dummies(data, columns = ['Pclass','Sex','Embarked'])
data.drop(['Pclass','Sex','Embarked'], axis=1, inplace = True)

# concatinating the columns and removing the duplicated features
data = pd.concat([data, one_hot_encoded_data], axis = 1)
data = data.loc[:, ~data.columns.duplicated()]

In [17]:
X = data.drop('Survived', axis=1)
y = data['Survived']

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 29)

# Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7932960893854749


# Random Forests

In [20]:

from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

random_forest_accuracy = accuracy_score(y_test, y_pred)

print(random_forest_accuracy)

0.8212290502793296


# Grid Search CV

In [21]:
from sklearn.model_selection import GridSearchCV

reg = RandomForestClassifier()

params = {
    'n_estimators' : [100,150,200],
    'min_samples_split' : [5,8,10],
    'max_leaf_nodes' : [None,2,4]
}

grid_search = GridSearchCV(reg, param_grid = params, cv = 5)

grid_search.fit(X_train, y_train)

In [22]:
grid_search.best_params_

{'max_leaf_nodes': None, 'min_samples_split': 10, 'n_estimators': 200}

In [23]:
grid_search.best_estimator_.score(X_test, y_test)

0.8324022346368715