In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [208]:
#Load the Dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [209]:
#Handle missing values
print(train.isnull().sum())
train['Age'] = train['Age'].fillna(train['Age'].mean())
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
train['Cabin'] = train['Cabin'].fillna(train['Cabin'].mode()[0])
print(train.isnull().sum())

print(test.isnull().sum())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test['Cabin'] = test['Cabin'].fillna(test['Cabin'].mode()[0])
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [210]:
#datatypes of the columns
print(train.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [211]:
#describe the dataset
print("Name\n", train['Name'].describe(), end='\n\n')
print('Sex\n', train['Sex'].describe(), end='\n\n')
print('Ticket\n', train['Ticket'].describe(), end='\n\n')
print('Cabin\n', train['Cabin'].describe(), end='\n\n')
print('Embarked\n', train['Embarked'].describe(), end='\n\n')

Name
 count                         891
unique                        891
top       Braund, Mr. Owen Harris
freq                            1
Name: Name, dtype: object

Sex
 count      891
unique       2
top       male
freq       577
Name: Sex, dtype: object

Ticket
 count        891
unique       681
top       347082
freq           7
Name: Ticket, dtype: object

Cabin
 count         891
unique        147
top       B96 B98
freq          691
Name: Cabin, dtype: object

Embarked
 count     891
unique      3
top         S
freq      646
Name: Embarked, dtype: object



In [212]:
#turning categorical data into numerical data
cols = ['Sex', 'Ticket', 'Cabin', 'Embarked']
train[cols] = train[cols].apply(LabelEncoder().fit_transform)
print(train.dtypes)

test[cols] = test[cols].apply(LabelEncoder().fit_transform)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Ticket           int32
Fare           float64
Cabin            int32
Embarked         int32
dtype: object


In [213]:
#dropping the columns
train = train.drop(['Name'], axis=1)

In [214]:
#determining the correlation
print("Pclass\n", train['Pclass'].corr(train['Survived']), end='\n\n')
print("Sex\n", train['Sex'].corr(train['Survived']), end='\n\n')
print("Age\n", train['Age'].corr(train['Survived']), end='\n\n')
print("SibSp\n", train['SibSp'].corr(train['Survived']), end='\n\n')
print("Parch\n", train['Parch'].corr(train['Survived']), end='\n\n')
print("Ticket\n", train['Ticket'].corr(train['Survived']), end='\n\n')
print("Fare\n", train['Fare'].corr(train['Survived']), end='\n\n')
print("Cabin\n", train['Cabin'].corr(train['Survived']), end='\n\n')
print("Embarked\n", train['Embarked'].corr(train['Survived']), end='\n\n')

Pclass
 -0.3384810359610148

Sex
 -0.5433513806577546

Age
 -0.06980851528714307

SibSp
 -0.03532249888573557

Parch
 0.08162940708348361

Ticket
 -0.16454913400236718

Fare
 0.2573065223849622

Cabin
 0.17569045103556669

Embarked
 -0.16767531386772114



In [215]:
# #splitting the dataset
# y = train['Survived']
# cols = list(train.columns)
# cols.remove('Survived')
# cols.remove('PassengerId')
# print(cols)
# accuracy_scores = []
# #applying the model
# model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# for i in range(1<<len(cols)):
#     cols_to_drop = []
#     for j in range(len(cols)):
#         if i & (1<<j):
#             cols_to_drop.append(cols[j])
#     X = train.drop(['Survived'], axis=1).drop(cols_to_drop, axis=1)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy_scores.append((accuracy_score(y_test, y_pred), cols_to_drop))
# accuracy_scores.sort(reverse=True)
# print(accuracy_scores[0])

In [216]:
#applying the model
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
cols_to_drop = ['Fare']
X_train = train.drop(['Survived']+cols_to_drop, axis=1)
y_train = train['Survived']
model.fit(X_train, y_train)
X_pred = test.drop(['Name']+cols_to_drop, axis=1)
y_pred = model.predict(X_pred)

In [217]:
#accuracy
# print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [218]:
#printing the output
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_pred})
output.to_csv('output.csv', index=False)