In [None]:
# Download the data archive for the "titanic" competition via the Kaggle CLI.
# NOTE(review): presumably requires the kaggle CLI to be installed and authenticated — confirm.
!kaggle competitions download -c titanic

In [None]:
!unzip titanic.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB  
from sklearn.metrics import confusion_matrix, accuracy_score

import warnings
# Suppresses every warning for the rest of the session, including deprecation
# notices raised by pandas / seaborn / scikit-learn calls further down.
warnings.filterwarnings('ignore')

In [None]:
# Read the training and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Print column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
train_df.describe(include='all')

In [None]:
# Percentage of missing values per column in the training frame.
train_df.isna().sum().div(len(train_df)).mul(100)

In [None]:
# Number of missing Age values in the training frame.
train_df['Age'].isna().sum()

In [None]:
# Row count per value of the Survived target.
train_df['Survived'].value_counts()

In [None]:
# Same counts expressed as a percentage of all training rows.
train_df['Survived'].value_counts().div(len(train_df)).mul(100)

In [None]:
# List the column labels of the training frame.
train_df.columns

In [None]:
# Row count per passenger class.
train_df['Pclass'].value_counts()

In [None]:
sns.barplot(data=train_df, x='Pclass', y='Survived')

# Share of rows labelled 1 in Survived within each passenger class, as a percentage.
Pclass1 = train_df.loc[train_df['Pclass'] == 1, 'Survived'].value_counts(normalize=True)[1] * 100
Pclass2 = train_df.loc[train_df['Pclass'] == 2, 'Survived'].value_counts(normalize=True)[1] * 100
Pclass3 = train_df.loc[train_df['Pclass'] == 3, 'Survived'].value_counts(normalize=True)[1] * 100

print(f"Percentage of Pclass 1 who survived:{Pclass1}")
print(f"Percentage of Pclass 2 who survived:{Pclass2}")
print(f"Percentage of Pclass 3 who survived:{Pclass3}")

In [None]:
# Row count per value of Sex.
train_df['Sex'].value_counts()

In [None]:
sns.barplot(data=train_df, x='Sex', y='Survived')

# Share of rows labelled 1 in Survived for each sex, as a percentage.
female = train_df.loc[train_df['Sex'] == 'female', 'Survived'].value_counts(normalize=True)[1] * 100
male = train_df.loc[train_df['Sex'] == 'male', 'Survived'].value_counts(normalize=True)[1] * 100

print(f"Percentage of females who survived:{female}")
print(f"Percentage of males who survived:{male}")

In [None]:
# Mean of Survived per sex, split by passenger class.
sns.barplot(data=train_df, x='Sex', y='Survived', hue='Pclass')

In [None]:
# Row count per distinct Age value.
train_df['Age'].value_counts()

In [None]:
# Horizontal bar chart of the five most frequent Age values.
train_df['Age'].value_counts().nlargest(5).plot(kind='barh')

In [None]:
# Histogram of Age using 20 bins.
sns.histplot(data=train_df, x='Age', bins=20)

In [None]:
# Row count per SibSp value.
train_df['SibSp'].value_counts()

In [None]:
# Mean of Survived for each SibSp value.
sns.barplot(data=train_df, x='SibSp', y='Survived')

In [None]:
# Row count per Parch value.
train_df['Parch'].value_counts()

In [None]:
# Mean of Survived for each Parch value.
sns.barplot(data=train_df, x='Parch', y='Survived')

In [None]:
# Distribution of Fare: density-normalised histogram with a KDE overlay.
# histplot replaces the deprecated sns.distplot; stat="density" keeps the
# y-axis on the density scale that distplot used when a KDE is drawn.
sns.histplot(train_df['Fare'], bins=20, kde=True, stat="density")

In [None]:
# Row count per Embarked value.
train_df['Embarked'].value_counts()

In [None]:
sns.barplot(data=train_df, x='Embarked', y='Survived')

# Share of rows labelled 1 in Survived for each Embarked value, as a percentage.
embS = train_df.loc[train_df['Embarked'] == 'S', 'Survived'].value_counts(normalize=True)[1] * 100
embC = train_df.loc[train_df['Embarked'] == 'C', 'Survived'].value_counts(normalize=True)[1] * 100
embQ = train_df.loc[train_df['Embarked'] == 'Q', 'Survived'].value_counts(normalize=True)[1] * 100

print(f"Percentage of Embarked S who survived:{embS}")
print(f"Percentage of Embarked C who survived:{embC}")
print(f"Percentage of Embarked Q who survived:{embQ}")

In [None]:
# Missing Embarked values before imputation.
train_df['Embarked'].isna().sum()

In [None]:
# Replace missing Embarked entries with 'S'; all other columns are left untouched.
train_df = train_df.assign(Embarked=train_df['Embarked'].fillna('S'))

In [None]:
# Missing Embarked values after imputation.
train_df['Embarked'].isna().sum()

In [None]:
# Missing Age values before imputation.
train_df['Age'].isna().sum()

In [None]:
# Replace missing Age entries with the mean of the observed ages.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

In [None]:
# Missing Age values after imputation.
train_df['Age'].isna().sum()

In [None]:
# Remove the Name, Ticket and Cabin columns from both frames.
train_df = train_df.drop(columns=['Name', 'Ticket', 'Cabin'])
test_df = test_df.drop(columns=['Name', 'Ticket', 'Cabin'])

In [None]:
# Preview the first rows of the training frame after the column drops.
train_df.head()

In [None]:
# Preview the first rows of the test frame after the column drops.
test_df.head()

In [None]:
# One-hot encode each frame independently, dropping the first level of every encoded column.
train_df, test_df = (
    pd.get_dummies(frame, drop_first=True) for frame in (train_df, test_df)
)

In [None]:
# Preview the training frame after one-hot encoding.
train_df.head()

In [None]:
# Preview the test frame after one-hot encoding.
test_df.head()

In [None]:
# Target column, and the feature matrix without the target and the PassengerId column.
y = train_df['Survived']
X = train_df.drop(columns=['Survived', 'PassengerId'])

In [None]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Nine independent accumulators, one entry appended per evaluated model.
(
    lst_model,
    lst_accuracy,
    lst_accuracy_train,
    lst_accuracy_test,
    lst_cv_score,
    lst_TP,
    lst_TN,
    lst_FP,
    lst_FN,
) = ([] for _ in range(9))

def applyMLmodel(model):
    """Fit ``model`` on the training split and record its evaluation metrics.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and appends one entry to each of the ``lst_*`` result lists (all except
    ``lst_model``, which the caller fills in).

    Recorded values:
      * ``lst_accuracy``       -- hold-out accuracy as a percentage (0-100)
      * ``lst_cv_score``       -- mean score of ``cross_val_score`` with ``cv=10``
                                  on the training split
      * ``lst_accuracy_train`` -- accuracy on the training split (fraction)
      * ``lst_accuracy_test``  -- accuracy on the hold-out split (fraction)
      * ``lst_TN/FP/FN/TP``    -- cells of the hold-out confusion matrix

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.

    Returns
    -------
    None. Results are printed and appended to the notebook-level lists.
    """
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)*100
    lst_accuracy.append(accuracy)
    print("Accuracy:", accuracy)
    
    # np.ravel works for both pandas Series and ndarrays; Series.ravel() is
    # deprecated in recent pandas releases.
    cv = cross_val_score(estimator=model, X=X_train, y=np.ravel(y_train), cv=10)
    lst_cv_score.append(cv.mean())
    print("CV Score:", cv.mean())
    
    y_pred_train = model.predict(X_train)
    accuracy_train = accuracy_score(y_train, y_pred_train)
    lst_accuracy_train.append(accuracy_train)
    print("Accuracy(Training):", accuracy_train)
    
    y_pred_test = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_pred_test)
    lst_accuracy_test.append(accuracy_test)
    print("Accuracy(Test):", accuracy_test)
    
    cm = confusion_matrix(y_test, y_pred_test)
    print("Confusion Matrix:", "\n", cm)
    
    # Rows of cm are true labels, columns are predicted labels.
    lst_TN.append(cm[0, 0])
    lst_FP.append(cm[0, 1])
    lst_FN.append(cm[1, 0])
    lst_TP.append(cm[1, 1])

In [None]:
# Evaluate a LogisticRegression with default settings and record its label
# so that lst_model stays aligned with the metric lists.
model = LogisticRegression()
applyMLmodel(model)
lst_model.append("LogisticRegression")

In [None]:
# Evaluate a decision tree. random_state is fixed so the recorded metrics are
# reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state=42)
applyMLmodel(model)
lst_model.append("DecisionTreeClassifier")

In [None]:
# Evaluate a random forest. random_state is fixed so the recorded metrics are
# reproducible across kernel restarts.
modelR = RandomForestClassifier(random_state=42)
applyMLmodel(modelR)
lst_model.append("RandomForestClassifier")

In [None]:
# Evaluate a KNeighborsClassifier with default settings and record its label.
model = KNeighborsClassifier()
applyMLmodel(model)
lst_model.append("KNeighborsClassifier")

In [None]:
# Evaluate a GaussianNB classifier with default settings and record its label.
model = GaussianNB()
applyMLmodel(model)
lst_model.append("GaussianNB")

In [None]:
# Collect the per-model results into one comparison table (one row per model).
predictiondf = pd.DataFrame({
    column: np.asarray(values)
    for column, values in {
        "Model": lst_model,
        "Accuracy": lst_accuracy,
        "Accuracy(Training)": lst_accuracy_train,
        "Accuracy(Test)": lst_accuracy_test,
        "CV Score": lst_cv_score,
        "True Positive": lst_TP,
        "True Negative": lst_TN,
        "False Positive": lst_FP,
        "False Negative": lst_FN,
    }.items()
})
predictiondf

In [None]:
# Print column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# Missing values per column in the test frame.
test_df.isna().sum()

In [None]:
# Fill null values similar to what we did in training dataset
test_df.Age = test_df.Age.fillna(value=test_df.Age.mean())
# Fill Fare with its most frequent value. Series.mode() returns a Series, and
# fillna aligns a Series argument on the index, so passing it directly would
# only fill a gap located at index 0; take the scalar mode instead.
test_df.Fare = test_df.Fare.fillna(value=test_df.Fare.mode().iloc[0])

In [None]:
features = ["Pclass", "Sex_male", "SibSp", "Parch"]

# Target plus the reduced feature matrices for the training data and the
# competition test data. Note that X, y, X_test and model are rebound here.
y = train_df["Survived"]
X = pd.get_dummies(train_df[features])
X_test = pd.get_dummies(test_df[features])

# Fit the submission model on all training rows and predict the test rows.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1).fit(X, y)
predictions = model.predict(X_test)

In [None]:
# Pair each test PassengerId with its prediction and write the submission file.
output = test_df[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# Upload submission.csv to the "titanic" competition via the Kaggle CLI.
# The -m text is the submission description; "Message" is a placeholder value.
!kaggle competitions submit -c titanic -f submission.csv -m "Message"