In [None]:
#Data Analysis and Visualisation
import numpy as np 
import pandas as pd 
import seaborn as sns
import sklearn
import matplotlib as mlt 
import matplotlib.pyplot as plt
import os
import missingno

#Data Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler

#Machine Learning 
import catboost
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

#ignoring the warnings
import warnings
warnings.filterwarnings('ignore')

import math, time, random, datetime

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the training set, the test set and the sample submission file.
train_data, test_data, sample_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    )
)

In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train_data.info()

In [None]:
# Fill missing embarkation ports with the most frequent port. The filled
# column is assigned back to the frame instead of using chained indexing
# (`train_data.Embarked[mask] = ...`), which is not guaranteed to write
# through to `train_data` (and never does under pandas Copy-on-Write).
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode().iloc[0])
# Show any rows still missing a port (an empty Series is the expected result).
train_data.loc[train_data['Embarked'].isnull(), 'Embarked']

In [None]:
# Stack the training features (target removed) on top of the test set so both
# go through identical feature engineering. `DataFrame.append` was removed in
# pandas 2.0, so `pd.concat` is used; `ignore_index=True` produces the fresh
# 0..n-1 index that the separate reset_index call used to create.
train = train_data.drop(columns=['Survived'])
combined = pd.concat([train, test_data], ignore_index=True)

In [None]:
# Mark missing cabins as 'U' (unknown). Assign the result back: an in-place
# fill on the attribute-accessed column may act on a temporary object and
# leave `combined` unchanged (this is always the case under Copy-on-Write).
combined['Cabin'] = combined['Cabin'].fillna('U')

In [None]:
# Collect the distinct deck letters (first character of each cabin value).
cabin = set()

for c in combined['Cabin']:
    try:
        cabin.add(c[0])
    except (TypeError, IndexError):
        # Non-subscriptable values (e.g. NaN) or empty strings count as unknown.
        # Only these two errors are expected; anything else should surface.
        cabin.add('U')
print(cabin)

In [None]:
# Reduce every cabin value to its first character (the deck letter).
combined['Cabin'] = [cabin_code[0] for cabin_code in combined['Cabin']]

In [None]:
# Replace missing fares with the mean fare. The result is assigned back
# because an in-place fill on the attribute-accessed column is not guaranteed
# to update `combined` (and does not under pandas Copy-on-Write).
combined['Fare'] = combined['Fare'].fillna(combined['Fare'].mean())

In [None]:
# Extract the title from each name: the text between the first comma and the
# following period, with surrounding whitespace stripped.
title = [
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in combined.Name
]
combined['Title'] = title

In [None]:
# Fold the title 'Dona' into 'Mrs'.
# The previous call passed ['Dona', 'Mrs'] as `to_replace` with no `value`;
# older pandas interprets that as "forward-fill both titles from the previous
# row" and newer pandas rejects it. It was also an in-place call on an
# attribute-accessed column, which is not guaranteed to write back.
# NOTE(review): assumes the intent was Dona -> Mrs — confirm.
combined['Title'] = combined['Title'].replace('Dona', 'Mrs')

In [None]:
# Median age for every (Sex, Pclass, Title) group, used to impute missing ages.
grouped_train = combined.groupby(['Sex', 'Pclass', 'Title'])
# Select 'Age' before aggregating: calling median() on the whole group object
# also tries the string columns (Name, Ticket, ...) and raises on pandas >= 2.0.
grouped_median_train = grouped_train['Age'].median().reset_index()
# Groups with no known age get the mean of the group medians. Assign back
# rather than filling in place on a column selection.
grouped_median_train['Age'] = grouped_median_train['Age'].fillna(grouped_median_train['Age'].mean())
grouped_median_train.head()

In [None]:
def fill_age(row):
    """Return the group median age matching the row's Sex, Title and Pclass.

    Looks the row up in the module-level ``grouped_median_train`` table and
    returns the ``Age`` of the first matching entry.
    """
    same_sex = grouped_median_train['Sex'] == row['Sex']
    same_title = grouped_median_train['Title'] == row['Title']
    same_class = grouped_median_train['Pclass'] == row['Pclass']
    matching_ages = grouped_median_train[same_sex & same_title & same_class]['Age']
    return matching_ages.values[0]


def process_age():
    """Fill the missing ``Age`` values of the global ``combined`` frame.

    Rows whose age is NaN receive the value returned by ``fill_age``; all
    other rows keep their age. Returns the updated ``combined`` frame.
    """
    global combined

    def _age_or_group_median(row):
        if np.isnan(row['Age']):
            return fill_age(row)
        return row['Age']

    combined['Age'] = combined.apply(_age_or_group_median, axis=1)
    return combined

combined = process_age()

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
combined = combined.reset_index(drop=True)

In [None]:
# Discretise age into 16 equal-width intervals.
combined['Age'] = pd.cut(x=combined['Age'], bins=16)

In [None]:
# Discretise fare into 6 equal-width intervals.
combined['Fare'] = pd.cut(x=combined['Fare'], bins=6)

In [None]:
# Remove the identifier and free-text columns that are not used as features.
combined = combined.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
# Number of relatives aboard: siblings/spouses plus parents/children.
combined['Relative'] = combined['SibSp'].add(combined['Parch'])

In [None]:
# Check dtypes and non-null counts after imputation and feature creation.
combined.info()

In [None]:
# Single encoder instance; it is re-fitted on each column where it is used.
le=LabelEncoder()

In [None]:
# Replace every column with integer codes. The encoder is re-fitted on each
# column, so only the mapping of the last column remains stored in `le`.
for column in list(combined):
    encoded_values = le.fit_transform(combined[column])
    combined[column] = encoded_values

In [None]:
# Interaction features derived from the already label-encoded columns.
combined['Fare_Per_Person'] = (
    combined['Fare'].div(combined['Relative'].add(1)).astype(int)
)
combined['Age_Class'] = combined['Age'].mul(combined['Pclass'])

In [None]:
# Confirm dtypes and non-null counts of the encoded feature table.
combined.info()

In [None]:
# Preview the first rows of the encoded feature table.
combined.head()

In [None]:
# Split the combined table back into its train and test parts. The boundary
# comes from the training frame's length rather than a hard-coded 891, and
# each part is an independent copy so later column drops on the feature
# matrices do not operate on slices of `combined`.
train_df = combined.iloc[:len(train_data)].copy()
test_df = combined.iloc[len(train_data):].copy()

In [None]:
# Feature matrices for fitting and prediction, plus the survival target.
X_train, X_test = train_df, test_df
y_train = train_data['Survived']

In [None]:
# Display the training feature matrix.
X_train

In [None]:
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit an estimator and report training and cross-validated accuracy.

    Parameters:
        algo: estimator exposing ``fit`` and ``score``.
        X_train: training features.
        y_train: training labels.
        cv: value forwarded to ``cross_val_predict`` as ``cv``.

    Returns:
        (train_pred, acc, acc_cv): the cross-validated predictions, the
        accuracy on the training data after a single fit, and the accuracy
        of the cross-validated predictions; both accuracies are percentages
        rounded to two decimals.
    """
    # Single fit, scored on the same data it was trained on.
    fitted = algo.fit(X_train, y_train)
    train_accuracy = fitted.score(X_train, y_train)
    acc = round(train_accuracy * 100, 2)

    # Out-of-fold predictions, using all available cores.
    train_pred = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )
    cv_accuracy = metrics.accuracy_score(y_train, train_pred)
    acc_cv = round(cv_accuracy * 100, 2)

    return train_pred, acc, acc_cv

In [None]:

# Logistic Regression: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_log, acc_log, acc_cv_log = fit_ml_algo(
    algo=LogisticRegression(), X_train=X_train, y_train=y_train, cv=10
)
log_time = time.time() - start_time
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

In [None]:
# Random Forest Classifier: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_rfc, acc_rfc, acc_cv_rfc = fit_ml_algo(
    algo=RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
    X_train=X_train,
    y_train=y_train,
    cv=10,
)
rfc_time = time.time() - start_time
print("Accuracy: %s" % acc_rfc)
print("Accuracy CV 10-Fold: %s" % acc_cv_rfc)
print("Running Time: %s" % datetime.timedelta(seconds=rfc_time))

In [None]:
# k-Nearest Neighbours: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_knn, acc_knn, acc_cv_knn = fit_ml_algo(
    algo=KNeighborsClassifier(), X_train=X_train, y_train=y_train, cv=10
)
knn_time = time.time() - start_time
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))

In [None]:

# Gaussian Naive Bayes: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_gaussian, acc_gaussian, acc_cv_gaussian = fit_ml_algo(
    algo=GaussianNB(), X_train=X_train, y_train=y_train, cv=10
)
gaussian_time = time.time() - start_time
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))

In [None]:
# Linear SVC: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_svc, acc_linear_svc, acc_cv_linear_svc = fit_ml_algo(
    algo=LinearSVC(), X_train=X_train, y_train=y_train, cv=10
)
linear_svc_time = time.time() - start_time
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))

In [None]:
# Stochastic Gradient Descent: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_sgd, acc_sgd, acc_cv_sgd = fit_ml_algo(
    algo=SGDClassifier(), X_train=X_train, y_train=y_train, cv=10
)
sgd_time = time.time() - start_time
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))

In [None]:
# Decision Tree Classifier: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_dt, acc_dt, acc_cv_dt = fit_ml_algo(
    algo=DecisionTreeClassifier(), X_train=X_train, y_train=y_train, cv=10
)
dt_time = time.time() - start_time
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

In [None]:
# Gradient Boosting Trees: training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
train_pred_gbt, acc_gbt, acc_cv_gbt = fit_ml_algo(
    algo=GradientBoostingClassifier(), X_train=X_train, y_train=y_train, cv=10
)
gbt_time = time.time() - start_time
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))

In [None]:
# Neural Networks: small tanh MLP; training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
clf = MLPClassifier(
    activation='tanh',
    hidden_layer_sizes=(5, 5),
    random_state=3,
    max_iter=1000,
    warm_start=True,
)

train_pred_nn, acc_nn, acc_cv_nn = fit_ml_algo(
    algo=clf, X_train=X_train, y_train=y_train, cv=10
)
nn_time = time.time() - start_time
print("Accuracy: %s" % acc_nn)
print("Accuracy CV 10-Fold: %s" % acc_cv_nn)
print("Running Time: %s" % datetime.timedelta(seconds=nn_time))

In [None]:
# Perceptron (capped at 5 iterations): training accuracy, 10-fold CV accuracy and run time.
start_time = time.time()
perceptron = Perceptron(max_iter=5)
train_pred_pt, acc_pt, acc_cv_pt = fit_ml_algo(
    algo=perceptron, X_train=X_train, y_train=y_train, cv=10
)
pt_time = time.time() - start_time
print("Accuracy: %s" % acc_pt)
print("Accuracy CV 10-Fold: %s" % acc_cv_pt)
print("Running Time: %s" % datetime.timedelta(seconds=pt_time))

In [None]:
# Accuracy of each model on the data it was fitted on, best first.
# Fixes the misspelled header ("Reuglar") and model label ("Decent").
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 'Random Forest Classifier',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree', 'Gradient Boosting Trees', 'NN', 'Perceptron'],
    'Score': [
        acc_knn,
        acc_log,
        acc_gaussian,
        acc_rfc,
        acc_sgd,
        acc_linear_svc,
        acc_dt,
        acc_gbt,
        acc_nn,
        acc_pt
    ]})
print("---Regular Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

In [None]:
# 10-fold cross-validation accuracy of each model, best first.
# The header was copy-pasted from the training-accuracy table and claimed these
# were the "regular" scores; it now names the metric actually shown.
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression', 'Naive Bayes', 'Random Forest Classifier',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree', 'Gradient Boosting Trees', 'NN', 'Perceptron'],
    'Score': [
        acc_cv_knn,
        acc_cv_log,
        acc_cv_gaussian,
        acc_cv_rfc,
        acc_cv_sgd,
        acc_cv_linear_svc,
        acc_cv_dt,
        acc_cv_gbt,
        acc_cv_nn,
        acc_cv_pt
    ]})
print("---Cross-Validation Accuracy Scores---")
models.sort_values(by='Score', ascending=False)

In [None]:
from sklearn.model_selection import cross_val_score
# Seed the forest (same seed as the other forests in this notebook) so the
# fold scores are reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

In [None]:
# Fit the forest on the full training set so its fitted attributes become available.
rf.fit(X_train, y_train)

In [None]:
# Feature importances of the fitted forest, rounded to 3 decimals, largest first.
importances = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.round(rf.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.head(15)

In [None]:
# Bar chart of the feature importances.
importances.plot(kind='bar')

In [None]:
# Remove three features from both matrices.
# NOTE(review): presumably chosen from the importance ranking — confirm.
# New frames are bound instead of dropping in place: the in-place form mutates
# whatever object X_train/X_test alias, and raises KeyError if the cell is run
# a second time. `errors='ignore'` makes the cell safe to re-run.
X_train = X_train.drop(columns=["Fare_Per_Person", 'Fare', 'Parch'], errors='ignore')
X_test = X_test.drop(columns=["Fare_Per_Person", 'Fare', 'Parch'], errors='ignore')

In [None]:
# Baseline forest on the reduced feature set, with out-of-bag scoring enabled.
# Seeded (same seed as the other forests in this notebook) so the reported
# accuracy is reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
random_forest.fit(X_train, y_train)

# Accuracy on the training data, as a percentage rounded to two decimals.
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print(acc_random_forest, "%")

In [None]:
# Scale to a percentage before rounding; rounding first and multiplying after
# can print floating-point artefacts such as 81.82000000000001.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

In [None]:
# Inactive note kept as a bare string: a wider n_estimators grid that is not used.
'''"n_estimators": [100, 400, 700, 1000, 1500]'''

In [None]:
# Hyper-parameter grid for the random forest grid search.
param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10, 25, 50, 70],
    "min_samples_split": [2, 4, 10, 12, 16, 18, 25, 35],
    "n_estimators": [100, 200],
}
from sklearn.model_selection import GridSearchCV, cross_val_score
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it was
# an alias of 'sqrt', so 'sqrt' keeps the same behaviour on every version.
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', oob_score=True, random_state=1, n_jobs=-1)
clf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)


In [None]:
# Record the wall-clock start time for the timing measurement.
start_time = time.time()


In [None]:
# Fit `clf` on the training data.
clf.fit(X_train, y_train)


In [None]:
# Elapsed wall-clock time since `start_time` was recorded.
# Note: this reuses (and overwrites) the `pt_time` name.
pt_time = time.time() - start_time
print("Running Time: %s" % datetime.timedelta(seconds=pt_time))

In [None]:
# Show the best parameter combination found by the fitted search.
clf.best_params_

In [None]:
# Random forest with hand-entered hyper-parameters.
# NOTE(review): presumably taken from the grid-search result — confirm against clf.best_params_.
random_forest = RandomForestClassifier(criterion="gini",
                                       min_samples_leaf=5,
                                       min_samples_split=35,
                                       n_estimators=200,
                                       # 'auto' was removed in scikit-learn 1.3; it was an
                                       # alias of 'sqrt' for classifiers.
                                       max_features='sqrt',
                                       oob_score=True,
                                       random_state=1,
                                       n_jobs=-1)

random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)

# The training score used to be a bare mid-cell expression, so it was computed
# and then silently discarded; print it instead.
print("train accuracy:", round(random_forest.score(X_train, y_train) * 100, 2), "%")

# Scale before rounding to avoid floating-point artefacts in the printed value.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
# Out-of-fold predictions from 3-fold cross-validation, compared with the true labels.
predictions = cross_val_predict(estimator=random_forest, X=X_train, y=y_train, cv=3)
confusion_matrix(y_true=y_train, y_pred=predictions)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the cross-validated predictions.
print("Precision:", precision_score(y_true=y_train, y_pred=predictions))
print("Recall:", recall_score(y_true=y_train, y_pred=predictions))

In [None]:
from sklearn.metrics import f1_score
# F1 score of the cross-validated predictions.
f1_score(y_true=y_train, y_pred=predictions)

In [None]:
from sklearn.metrics import precision_recall_curve

# Second column of predict_proba for each training row, used as the score.
y_scores = random_forest.predict_proba(X_train)[:, 1]

precision, recall, threshold = precision_recall_curve(y_train, y_scores)

def plot_precision_and_recall(precision, recall, threshold):
    """Plot precision and recall against the decision threshold.

    The last element of ``precision`` and ``recall`` is dropped before
    plotting each against ``threshold``.
    """
    curves = (
        (precision, "r-", "precision"),
        (recall, "b", "recall"),
    )
    for values, line_format, curve_name in curves:
        plt.plot(threshold, values[:-1], line_format, label=curve_name, linewidth=5)
    plt.xlabel("threshold", fontsize=19)
    plt.legend(loc="upper right", fontsize=19)
    plt.ylim([0, 1])

plt.figure(figsize=(14, 7))
plot_precision_and_recall(precision, recall, threshold)
plt.show()

In [None]:
def plot_precision_vs_recall(precision, recall):
    """Plot precision (y axis) against recall (x axis).

    The axis labels were previously swapped relative to the plotted data:
    ``recall`` is passed as the x values and ``precision`` as the y values.
    """
    plt.plot(recall, precision, "g--", linewidth=2.5)
    plt.xlabel("recall", fontsize=19)
    plt.ylabel("precision", fontsize=19)
    plt.axis([0, 1.5, 0, 1.5])

plt.figure(figsize=(14, 7))
plot_precision_vs_recall(precision, recall)
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates over the score thresholds.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_scores)

def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    """Draw the ROC curve together with a red diagonal from (0, 0) to (1, 1)."""
    diagonal = [0, 1]
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, 'r', linewidth=4)
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)

plt.figure(figsize=(14, 7))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the same scores.
r_a_score = roc_auc_score(y_true=y_train, y_score=y_scores)
print("ROC-AUC-Score:", r_a_score)

In [None]:
# Final model used for the submission predictions.
random_forest = RandomForestClassifier(criterion="gini",
                                       min_samples_leaf=5,
                                       min_samples_split=35,
                                       n_estimators=100,
                                       # 'auto' was removed in scikit-learn 1.3; it was an
                                       # alias of 'sqrt' for classifiers.
                                       max_features='sqrt',
                                       oob_score=True,
                                       random_state=1,
                                       n_jobs=-1)
random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)


In [None]:
# Pair each test passenger id with its predicted label.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': Y_prediction,
})

In [None]:
# Store the prediction as an integer column and write the file without the index.
output = output.astype({'Survived': int})
output.to_csv('my_submission.csv', index=False)

In [None]:
# Read the written file back and preview it as a sanity check.
submissions_check = pd.read_csv("my_submission.csv")
submissions_check.head()

In [None]:
# Inactive code kept as a bare string: a parameter grid for an MLP search (not executed).
'''param_grid = { "activation" : ['tanh', 'relu'],'hidden_layer_sizes':[(4,5), (5,5), (6,6)]   ,  "alpha" : [0.0001, 0.001, 0.01, 0.1], "max_iter" : [100, 200, 400, 800]}
from sklearn.model_selection import GridSearchCV, cross_val_score '''

In [None]:
# Inactive code kept as a bare string: grid-search setup for an MLP (not executed).
'''nn = MLPClassifier(random_state=3, warm_start=True)
clf = GridSearchCV(estimator=nn, param_grid=param_grid)'''