<a href="https://www.kaggle.com/code/wesleysilvaalves/titanic?scriptVersionId=94778310" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
#Time Imports
from time import time

#Data Manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline

#Machine Learning Imports
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import  RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import  LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score as cvs, GridSearchCV as gsc
from sklearn.metrics import accuracy_score, precision_score, recall_score

#Ignoring Warnings
import warnings; warnings.filterwarnings('ignore')

# Functions

In [None]:
#Function to check a Column values
def check(col, data=None):
    """Print a quick summary of one column.

    Prints the number of null values, the number of non-null values,
    the number of distinct values and the per-value counts.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame holding the column. When omitted, the notebook-level
        ``train`` frame is used, which is the original behaviour.
    """
    frame = train if data is None else data
    column = frame[col]
    # value_counts() skips nulls, so its sum is the non-null total.
    counts = column.value_counts()
    print('Null values:', column.isnull().sum())
    print('Total:', counts.sum())
    print('Uniques:', column.nunique())
    print(counts)
#easy catplot
def catplot(col):
    """Show a bar-kind categorical plot of ``Survived`` against ``col``.

    Reads the notebook-level ``train`` frame.
    """
    options = dict(x=col, y='Survived', data=train, kind='bar', aspect=1.5)
    sns.catplot(**options)
    plt.show()
    
#easy lineplot
def lineplot(col):
    """Show a line plot of ``col`` against ``Survived``.

    Reads the notebook-level ``train`` frame; ``Survived`` is on the
    x axis and ``col`` on the y axis.
    """
    sns.lineplot(x='Survived', y=col, data=train)
    # Call show(): the bare attribute `plt.show` is a no-op expression.
    plt.show()

#  Function to run  and see results from Machine Learning Algorithms
def algorithm(alg,param):
    """Grid-search an estimator class and print its scores.

    Parameters
    ----------
    alg : class
        Estimator class (not an instance); it is instantiated with its
        defaults for the search.
    param : dict
        Parameter grid passed to the grid search.

    Uses the notebook-level ``x_train``, ``y_train``, ``test`` and
    ``submission`` objects. Prints the best parameters, the mean
    cross-validated score of the best candidate and the accuracy /
    precision / recall of the predictions on ``test`` measured against
    ``submission.Survived``.
    """
    search = gsc(alg(), param, cv=5)
    search.fit(x_train, y_train)
    print('Best Param Perfomance: {} \n'.format(search.best_params_))
    # Score on held-out folds of the training data; this is the only figure
    # here that is computed against labels the model was not fitted on.
    print('Cross-validated score: {}'.format(round(search.best_score_, 3)))
    # With the default refit=True the search has already refit the best
    # candidate on all of x_train, so a second fit is not needed.
    model = search.best_estimator_
    y_pred = model.predict(test)
    # NOTE(review): `submission` is read from gender_submission.csv, which
    # looks like a sample submission rather than ground truth; if so, the
    # three figures below measure agreement with that file only. Confirm.
    accuracy = round(accuracy_score(submission.Survived, y_pred),3)
    precision = round(precision_score(submission.Survived, y_pred),3)
    recall = round(recall_score(submission.Survived, y_pred),3)
    print('Accuracy: {}\nPrecision: {}\nRecall: {}'.format(accuracy,precision,recall))

In [None]:
# Read the three CSV files (training data, test data, gender submission)
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

# Looking into the Dataset

In [None]:
# Preview the first 20 rows of the training data.
train.head(20)

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Count the missing values in each column.
train.isnull().sum()

In [None]:
# Plot the missing values of the training data as a matrix.
msno.matrix(train)

In [None]:
# Display the frame read from gender_submission.csv; Survived is the column to predict.
submission

In [None]:
# Set the style before drawing so it applies to this figure, and draw the
# count plot once (a second call would stack a duplicate set of bars).
sns.set(style="whitegrid")
ax = sns.countplot(y="Survived", data=train)
plt.title("Survided List on the Titanic's Disaster", fontsize=20)
# Label every bar with its count, anchored at the bar's end and centred on
# it, so the labels follow the data instead of fixed coordinates.
for bar in ax.patches:
    count = bar.get_width()
    ax.annotate(
        str(int(count)),
        (count, bar.get_y() + bar.get_height() / 2),
        ha='left',
        va='center',
        fontsize=15,
    )
plt.show()


# Train Dataset Optimizing

In [None]:
# PassengerId is not needed for the analysis, so remove the column.
train = train.drop(columns=['PassengerId'])

In [None]:
# Display the training frame after the PassengerId column was dropped.
train

In [None]:
# Summarise the Pclass column (nulls, totals, distinct values, value counts).
check('Pclass')

In [None]:
# Plot Pclass against Survived: line plot first, then bar plot
for draw in (lineplot, catplot):
    draw('Pclass')

In [None]:
# Summarise Name (original note: no duplicate names), then drop the column.
check('Name')
train = train.drop(columns=['Name'])

In [None]:
# Summarise the Sex column before it is encoded.
check('Sex')

In [None]:
# Encode Sex as a number: 1 for 'female', 0 for every other value.
is_female = train['Sex'].eq('female')
train['Sex'] = np.where(is_female, 1, 0)
train['Sex'].head(10)

In [None]:
# Plot the encoded Sex column against Survived.
lineplot('Sex')
catplot('Sex')

In [None]:
# Summarise the Age column, including its null count.
check('Age')


In [None]:
# Line plot of Age against Survived.
lineplot('Age')

In [None]:
# Replace missing ages with the mean of the known ages
train['Age'] = train['Age'].fillna(train['Age'].mean())
train.head(15)


In [None]:
# SibSp = siblings and/or spouses; summarise the column.
check('SibSp')

In [None]:
# Plot SibSp against Survived: line plot, then bar plot.
lineplot('SibSp')
catplot('SibSp')

In [None]:
# Parch = parents and/or children; summarise the column, then plot it.
check('Parch')
lineplot('Parch')
catplot('Parch')
  

In [None]:
# Ticket has many distinct values; print how many there are.
print(train['Ticket'].nunique())
# Original note: no correlation with Survived was found in this plot.
sns.lineplot(x=train['Survived'], y=train['Ticket'])
plt.show()

In [None]:
# Remove the Ticket column and preview the result.
train = train.drop(columns=['Ticket'])
train.head(10)

In [None]:
check('Cabin')

# Original note: the specific cabin label does not change much.
sns.lineplot(x=train['Survived'], y=train['Cabin'])
plt.show()
# Original note: having no cabin recorded goes with a much worse outcome.
cabin_missing = train['Cabin'].isnull()
sns.lineplot(x=train['Survived'], y=cabin_missing)
plt.show()


In [None]:
# Turn Cabin into a flag: 1 when a cabin is recorded, 0 when it is missing
train['Cabin'] = np.where(train['Cabin'].notna(), 1, 0)

In [None]:
# Plot the Cabin flag against Survived: line plot, then bar plot.
lineplot('Cabin')
catplot('Cabin')

In [None]:
# Summarise Embarked (original note: two null values were found).
check('Embarked')


In [None]:
# Forward-fill: each missing Embarked takes the value of the row before it.
train['Embarked'] = train['Embarked'].ffill()

In [None]:
# Bar plot of Survived for each Embarked value.
catplot('Embarked')

In [None]:
# One figure per feature: overlaid histograms for Survived == 0 and Survived == 1

for i in ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']:
    # Non-null values of the feature for each outcome.
    dead = list(train.loc[train['Survived'].eq(0), i].dropna())
    alive = list(train.loc[train['Survived'].eq(1), i].dropna())
    sns.histplot(dead, bins=20, kde=False, color='r')
    sns.histplot(alive, bins=20, kde=False, color='b')
    plt.legend(['Deceased', 'Survived'])
    plt.title('Histogram by {}'.format(i))
    plt.show()

In [None]:
# One indicator column per Embarked value, each prefixed with 'Embarked'.
df_embarked = pd.get_dummies(train['Embarked'], prefix='Embarked')

In [None]:
# Preview the indicator columns built from Embarked.
df_embarked.head()

In [None]:
# The original Embarked column is removed in place.
train.drop(columns='Embarked', inplace=True)

In [None]:
# Append the Embarked indicator columns to the right of the training frame.
train = pd.concat([train, df_embarked], axis='columns')

In [None]:
# Preview the first 30 rows of the prepared training frame.
train.head(30)

# Fixing the Test Dataset

In [None]:
# Preview the first 15 rows of the test data before it is prepared.
test.head(15)


In [None]:
# Apply to the test frame the same steps that were applied to the training frame.
test.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
# Encode Sex as a number: 1 for 'female', 0 for every other value.
test['Sex'] = np.where(test['Sex'] == 'female', 1, 0)
# Missing ages are filled with the mean age of the training frame.
test['Age'] = test['Age'].fillna(train['Age'].mean())
# Forward-fill: a missing Fare takes the value of the row before it.
test['Fare'] = test['Fare'].ffill()
# Cabin becomes a flag: 1 when a cabin is recorded, 0 when it is missing.
test['Cabin'] = np.where(test['Cabin'].isnull(), 0, 1)
df_embarked = pd.get_dummies(test['Embarked'], prefix='Embarked')
test.drop(columns='Embarked', inplace=True)
test = pd.concat([test, df_embarked], axis=1)
# The indicator columns were built independently for train and test, so align
# the test frame to the training feature columns (same set, same order). An
# indicator that is absent from the test data is added as all zeros.
test = test.reindex(columns=train.columns.drop('Survived'), fill_value=0)

In [None]:
# Preview the first 20 rows of the prepared test frame.
test.head(20)

# Machine Learning Models

In [None]:
# Target is the Survived column; features are every other column.
y_train = train['Survived']
x_train = train.drop(columns='Survived')

In [None]:
# Print the head of the features and of the target, separated by a dashed rule.
print(x_train.head(), '-'*60, y_train.head(), sep='\n')

In [None]:
rfc = RandomForestClassifier
# random_state is fixed through the grid so the search, and the parameters
# it selects, are the same on every run.
param={'n_estimators': [5, 10, 50, 100], 'max_depth' : [2, 5, 10, 20], 'random_state': [42]}
algorithm(rfc,param)

In [None]:
lr = LogisticRegression
# C runs over one value per decade from 0.001 to 1000.
param = {'C':[0.001,0.01,0.1,1,10,100,1000],'max_iter':[100,1000,10000]}
algorithm(lr,param)

In [None]:
# Grid search for the linear SVM over C and the iteration limit.
svc = LinearSVC
param = {
    'C': [1.0, 10.0, 100.0],
    'max_iter': [100, 1000, 10000, 100000],
}
algorithm(svc, param)

In [None]:
# Grid search for k-nearest neighbours over the number of neighbours.
knn = KNeighborsClassifier
param = {
    'n_jobs': [-1],
    'n_neighbors': [1, 5, 10, 50, 100],
}
algorithm(knn, param)

In [None]:
# Grid search for Gaussian naive Bayes over 1000 log-spaced var_smoothing values (1 down to 1e-9).
gnb = GaussianNB
param = {
    'var_smoothing': np.logspace(0, -9, num=1000),
}
algorithm(gnb, param)

In [None]:
sgd=SGDClassifier
# random_state is fixed through the grid so the search, and the parameters
# it selects, are the same on every run.
param={'max_iter':[100,1000,10000],'n_jobs':[-1],'penalty':['l2'],'alpha':[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],'random_state':[42]}
algorithm(sgd,param)

# Saving the best models

In [None]:
# Fit the linear SVM with fixed hyper-parameters and predict the test frame.
svc = LinearSVC(C=1, max_iter=100000)
svc.fit(x_train, y_train)
svc_pred = svc.predict(test)

# Score the predictions against the Survived column of `submission`.
accuracy, precision, recall = (
    round(metric(submission.Survived, svc_pred), 3)
    for metric in (accuracy_score, precision_score, recall_score)
)
print('Accuracy: {}\nPrecision: {}\nRecall: {}'.format(accuracy,precision,recall))

# Write PassengerId plus the predicted Survived column to CSV.
svc_predict = submission[['PassengerId']].assign(Survived=svc_pred)
svc_predict.to_csv('svc_predict.csv', index=False)

In [None]:
# Fit logistic regression with fixed hyper-parameters and predict the test frame.
lr = LogisticRegression(C=1, max_iter=1000)
lr.fit(x_train, y_train)
lr_pred = lr.predict(test)

# Write PassengerId plus the predicted Survived column to CSV.
lr_predict = submission[['PassengerId']].assign(Survived=lr_pred)
lr_predict.to_csv('lr_predict.csv', index=False)

In [None]:
# Fit the random forest with fixed hyper-parameters. random_state is set so
# the fitted model, and therefore the saved CSV, is the same on every run.
rfc =RandomForestClassifier(max_depth= 5, n_estimators= 50, random_state=42)
rfc.fit(x_train,y_train)
rfc_pred = rfc.predict(test)
# Write PassengerId plus the predicted Survived column to CSV.
rfc_predict = pd.DataFrame(submission.PassengerId)
rfc_predict['Survived']= rfc_pred
rfc_predict.to_csv('rfc_predict.csv', index=False)
