# Titanic Comp

#### References
- https://www.kaggle.com/code/startupsci/titanic-data-science-solutions


In [42]:
import numpy as np
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Data Exploration

In [43]:
# Load the train and test splits from the Kaggle input directory,
# then preview the first rows of the training frame.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
train.head()

In [44]:
# Print the DataFrame.info() summary of the training frame.
train.info()

In [45]:
# Print the DataFrame.info() summary of the test frame.
test.info()

In [46]:
# Remove the Name, Ticket and Cabin columns from both frames.
train = train.drop(columns=['Name', 'Ticket', 'Cabin'])
test = test.drop(columns=['Name', 'Ticket', 'Cabin'])

In [47]:
# Drop, in place, the training rows whose Embarked value is missing.
# Only `train` is filtered here; `test` is left untouched.
train.dropna(subset=['Embarked'], inplace=True)

In [48]:
# Summary statistics for the numeric columns of the training frame.
train.describe(include=np.number)

In [49]:
# Summary statistics for the object-dtype ('O') columns of the training frame.
train.describe(include=['O'])

## Feature Engineering

In [50]:
# Family = SibSp + Parch, stored as a new column on the training frame.
fam = train["SibSp"] + train["Parch"]
train["Family"] = fam

In [51]:
# Family = SibSp + Parch, stored as a new column on the test frame.
fam = test["SibSp"] + test["Parch"]
test["Family"] = fam

In [52]:
# Preview the training frame with the new Family column.
train.head()

In [53]:
# Flag rows with no family aboard: IsAlone is 1 where Family == 0, else 0.
data = [train, test]
for frame in data:
    frame['IsAlone'] = (frame['Family'] == 0).astype('int64')

In [54]:
# Preview the training frame with the new IsAlone column.
train.head()

In [55]:
# Number of training rows for each Family value.
train.Family.value_counts()

In [56]:
# Number of training rows for each IsAlone value.
train.IsAlone.value_counts()

# EDA

In [57]:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set_style("whitegrid")

def barplot_it(col):
    """Plot how many training rows fall into each category of ``col``.

    Args:
        col: Name of a column in the module-level ``train`` DataFrame.

    Draws a seaborn bar plot titled with the column name and shows it.
    """
    # Take labels and heights from the same Series. Pairing ``unique()``
    # (order of first appearance) with ``value_counts()`` (order of
    # frequency) can attach a count to the wrong category label.
    counts = train[col].value_counts()
    # x/y are passed by keyword: seaborn 0.12+ only accepts ``data``
    # positionally.
    sns.barplot(x=counts.index, y=counts.values).set_title(col)
    plt.show()

In [58]:
# Pie chart of the Survived value counts. The labels are applied positionally,
# i.e. in the order value_counts() returns the classes.
train["Survived"].value_counts().plot.pie(
    labels=["Not Survived", "Survived"],
    autopct='%1.1f%%',
)

In [59]:
# Kernel-density plot of Age, first for the training frame, then the test frame.
for frame in (train, test):
    sns.displot(frame, x="Age", kind="kde")

plt.show()

In [60]:
# Side-by-side Family histograms (as percentages, with KDE overlay)
# for the training frame (left) and the test frame (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = zip(axes, (train, test), ('Train Family', 'Test Family'))
for ax, frame, title in panels:
    sns.histplot(data=frame, x="Family", ax=ax, kde=True, discrete=True, stat="percent")
    ax.set_title(title)

plt.tight_layout()

In [61]:
# Box plot of Fare in the training frame.
sns.boxplot(data=train, x="Fare").set_title("Train Fare Data")

In [62]:
# Box plot of Fare in the test frame.
sns.boxplot(data=test, x="Fare").set_title("Test Fare Data")


In [63]:
# Draw one category-count bar plot per listed column.
features = ["Pclass", "Sex", "Embarked", "IsAlone"]
for column_name in features:
    barplot_it(column_name)

In [64]:
# 2x2 grid of count plots, each split by Survived and titled with its column.
# axes.flat walks the grid row by row: Pclass, Embarked / Sex, IsAlone.
fig, axes = plt.subplots(2, 2, figsize=(12, 5))

for ax, column in zip(axes.flat, ("Pclass", "Embarked", "Sex", "IsAlone")):
    sns.countplot(data=train, x=column, hue="Survived", ax=ax)
    ax.set_title(column)

plt.tight_layout()

# Data Wrangling

In [65]:
# Fill missing Fare values in both frames with the average of the two
# per-frame mean fares (an unweighted mean of means, not the pooled mean).
mean_fare = (train["Fare"].mean() + test["Fare"].mean()) / 2
# Assign the filled column back instead of calling fillna(inplace=True) on
# `train.Fare`: that chained form acts on an intermediate Series and does not
# update the frame when pandas Copy-on-Write is active.
train["Fare"] = train["Fare"].fillna(mean_fare)
test["Fare"] = test["Fare"].fillna(mean_fare)

In [66]:
# Fill missing Age values in both frames with the average of the two
# per-frame mean ages (an unweighted mean of means, not the pooled mean).
mean_age = (train["Age"].mean() + test["Age"].mean()) / 2
# Assign the filled column back instead of calling fillna(inplace=True) on
# `train.Age`: that chained form acts on an intermediate Series and does not
# update the frame when pandas Copy-on-Write is active.
train["Age"] = train["Age"].fillna(mean_age)
test["Age"] = test["Age"].fillna(mean_age)

In [67]:
# Print the info() summary of both frames after imputation,
# separated by a 40-character underscore rule.
train.info()
print('_'*40)
test.info()

In [68]:
# Cut the training ages into 8 equal-width bins and count the rows per bin.
# AgeBand is only inspected here; it is not stored on the frame.
AgeBand = pd.cut(train.Age, 8)
AgeBand.value_counts()

In [69]:
# Both frames, so the loops below can apply the same in-place edits to each.
data = [test,train]

In [70]:
# Replace Age in place with a band code: <=10 -> 0, (10, 20] -> 1, ...,
# (70, 80] -> 7. Values above 80 are not touched.
for frame in data:
    frame.loc[frame['Age'] <= 10, 'Age'] = 0
    # Lower edges 10, 20, ..., 70 map to band codes 1..7, applied in
    # ascending order.
    for band_code, lower_edge in enumerate(range(10, 80, 10), start=1):
        in_band = (frame['Age'] > lower_edge) & (frame['Age'] <= lower_edge + 10)
        frame.loc[in_band, 'Age'] = band_code

In [71]:
# Print the info() summary of both frames after the Age banding,
# separated by a 40-character underscore rule.
train.info()
print('_'*40)
test.info()

In [72]:
# Preview the test frame after the Age banding.
test.head()

In [73]:
# Cut the training fares into 20 equal-width bins and count the rows per bin.
# FareBand is only inspected here; it is not stored on the frame.
FareBand = pd.cut(train.Fare, 20)
FareBand.value_counts()

In [74]:
# Replace Fare in place with a band code: <=25 -> 0, (25, 50] -> 1, >50 -> 2.
for frame in data:
    frame.loc[frame['Fare'].le(25), 'Fare'] = 0
    frame.loc[frame['Fare'].gt(25) & frame['Fare'].le(50), 'Fare'] = 1
    frame.loc[frame['Fare'].gt(50), 'Fare'] = 2

In [75]:
# Preview the training frame after the Fare banding.
train.head()

In [76]:
# Encode Embarked as integers in both frames: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in data:
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

In [77]:
# Print the info() summary of both frames after encoding Embarked,
# separated by a 40-character underscore rule.
train.info()
print('_'*40)
test.info()

In [78]:
# Encode Sex as integers in both frames: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
for frame in data:
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [79]:
# Preview the test frame after encoding Sex.
test.head()

In [80]:
# Print the info() summary of both frames after encoding Sex,
# separated by a 40-character underscore rule.
train.info()
print('_'*40)
test.info()

In [81]:
# Cast the training frame's Age and Fare columns to int.
for column in ("Age", "Fare"):
    train[column] = train[column].astype(int)

In [82]:
# Cast the test frame's Age and Fare columns to int.
for column in ("Age", "Fare"):
    test[column] = test[column].astype(int)

In [83]:
# Print the info() summary of both frames after the int casts,
# separated by a 40-character underscore rule.
train.info()
print('_'*40)
test.info()

# Models

In [84]:
#importing Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

## Data preparation

In [85]:
# Build the target vector and the train/test feature matrices.
# NOTE(review): only Pclass, Sex, SibSp and Parch are used; the Age, Fare,
# Embarked, Family and IsAlone columns prepared above are left out. Confirm
# this is intended.
features = ["Pclass", "Sex", 'SibSp', 'Parch']
y = train["Survived"]

X = pd.get_dummies(train[features])
X_test = pd.get_dummies(test[features])

In [86]:
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,ConfusionMatrixDisplay

def matrix_it(model,X,y):
    """Fit ``model`` on ``(X, y)`` and report how it does on that same data.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``score``.
        X: Feature matrix used for both fitting and evaluation.
        y: Target values matching ``X``.

    Plots the confusion matrix (normalized over all entries) of the
    predictions on ``X`` and prints ``model.score(X, y)``. Both figures
    are computed on the fitting data, not on held-out data.
    """
    model.fit(X, y)
    fitted_predictions = model.predict(X)
    # normalize="all" makes each cell a fraction of the total sample count.
    normalized_cm = confusion_matrix(y, fitted_predictions, normalize="all")
    ConfusionMatrixDisplay(normalized_cm).plot(cmap=plt.cm.PuBuGn)
    print(model.score(X, y))
    

In [87]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X)

SX = scaler.transform(X)
SX_test = scaler.transform(X_test)

## 1-SVC

In [88]:
# Fit a default SVC and report its confusion matrix and score on the
# training data.
# NOTE(review): this fits on the unscaled X although the standardized SX is
# available and is what the bagged SVC uses. Confirm this is intended.
svc = SVC()
matrix_it(svc,X,y)

#### Using bagging

In [89]:
# Bag 100 SVC estimators built from `svc` and evaluate them on the
# standardized training features.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the positional form works on both sides of the rename.
bsvm = BaggingClassifier(svc, n_estimators=100, random_state=0)
matrix_it(bsvm, SX, y)

## 2-Random Forest

In [90]:
# Fit a random forest (250 trees, depth capped at 7, fixed seed) and report
# its confusion matrix and score on the training data. `rf` is the model
# used for the final submission below.
rf = RandomForestClassifier(n_estimators=250, max_depth=7, random_state=1)
matrix_it(rf,X,y)

## 3-KNN

In [91]:
# Fit a default k-nearest-neighbours classifier on the unscaled X and report
# its confusion matrix and score on the training data.
KNN = KNeighborsClassifier()
matrix_it(KNN,X,y)

## 4-Decision Tree 

In [92]:
# Fit a default decision tree and report its confusion matrix and score on
# the training data.
Decision_Tree = DecisionTreeClassifier() 
matrix_it(Decision_Tree,X,y)

## OUTPUT

In [93]:
# Predict on the test features with the random forest fitted above and write
# the submission file (PassengerId, Survived).
# The former make_classification(...) call was removed: it generated
# synthetic data that was never used.
predictions = rf.predict(X_test)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission_best_Res.csv', index=False)
print("Your submission of was successfully saved!")