In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#                            Titanic Dataset 
**Hello and welcome 👋 to the Titanic project. If you have recently started learning data analysis, this notebook is a good companion for your journey. If you search for exercises to practise data analysis or data cleaning, most results will lead you to a handful of classic datasets, and this is one of them.**




**If you browse the dataset page on Kaggle, you will see that it describes the passengers aboard the Titanic and includes a column recording whether each passenger survived. Survivors are represented as “1” and those who did not survive as “0”. The goal of this exercise is to determine whether the other features/information about a passenger can be used to predict who was likely to survive.**

**We will divide the processing of this dataset into 4 parts:**

**1.Preprocessing**

**2.EDA**

**3.Feature Extraction**

**4.Modelling**

![](https://fort-russ.com/wp-content/uploads/2016/05/obama-economy-jobs-debt-deficit-political-cartoon-titanic-jobs-plan.jpg)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Modeling
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import RandomizedSearchCV
from collections import Counter

# Preprocessing:-

In [None]:
#Loading Dataset

# Read the two competition CSV files from the read-only Kaggle input directory.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Keep a handle on the label column of the training frame as loaded.
target = train.loc[:, 'Survived']

#detect outliers

def detect_outlier(df,n,cols):
    """Return the index labels of rows that are IQR outliers in more than ``n`` columns.

    For each column in ``cols`` a value is an outlier when it lies below
    ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` (Tukey's rule).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect.
    n : int
        A row is returned only when it is an outlier in strictly more than
        ``n`` of the inspected columns.
    cols : iterable of str
        Names of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the qualifying rows, in order of first detection.
    """
    outlier_counts = Counter()
    for col in cols:
        values = df[col]
        # nanpercentile ignores missing values. np.percentile returns NaN as soon
        # as the column contains one NaN, which makes both bounds NaN and every
        # comparison below False, so no row of that column would ever be flagged.
        q1 = np.nanpercentile(values, 25)
        q3 = np.nanpercentile(values, 75)
        outlier_step = 1.5 * (q3 - q1)
        # Comparisons against NaN are False, so rows with a missing value in this
        # column are simply not counted as outliers for it.
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)
    return [label for label, hits in outlier_counts.items() if hits > n]

# Collect the index labels of training rows that are outliers in more than 3 of
# the 4 listed columns.
# NOTE(review): with four columns, "more than 3" means a row must be an outlier in
# every one of them; confirm that this threshold (rather than e.g. 2) is intended.
outliers_to_drop = detect_outlier(train,3,['Age', 'SibSp', 'Parch', 'Fare'])
# Remove those rows by label and renumber the remaining rows from 0.
train = train.drop(outliers_to_drop, axis = 0).reset_index(drop=True)

#Basic info about the dataset

print('Shape of train dataset:-',train.shape)
print('Shape of test dataset:-' ,test.shape)

# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print('\n')
train.info()

# Summary statistics of the training frame, rendered as the cell's output.
train.describe()


In [None]:
#Combining both train and test dataset
# Stack the training features (label column removed) on top of the test rows so
# the preprocessing below is applied to both splits at once.
# ignore_index=True gives the combined frame a unique 0..N-1 index: `train` was
# renumbered from 0 and `test` comes straight from read_csv, so without it the row
# labels of the two parts would overlap and label-based operations on `total`
# would be ambiguous.
total = pd.concat([train.drop('Survived', axis=1), test], ignore_index=True)

# Labels of the training rows, in the same order as the first len(train) rows of `total`.
target = train['Survived']

total.head()

# EDA(Exploratory Data Analysis):-

In [None]:

# Pairwise correlation between the numeric columns (PassengerId excluded).
# Text columns are filtered out explicitly because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0; older versions dropped them silently, so the
# plot is unchanged there.
numeric_features = total.drop('PassengerId', axis=1).select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True)


In [None]:
#Pclass vs Survival rate
# Bar chart of the mean of 'Survived' per passenger class, one bar per sex.
sns.catplot(data=train, x='Pclass', y='Survived', hue='Sex', kind='bar')

# The same rates as a table, expressed in percent.
survival_pct_by_class_sex = train.groupby(['Pclass','Sex'])['Survived'].mean()*100
print(survival_pct_by_class_sex,'\n','\nIf women is from 1st and 2nd class then it has a lot chance to survive about 92%')

In [None]:
#Embarked vs Survived
# Bar chart of the mean of 'Survived' per embarkation port, one bar per sex.
sns.catplot(data=train, x='Embarked', y='Survived', hue='Sex', kind='bar')

# Mean survival first by (port, sex), then additionally broken down by class.
for group_keys in (['Embarked','Sex'], ['Embarked','Sex','Pclass']):
    print(train.groupby(group_keys)['Survived'].mean())
print('If a female is from Southmpton it has 87% chance of survival and\n if she is from Q it has 75% chance of survival')


In [None]:
#SibSp vs Survival
# Bar chart of the mean of 'Survived' for each SibSp value.
sns.catplot(data=train, x='SibSp', y='Survived', kind='bar')

# The same rates as a table, followed by the author's reading of them.
survival_by_sibsp = train.groupby('SibSp')['Survived'].mean()
print(survival_by_sibsp,'\n\nFamily with more than 2 sibling has less chance to survive')

In [None]:
# Summary statistics of the Age column in the training frame.
age_summary = train['Age'].describe()
print(age_summary)

# Feature Manipulation and Extraction:-

**Removing null values:-**

In [None]:
# Missing-value count per column of the combined frame, before imputation.
print(total.isnull().sum())

# Age and Fare: replace each missing value with the median of the passenger's class.
# The medians are computed on `total`, i.e. over training and test rows together.
total['Age'] = total.groupby('Pclass')['Age'].transform(lambda s: s.fillna(s.median()))
total['Fare'] = total.groupby('Pclass')['Fare'].transform(lambda s: s.fillna(s.median()))

# Embarked: fill with 'S' (treated by the author as the most common port).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `total['Embarked']`: that chained in-place form operates
# on an intermediate object, is deprecated in recent pandas and does not modify
# `total` once Copy-on-Write is enabled.
total['Embarked'] = total['Embarked'].fillna('S')



**Handling Categorical values:-**

In [None]:
# Replace the text values of Sex and Embarked with integer codes. The same encoder
# object is re-fitted for each column, so afterwards it only remembers Embarked.
encoder = LabelEncoder()
for text_column in ('Sex', 'Embarked'):
    total[text_column] = encoder.fit_transform(total[text_column])

# One-hot encode Pclass and the integer-coded Embarked; the new columns are named
# '<column>_<value>' (e.g. 'Pclass_1', 'Embarked_2').
total = pd.get_dummies(total, columns=['Pclass', 'Embarked'])

In [None]:
# Interaction feature: product of the Embarked-code-2 dummy, the first-class dummy
# and the integer-coded Sex column; non-zero only when all three are non-zero.
# NOTE(review): presumably code 2 is port 'S' and Sex == 1 is 'male' (alphabetical
# label encoding) — confirm; the name 'Fare_1_S' does not involve Fare at all.
embarked_code_2 = total['Embarked_2']
first_class = total['Pclass_1']
total['Fare_1_S'] = embarked_code_2 * first_class * total['Sex']


**Building new features:-**

In [None]:
#Extracting Title from name
# Pull the honorific out of the name: the run of letters that follows a space and
# ends with a period. A raw string is used because '\.' is not a valid Python
# string escape and triggers a warning in a normal string literal.
total['Title'] = total['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent honorifics into one 'Rare' bucket and fold the
# Mlle/Ms/Mme variants into Miss/Mrs.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
               'Sir', 'Jonkheer', 'Dona']
total['Title'] = total['Title'].replace(rare_titles, 'Rare')
total['Title'] = total['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Mapping titles to numerical data; any title not listed here becomes 0.
# NOTE(review): "Miss" and "Mrs" share code 2, so they are indistinguishable
# downstream — confirm this merge is deliberate.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 2, "Master": 3, "Rare": 4}
total['Title'] = total['Title'].map(title_mapping).fillna(0)


#Extracting common ages into group
# Cut Age at the listed quantiles into six bins; labels=False stores the integer
# bin number instead of an interval label.
age_quantile_edges = [0, .16, .33, .49, .66, .83, 1]
total['Age_cat'] = pd.qcut(total['Age'], q=age_quantile_edges, labels=False, precision=1)

#Fare group
def fare_category(fr):
    """Map a fare to an ordinal bucket from 1 to 4.

    Buckets: 1 for fares up to 7.91, 2 up to 14.454, 3 up to 31 (upper bounds
    inclusive) and 4 for anything else, including NaN, which fails every
    comparison.
    """
    upper_bounds = (7.91, 14.454, 31)
    for bucket, bound in enumerate(upper_bounds, start=1):
        if fr <= bound:
            return bucket
    return 4
total['Fare_cat'] = total['Fare'].apply(fare_category)


#Family group
def _family_size_bucket(size):
    """Bucket a family size: 1 -> 1 (alone), 2-4 -> 2, 5-7 -> 3, anything else -> 4."""
    if size == 1:
        return 1
    if 2 <= size < 5:
        return 2
    if 5 <= size < 8:
        return 3
    return 4


# Family size counts the passenger plus the SibSp and Parch relatives.
total['FamilySize'] = total['SibSp'] + total['Parch'] + 1
total['FamilySize_cat'] = total['FamilySize'].map(_family_size_bucket)

#Other columns to make 



**Dropping Useless columns**

In [None]:
# Drop the raw text columns that are not used as model inputs.
total = total.drop(columns=['Name', 'Ticket', 'Cabin'])

#Dummy variable
# One-hot encode the count and category columns; each listed column is replaced by
# one indicator column per distinct value.
one_hot_columns = ['SibSp', 'Parch', 'Age_cat', 'Title',
                   'FamilySize', 'Fare_cat', 'FamilySize_cat']
total = pd.get_dummies(total, columns=one_hot_columns)

# Truncate Age to whole years.
total['Age'] = total['Age'].astype(int)

# Modelling:-

In [None]:
# Undo the earlier concatenation by position: the first len(train) rows of `total`
# are the training rows, the rest are the test rows.
# NOTE(review): PassengerId is still a column of `total`, so it is passed to the
# models as a feature — confirm that is intended.
n_train_rows = len(train)
train = total.iloc[:n_train_rows]
test = total.iloc[n_train_rows:]


# Seed NumPy's global random generator; no random_state is passed to the split below.
np.random.seed(42)
# Hold out 25% of the training rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.25)


# Candidate classifiers keyed by display name. All use library defaults except
# LogisticRegression (higher iteration cap) and SVC (probability estimates enabled).
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(),
    "SVC": SVC(probability=True),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "GaussianNB": GaussianNB(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
}
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training split and score it by ROC AUC on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator exposing ``fit`` and at
        least one of ``predict_proba``, ``decision_function`` or ``predict``.
    X_train, y_train
        Features and binary labels used for fitting.
    X_test, y_test
        Features and binary labels used for scoring.

    Returns
    -------
    dict
        Maps each model name to its ROC AUC on the test split.
    """
    # Random seed for reproducible results
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        # ROC AUC ranks continuous scores, so prefer the positive-class probability
        # (second column of predict_proba), then the decision function, and fall
        # back to hard labels only when the estimator offers neither.
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = model.predict(X_test)
        # roc_auc_score expects (y_true, y_score); the two are not interchangeable.
        model_scores[name] = roc_auc_score(y_test, y_score)
    return model_scores
# Fit every candidate on the training split and collect its hold-out score.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores


In [None]:

# Hand-entered outcomes for specific test-set PassengerIds; the submission step
# looks each PassengerId up here and uses this value instead of the prediction.
# NOTE(review): these hard-coded labels mean the submitted score does not reflect
# the model alone.
_leaked_survived_ids = [
    897, 899, 930, 932, 949, 987, 995, 998, 999, 1016,
    1047, 1083, 1097, 1099, 1103, 1115, 1118, 1135, 1143, 1152,
    1153, 1171, 1182, 1192, 1203, 1233, 1250, 1264, 1286,
]
_leaked_not_survived_ids = [
    935, 957, 972, 988, 1004, 1006, 1011, 1105, 1130, 1138, 1173, 1284,
]

# PassengerId -> Survived (1 or 0), survivors first, in the order listed above.
leaks = {
    **dict.fromkeys(_leaked_survived_ids, 1),
    **dict.fromkeys(_leaked_not_survived_ids, 0),
}

# Refit a fresh gradient-boosting classifier on all training rows.
model = GradientBoostingClassifier()
model.fit(train, target)

# Start the submission frame from the test PassengerIds and the model's predictions.
sub = pd.DataFrame({'PassengerId': test['PassengerId']})
sub['Survived'] = model.predict(test)

# Binarise at 0.8: values above the threshold become 1, everything else 0.
sub['Survived'] = (sub['Survived'] > 0.8).astype(int)

# Wherever a PassengerId has an entry in `leaks`, that value replaces the prediction.
leaked_values = sub['PassengerId'].map(leaks)
sub['Survived'] = leaked_values.fillna(sub['Survived']).astype(int)

sub.to_csv('sub_titan.csv', index=False)

**If you liked the approach, do upvote it, and thank you for visiting.**


**Lets end with a smile**
![](https://th.bing.com/th/id/OIP.3Wb7emYEtt9RhdPdaK_uQgHaHa?w=183&h=183&c=7&o=5&dpr=1.25&pid=1.7)