In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

In [2]:
# Load the Titanic train/test CSVs into DataFrames.
# NOTE(review): these are absolute, machine-specific Windows paths -- the notebook
# will not run on another machine without editing them.
train_data=pd.read_csv(r"C:\Users\Vishnu\Desktop\datas\titanic\train.csv")
test_data=pd.read_csv(r"C:\Users\Vishnu\Desktop\datas\titanic\test.csv")

# Feature Engineering

##### Checking for imbalanced data

In [3]:
# Class balance of the target: number of rows for each Survived value.
train_data.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

> Yes, the data is imbalanced: 549 passengers did not survive (0) versus 342 who survived (1).

##### Checking for missing values

In [4]:
# Number of missing (NaN) entries in each column.
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

> The Age column has 177 missing values, the Cabin column has 687, and the Embarked column has 2.

#####  Removing Unwanted features

In [5]:
# Columns removed before modelling.
useless_features = ['PassengerId','Ticket','Cabin']

# In-place drop: re-running this cell raises KeyError because the columns are already gone.
train_data.drop(useless_features,axis=1,inplace=True)

In [6]:
# Re-check the per-column missing-value counts after the drop.
train_data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

# Creating function to do all the feature engineering

In [7]:
# Function to create new feature -Age_Group

def age_group(age):
    """Map a numeric age to its age-bracket label.

    Upper bounds are inclusive: <=20 'Children', <=30 'Adult', <=45 'Middle Aged',
    <=60 'Old'. Any other value gets 'Senier Citizen' -- that includes NaN, for
    which every ``<=`` comparison is False.
    """
    # (inclusive upper bound, label), checked in ascending order.
    upper_bounds = (
        (20, 'Children'),
        (30, 'Adult'),
        (45, 'Middle Aged'),
        (60, 'Old'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    # Label text is kept exactly as-is: it is later turned into a one-hot column name.
    return 'Senier Citizen'
    

In [8]:
# Function to extract Title

def get_title(name):
    """Return the first run of ASCII letters in ``name`` that is immediately followed by a period.

    Raises IndexError when ``name`` contains no such run.
    """
    title_pattern = r'([A-Za-z]+)\.'
    candidates = re.findall(title_pattern, name)
    return candidates[0]

In [996]:
# Function to do all feature engineering on both test and train data

def _upsample_minority(frame, target, random_state=None):
    """Return a copy of ``frame`` in which every class of ``target`` has as many rows as the largest class.

    Smaller classes are topped up by sampling their own rows with replacement.
    The result gets a fresh 0..n-1 index so the duplicated rows do not share index labels.
    """
    counts = frame[target].value_counts()
    largest = counts.max() if len(counts) else 0
    extras = [
        frame[frame[target] == label].sample(
            n=int(largest - count), replace=True, random_state=random_state
        )
        for label, count in counts.items()
        if count < largest
    ]
    return pd.concat([frame] + extras, axis=0).reset_index(drop=True)


def _engineer_features(frame, drop_first):
    """Apply the shared cleaning / feature-engineering steps to one frame and return a new frame.

    ``drop_first`` is forwarded to ``pd.get_dummies`` for the Embarked, Age_Group and
    Title encodings. The input frame is not modified.
    """
    # Step 1 > Dropping unwanted features (work on a copy so the caller's frame is untouched;
    # errors='ignore' lets this run on a frame that already lost these columns).
    frame = frame.drop(columns=['PassengerId', 'Ticket', 'Cabin'], errors='ignore').copy()

    # Step 2 > Handling missing values, each frame using its own statistics.
    # Plain assignment instead of chained ``inplace=True`` fillna, which may act on a temporary.
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())
    # ``mode()`` returns a Series; passing it to fillna aligns on index and leaves the
    # real NaNs unfilled, so take its first value explicitly.
    embarked_mode = frame['Embarked'].mode()
    if not embarked_mode.empty:
        frame['Embarked'] = frame['Embarked'].fillna(embarked_mode.iloc[0])

    # Step 3 > Categorical features.
    # Sex -> 1 for 'male', 0 otherwise (same indicator the one-hot 'male' column gave).
    frame['Sex'] = (frame['Sex'] == 'male').astype(int)

    # New features: family size, age bracket, title taken from the name.
    frame['family_Size'] = frame['SibSp'] + frame['Parch'] + 1
    frame['Age_Group'] = frame['Age'].apply(age_group)

    title = frame['Name'].apply(get_title)
    title = title.replace(['Mlle', 'Ms'], 'Miss').replace('Mme', 'Mrs')
    # 'Lady' and 'Sir' are grouped as 'Rare' here; previously they were remapped to
    # 'Miss' / 'Mrs' first, which made their entries in this list unreachable.
    title = title.replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                           'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    frame['Title'] = title
    frame = frame.drop(columns='Name')

    # One-hot encode the remaining categorical columns, replacing each source column.
    for column in ('Embarked', 'Age_Group', 'Title'):
        dummies = pd.get_dummies(frame[column], drop_first=drop_first, dtype=int)
        frame = pd.concat([frame.drop(columns=column), dummies], axis=1)

    return frame


def preprocessing(training_data,testing_data,random_state=None):
    """Balance the training classes and feature-engineer both frames.

    Parameters
    ----------
    training_data : DataFrame with a ``Survived`` column plus the raw passenger columns.
    testing_data : DataFrame with the same raw passenger columns (no ``Survived``).
    random_state : optional seed forwarded to the upsampling draw; ``None`` keeps
        the draw non-deterministic.

    Returns
    -------
    (training_frame, testing_frame) : new DataFrames; the inputs are left unmodified.
        ``Survived`` stays the first column of the training frame, and the testing
        frame has exactly the training frame's feature columns, in the same order.
    """
    # Upsample from the frame that was passed in (not a notebook-level global), with the
    # number of extra rows derived from the class counts rather than hard-coded.
    balanced = _upsample_minority(training_data, 'Survived', random_state)

    train_frame = _engineer_features(balanced, drop_first=True)

    # Encode the test frame with every category kept, then align it to the training
    # columns: this removes the baseline/unseen categories and adds, as all-zero columns,
    # any category that only occurs in the training data.
    test_frame = _engineer_features(testing_data, drop_first=False)
    feature_columns = [name for name in train_frame.columns if name != 'Survived']
    test_frame = test_frame.reindex(columns=feature_columns, fill_value=0)

    return train_frame, test_frame

In [1130]:
# Reading training and testing file
# Re-read from disk so preprocessing() receives the raw frames; train_data was
# modified in place by an earlier cell.
train_data=pd.read_csv(r"C:\Users\Vishnu\Desktop\datas\titanic\train.csv")
test_data=pd.read_csv(r"C:\Users\Vishnu\Desktop\datas\titanic\test.csv")

In [1131]:
# Getting clean data.
# NOTE: despite the names, X is the processed *training* frame (it still contains
# Survived) and y is the processed *test* frame, not a label vector.
X,y=preprocessing(train_data,test_data)

In [1298]:
# Class counts of the processed training frame, to confirm the upsampling balanced them.
X.Survived.value_counts()

1    549
0    549
Name: Survived, dtype: int64

### Dimensionality Reduction

In [1032]:
# from sklearn.decomposition import PCA

# def reduce_dimenstion(X,no_com):
#     pca_model=PCA(n_components=no_com)
#     Components=pca_model.fit_transform(X)
#     return Components


In [1045]:
# pca_X=reduce_dimenstion(X.iloc[:,1:],10)

# Selecting Best model

In [1171]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.decomposition import PCA

In [1230]:
# Splitting the data: features are every column after the first, the target is the
# first column (Survived, as long as it stays at position 0); 5% of rows are held out.
# NOTE(review): preprocessing() upsamples with replacement before this split, so copies
# of the same passenger can land in both the training and the held-out part -- the
# held-out scores are likely optimistic. No random_state is set, so the split changes per run.
X_train,X_test,y_train,y_test=train_test_split(X.iloc[:,1:],X.iloc[:,0].values,test_size=0.05)


In [1275]:
def best_models(x_train,y_train,x_test,y_test):
    """Fit four tree-based classifiers and score each one, plus their majority vote.

    Returns ``(result, All_model)``:
      * ``result`` -- dict of the accuracy on ``x_test``/``y_test`` under the keys
        'Decision_Tree', 'GB_model', 'XGB_model', 'RF_model', and 'final_accuracy'
        for the per-sample majority vote across the four models.
      * ``All_model`` -- the fitted estimators, in that same order.
    """
    # Insertion order fixes both the fitting order and the order of the returned list.
    estimators = {
        'Decision_Tree': DecisionTreeClassifier(max_depth=13),
        'GB_model': GradientBoostingClassifier(n_estimators=200,max_depth=13),
        'XGB_model': xgboost.XGBClassifier(n_estimators=300,use_label_encoder=False,gamma=0.9,n_jobs=15,
                                           learning_rate=0.5, max_delta_step=0, max_depth=15),
        'RF_model': RandomForestClassifier(n_estimators=300),
    }

    scores = {}
    votes = []
    for label, estimator in estimators.items():
        estimator.fit(x_train,y_train)
        guess = estimator.predict(x_test)
        scores[label] = sum(guess==y_test)/len(y_test)
        votes.append(guess)

    # Re-arrange from (model, sample) to (sample, model) and keep the most common label per
    # sample; with four voters a 2-2 split is possible and is settled by Counter.most_common.
    per_sample = np.swapaxes(np.asarray(votes),0,1)
    majority = [Counter(row).most_common(1)[0][0] for row in per_sample]

    scores['final_accuracy'] = sum(np.array(majority)==y_test)/len(y_test)
    return scores, list(estimators.values())

In [1276]:
# Fit the classifiers and score them (and their majority vote) on the held-out split.
result,models_=best_models(X_train,y_train,X_test,y_test)



In [1277]:
# Held-out accuracy per model, plus the majority-vote accuracy.
result

{'Decision_Tree': 0.9636363636363636,
 'GB_model': 0.9636363636363636,
 'XGB_model': 0.8909090909090909,
 'RF_model': 0.9636363636363636,
 'final_accuracy': 0.9818181818181818}

### Saving the trained model using pickle

In [1278]:
# Storing the model in pickle

import pickle

# Open the destination file in binary write mode; this handle has to be closed
# after pickle.dump so the data is flushed to disk.
titanic_trained_best = open(r'C:\Users\Vishnu\Desktop\jupyter Projects\Pickle_Data\titanic_trained_best.pickle',mode='wb')

In [1279]:
# Serialize the list of fitted models returned by best_models().
pickle.dump(models_,titanic_trained_best)

In [1280]:
# Close the handle that was opened for writing; the name used here before
# (`titanic_trained`) is not defined anywhere in this notebook.
titanic_trained_best.close()

In [1281]:
# Re-open the saved pickle for reading (binary mode).
new_model=open(r'C:\Users\Vishnu\Desktop\jupyter Projects\Pickle_Data\titanic_trained_best.pickle','rb')

In [1282]:
# Load inside a `with` so the read handle is closed afterwards; rebinding the name
# directly to the unpickled object left the open file unreachable and never closed.
# NOTE: pickle.load can execute arbitrary code -- only load files this notebook wrote itself.
with new_model as model_file:
    new_model=pickle.load(model_file)

### Making prediction using multiple models

In [1290]:
def predict_function(X_test,models):
    """Return the per-sample majority vote of ``models`` on ``X_test`` as a list.

    Every element of ``models`` must expose ``predict(X_test)``. For each sample the
    label chosen is the one ``Counter.most_common(1)`` reports across the models.
    """
    per_model = [estimator.predict(X_test) for estimator in models]
    # Re-arrange from (model, sample) to (sample, model) so each row holds one sample's votes.
    per_sample = np.swapaxes(np.asarray(per_model),0,1)
    return [Counter(votes).most_common(1)[0][0] for votes in per_sample]

In [None]:
# There is a missing value in the Fare column of the test data set.
# Fill it with the median Fare of the processed test frame (named `y` here).
y.Fare=y.Fare.fillna(y.Fare.median())

In [1292]:
# Reducing dimension of test data
# y=reduce_dimenstion(y.values,13)

### Making predictions

In [1293]:
# Majority-vote predictions for the processed test frame, using the in-memory
# models_ list (not the list reloaded from the pickle file).
titanic_predition=predict_function(y,models_)

### Storing the predictions in a CSV file

In [1294]:
# Re-read the raw test CSV to recover PassengerId, which is not part of the processed frame.
test_data=pd.read_csv(r"C:\Users\Vishnu\Desktop\datas\titanic\test.csv")

In [1295]:
# Assemble the submission table: PassengerId next to the predicted Survived label.
# NOTE: this rebinds `result`, which earlier held the accuracy dict.
result=pd.DataFrame({"PassengerId":test_data.PassengerId,"Survived":titanic_predition})

In [1296]:
# Write the submission CSV without the DataFrame index column.
result.to_csv(r"C:\Users\Vishnu\Desktop\jupyter Projects\Titanic_prediction\submission.csv",index=False)