# In [1]:
# %% [code]
import numpy as np # linear algebra
import pandas as pd # data processing, 
import os
import sys
import shutil
from scipy import stats # for Box-Cox Transformation
from mlxtend.preprocessing import minmax_scaling # for min_max scaling

import matplotlib.pyplot as plt # data visualization
import seaborn as sns # Statistical library for data visualization

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,roc_auc_score,log_loss
from sklearn.svm import SVC,LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier

# %% [code]
def checkNullValue(df):
    """Print a null / non-null tally for every column of ``df``.

    For each column the column name is printed first, followed by the
    ``value_counts()`` of its ``isnull()`` mask and the dtype of that mask.

    Args:
        df: DataFrame to inspect; it is not modified.

    Returns:
        Always ``0``.
    """
    null_mask = df.isnull()
    for name in null_mask.columns.tolist():
        flags = null_mask[name]
        print(name)
        print(flags.value_counts(), flags.dtype)
    return 0

# %% [code]
def dataCleansing(df):
    """Fill the missing values of ``df`` column by column, in place.

    Columns of dtype ``int64``/``float64`` have their nulls replaced by the
    column mean; object/string columns have them replaced by the most
    frequent non-null value of that same column. Columns of any other dtype,
    columns without nulls, and columns that have no non-null value to derive
    a fill value from are left untouched.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object, after filling.
    """
    print(df.columns)
    for column in df.columns:
        series = df[column]
        if not series.isnull().any():
            # Nothing to fill; also avoids needless dtype changes.
            continue

        dtype = series.dtype
        if dtype == 'int64' or dtype == 'float64':
            fill_value = series.mean()
            if pd.isna(fill_value):
                # Every entry is null, so there is no mean to fill with.
                continue
        elif pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # Use the frame being cleaned (not a module-level global) so the
            # function works for any DataFrame it is handed.
            counts = series.value_counts()
            if counts.empty:
                # Every entry is null, so there is no most-frequent value.
                continue
            fill_value = counts.idxmax()
        else:
            continue

        # Assign the filled column back instead of calling an inplace method
        # on the ``df[column]`` selection: the latter may operate on a
        # temporary copy and leave ``df`` unchanged.
        df[column] = series.fillna(fill_value)
    return df

# %% [code]
def dummyFunc(df, list1):
    """One-hot encode the columns of ``df`` whose names appear in ``list1``.

    Each selected column is replaced by its ``pd.get_dummies`` indicator
    columns (prefixed with the original column name), which are appended to
    the right-hand side of the frame in the order the columns occur in ``df``.

    Args:
        df: Input DataFrame.
        list1: Collection of column names to encode.

    Returns:
        A DataFrame with the encoded columns, or ``df`` itself when none of
        its columns is listed in ``list1``.
    """
    selected = [name for name in df.columns if name in list1]
    for name in selected:
        indicators = pd.get_dummies(df[name], prefix=name)
        df = pd.concat([df, indicators], axis=1).drop(columns=name)
    return df

# %% [code]
def createTitle(df):
    """Derive a title from the ``Name`` column and encode it as an integer.

    The title is the run of letters immediately preceding the first ``.``
    in each name. The result contains every column of ``df`` except ``Name``
    plus two new ones: ``Title`` (the extracted text) and ``Title_num``
    (its integer code from the mapping below; titles that are not in the
    mapping, or names without a title, get ``0``).

    Args:
        df: DataFrame containing a string ``Name`` column. It is not modified.

    Returns:
        A new DataFrame without ``Name`` and with ``Title`` and ``Title_num``.
    """
    # Raw string: in a plain literal '\.' is an invalid escape sequence.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    title_mapping = {
        'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 1,
        'Lady': 2, 'Sir': 1, 'Capt': 1, 'Major': 1,
    }

    # Add the new columns to the frame returned by drop() so the caller's
    # frame is left exactly as it was passed in.
    result = df.drop(['Name'], axis=1)
    result['Title'] = titles
    result['Title_num'] = titles.map(title_mapping).fillna(0).astype(int)
    return result

# %% [code]
def ageRange(df):
    """Bucket the continuous ``Age`` column into a discrete ``AgeRange`` column.

    Ages up to 13.68 get code 1, and each following interval (upper bound
    inclusive) gets the next code up to 6 for ages in (66.73, 80.0]. Ages
    above 80.0 get code 0. Rows matching none of the conditions are not
    assigned.

    Args:
        df: DataFrame with a numeric ``Age`` column; modified in place.

    Returns:
        The same ``df`` object with ``AgeRange`` set.
    """
    age = df['Age']
    upper_bounds = [13.68, 26.94, 40.21, 53.47, 66.73, 80.0]

    # First bucket has no lower bound.
    df.loc[age <= upper_bounds[0], 'AgeRange'] = 1
    # Middle buckets: (low, high] -> 2, 3, 4, 5, 6.
    for code, (low, high) in enumerate(zip(upper_bounds, upper_bounds[1:]), start=2):
        df.loc[(age > low) & (age <= high), 'AgeRange'] = code
    # Anything beyond the last bound is coded 0.
    df.loc[age > upper_bounds[-1], 'AgeRange'] = 0
    return df

# %% [code]
def column_manipulation(df):
    """Replace ``SibSp`` and ``Parch`` with a single ``Family`` size column.

    ``Family`` is ``SibSp + Parch + 1``. The column is added to ``df``
    itself; the returned frame is ``df`` without ``SibSp`` and ``Parch``.

    Args:
        df: DataFrame with ``SibSp`` and ``Parch`` columns.

    Returns:
        A DataFrame containing ``Family`` but not ``SibSp``/``Parch``.
    """
    family_size = df['SibSp'].add(df['Parch']).add(1)
    df['Family'] = family_size
    return df.drop(columns=['SibSp', 'Parch'])

# %% [code]
def dataFrameManu(df, list1):
    """Run the full preprocessing pipeline on one dataset.

    Steps, in order: report null counts (``checkNullValue``), fill missing
    values (``dataCleansing``), one-hot encode the columns named in ``list1``
    (``dummyFunc``), derive the title columns (``createTitle``), bucket the
    age (``ageRange``) and build the family-size column
    (``column_manipulation``).

    Args:
        df: Raw DataFrame to preprocess.
        list1: Names of the categorical columns to one-hot encode.

    Returns:
        The preprocessed DataFrame.
    """
    # Report the null count of each column before anything is filled.
    checkNullValue(df)
    encoded = dummyFunc(dataCleansing(df), list1)
    with_age_range = ageRange(createTitle(encoded))
    return column_manipulation(with_age_range)

# %% [code]
if __name__ == "__main__":
    # Load the raw training and test sets.
    df1 = pd.read_csv("../input/titanic/train.csv")
    df2 = pd.read_csv("../input/titanic/test.csv")
    # These are categorical features and we need to convert them to numerical features.
    list1 = ['Pclass', 'Sex', 'Embarked']
    # survive_gender holds the mean of 'Survived' per sex, highest first.
    survive_gender = (
        df1[['Sex', 'Survived']]
        .groupby(['Sex'], as_index=False)
        .mean()
        .sort_values('Survived', ascending=False)
    )
    print(survive_gender)

    # %% [code]
    # The output locations are defined once so that the save and the reload
    # below always refer to the same files.
    output_dir = 'outputset'
    train_path = os.path.join(output_dir, 'Trainset.csv')
    test_path = os.path.join(output_dir, 'Testset.csv')
    os.makedirs(output_dir, exist_ok=True)

    # Preprocessing must run on every execution, not only when the output
    # directory is missing; otherwise df1_new/df2_new are undefined on reruns.
    print("processing training set")
    df1_new = dataFrameManu(df1, list1)
    print("Traning set after data preprocessing", df1_new)

    print("processing test set")
    df2_new = dataFrameManu(df2, list1)
    print("Test set after data preprocessing", df2_new)

    # Save the datasets after preprocessing.
    df1_new.to_csv(train_path, index=False)
    df2_new.to_csv(test_path, index=False)

    # %% [code]
    df1_new = pd.read_csv(train_path)
    df2_new = pd.read_csv(test_path)

    # %% [code]
    # Target and feature matrices; identifier-like and raw columns are dropped.
    y_train = df1_new['Survived']
    x_train = df1_new.drop(['Title', 'Survived', 'PassengerId', 'Cabin', 'Ticket', 'Age'], axis=1)
    x_test = df2_new.drop(['Title', 'PassengerId', 'Cabin', 'Ticket', 'Age'], axis=1)

    # NOTE: every accuracy below is computed with score(x_train, y_train),
    # i.e. on the same data the model was fitted on.

    # %% [code]
    logistic = LogisticRegression()
    logistic.fit(x_train, y_train)
    y_pred = logistic.predict(x_test)
    logistic_acc = round(logistic.score(x_train, y_train) * 100, 2)
    print('accuracy', logistic_acc)

    # %% [markdown]
    #

    # %% [code]
    classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)
    y_pred_forest = classifier.predict(x_test)
    RForest_acc = round(classifier.score(x_train, y_train) * 100, 2)
    print('accuracy', RForest_acc)

    # %% [code]
    boost_classifier = AdaBoostClassifier(n_estimators=100, random_state=0)
    boost_classifier.fit(x_train, y_train)
    y_pred_boost = boost_classifier.predict(x_test)
    AdaBoost_acc = round(boost_classifier.score(x_train, y_train) * 100, 2)
    print('accuracy', AdaBoost_acc)

    # %% [code]
    boosting_classifier = GradientBoostingClassifier(n_estimators=100)
    boosting_classifier.fit(x_train, y_train)
    y_pred_boosting = boosting_classifier.predict(x_test)
    GBoosting_acc = round(boosting_classifier.score(x_train, y_train) * 100, 2)
    print('accuracy', GBoosting_acc)

    # %% [code]
    SVMachine = SVC()
    SVMachine.fit(x_train, y_train)
    y_pred_SV = SVMachine.predict(x_test)
    SVM_acc = round(SVMachine.score(x_train, y_train) * 100, 2)
    print('accurancy', SVM_acc)

    # %% [code]
    evaluation = {
        'model': ['Logistic', 'RandomForest', 'AdaBoost', 'GradientBoosting', 'SVM'],
        'Accuracy': [logistic_acc, RForest_acc, AdaBoost_acc, GBoosting_acc, SVM_acc],
    }
    model = pd.DataFrame(evaluation)
    # sort_values returns a new frame; print it so the ranking is visible
    # when this runs as a script rather than as a notebook cell.
    print(model.sort_values('Accuracy', ascending=False))

    # %% [code]
    # The submissions are built inside the __main__ guard because they use
    # names (df2_new, y_pred_*) that only exist when the script part has run.
    submission1 = pd.DataFrame({'PassengerId': df2_new['PassengerId'], 'Survived': y_pred_forest})
    submission1.to_csv('submission1.csv', index=False)

    # %% [code]
    print(submission1)

    # %% [code]
    submission2 = pd.DataFrame({'PassengerId': df2_new['PassengerId'], 'Survived': y_pred_boosting})
    submission2.to_csv('submission2.csv', index=False)

    # %% [code]
    submission3 = pd.DataFrame({'PassengerId': df2_new['PassengerId'], 'Survived': y_pred_boost})
    submission3.to_csv('submission3.csv', index=False)

# ---- Captured notebook output (not part of the program) ----
#       Sex  Survived
# 0  female  0.742038
# 1    male  0.188908
# processing training set
# PassengerId
# False    891
# Name: PassengerId, dtype: int64 bool
# Survived
# False    891
# Name: Survived, dtype: int64 bool
# Pclass
# False    891
# Name: Pclass, dtype: int64 bool
# Name
# False    891
# Name: Name, dtype: int64 bool
# Sex
# False    891
# Name: Sex, dtype: int64 bool
# Age
# False    714
# True     177
# Name: Age, dtype: int64 bool
# SibSp
# False    891
# Name: SibSp, dtype: int64 bool
# Parch
# False    891
# Name: Parch, dtype: int64 bool
# Ticket
# False    891
# Name: Ticket, dtype: int64 bool
# Fare
# False    891
# Name: Fare, dtype: int64 bool
# Cabin
# True     687
# False    204
# Name: Cabin, dtype: int64 bool
# Embarked
# False    889
# True       2
# Name: Embarked, dtype: int64 bool
# Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
#       dtype='object')
# Traning set after data preprocessing      PassengerId  Survived        Age            Ticket     Fare Cabin
#
# NameError: name 'df3_new' is not defined