In [6]:
## Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
pd.set_option("display.precision",2)

In [7]:
def get_data():
    ## Data Uploading
    titanic = sns.load_dataset('titanic')
    print(titanic.shape)
    ## Make the copy of the Original Data
    dt = titanic.copy()
    print(dt.head())
    return dt

In [8]:
print("Data Getting Ready")
data = get_data()
print(data.head())

Data Getting Ready
(891, 15)
   survived  pclass     sex   age  sibsp  parch   fare embarked  class    who  \
0         0       3    male  22.0      1      0   7.25        S  Third    man   
1         1       1  female  38.0      1      0  71.28        C  First  woman   
2         1       3  female  26.0      0      0   7.92        S  Third  woman   
3         1       1  female  35.0      1      0  53.10        S  First  woman   
4         0       3    male  35.0      0      0   8.05        S  Third    man   

   adult_male deck  embark_town alive  alone  
0        True  NaN  Southampton    no  False  
1       False    C    Cherbourg   yes  False  
2       False  NaN  Southampton   yes   True  
3       False    C  Southampton   yes  False  
4        True  NaN  Southampton    no   True  
   survived  pclass     sex   age  sibsp  parch   fare embarked  class    who  \
0         0       3    male  22.0      1      0   7.25        S  Third    man   
1         1       1  female  38.0      1

In [9]:
def data_preprocessing(data):
    data.drop(['embarked','who','adult_male','deck'],axis = 1 ,inplace = True)
    data['age'].describe()
    ###sns.boxplot(x = data['age'])
    ##sns.heatmap(data.isnull(),cmap = 'viridis')
    data['age'] = data['age'].fillna(data['age'].mean())
    data['embark_town'].isnull().sum()
    data['sex'].value_counts()
    data['alone'] = data['alone'].astype('int')
    d = {'male':1,'female':0}
    data['sex']= data['sex'].map(d)
    dum1 = pd.get_dummies(data['class'])
    data1 = pd.concat([data,dum1],axis = 1)
    data1.drop(['class'],axis = 1,inplace = True)
    dum2 = pd.get_dummies(data['embark_town'])
    data2 = pd.concat([data1,dum2],axis = 1)
    data2.drop(['embark_town'],axis = 1,inplace = True)
    data2.drop(['pclass','alive'],axis = 1, inplace = True) 
    return data2

In [10]:
print("Data Presprocessing Started")
data2 = data_preprocessing(data)

Data Presprocessing Started


In [11]:
data2

Unnamed: 0,survived,sex,age,sibsp,parch,fare,alone,First,Second,Third,Cherbourg,Queenstown,Southampton
0,0,1,22.0,1,0,7.25,0,0,0,1,0,0,1
1,1,0,38.0,1,0,71.28,0,1,0,0,1,0,0
2,1,0,26.0,0,0,7.92,1,0,0,1,0,0,1
3,1,0,35.0,1,0,53.10,0,1,0,0,0,0,1
4,0,1,35.0,0,0,8.05,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,27.0,0,0,13.00,1,0,1,0,0,0,1
887,1,0,19.0,0,0,30.00,1,1,0,0,0,0,1
888,0,0,29.7,1,2,23.45,0,0,0,1,0,0,1
889,1,1,26.0,0,0,30.00,1,1,0,0,1,0,0


In [None]:
def data_model_preparation(data2):
    x = data2.drop('survived',axis = 1)
    st = StandardScaler()
    x_transform = st.fit_transform(x)
    x_transform
    y = data2['survived']
    x_data = pd.DataFrame(x_transform,columns = x.columns)
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test = train_test_split(x_data,y,test_size = 0.3,random_state = 1234)
    return x_train,x_test,y_train,y_test

In [18]:
def model_building(x_train,x_test,y_train,y_test):
    model_log = LogisticRegression()
    model_knn = KNeighborsClassifier(n_neighbors=3)
    model_ds = DecisionTreeClassifier()
    model_rf = RandomForestClassifier()
    print("Model Training Started")
    model_log_train = model_log.fit(x_train,y_train)
    model_knn_train = model_knn.fit(x_train,y_train)
    model_ds_train = model_ds.fit(x_train,y_train)
    model_rf_train = model_rf.fit(x_train,y_train)
    print("Model Prediction Started")
    model_log_predict = model_log_train.predict(x_test)
    model_knn_predict = model_knn_train.predict(x_test)
    model_ds_predict = model_ds_train.predict(x_test)
    model_rf_predict = model_rf_train.predict(x_test)
    
    print("Confusion matrix and accuracy")
    acc_log = accuracy_score(model_log_predict,y_test)*100
    print("The Accuracy of Logistic Regression is",acc_log)
    acc_knn = accuracy_score(model_knn_predict,y_test)*100
    print("The Accuracy of Knn is",acc_knn)
    acc_ds = accuracy_score(model_ds_predict,y_test)*100
    print("The Accuracy of Decision Tree is",acc_ds)
    acc_rf = accuracy_score(model_rf_predict,y_test)*100
    print("The Accuracy of Random Forest is",acc_rf)
    return acc_log,acc_knn,acc_ds,acc_rf

In [19]:
acc_log,acc_knn,acc_ds,acc_rf = model_building(x_train,x_test,y_train,y_test)

Model Training Started
Model Prediction Started
Confusion matrix and accuracy
The Accuracy of Logistic Regression is 83.5820895522388
The Accuracy of Knn is 78.35820895522389
The Accuracy of Decision Tree is 78.73134328358209
The Accuracy of Random Forest is 82.08955223880598
