In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train.shape

(891, 12)

In [4]:
## categorical features
categorical_features = [feature for feature in train.columns if train[feature].dtype == 'O']
categorical_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [5]:
## numrical fearure
numerical_feature = [ feature for feature in train.columns if feature not in categorical_features + ['PassengerId'] ]
numerical_feature

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [6]:
## descrete features
descrete_features = [ feature for feature in numerical_feature if len(train[feature].unique()) < 20 ]
descrete_features

['Survived', 'Pclass', 'SibSp', 'Parch']

In [7]:
## continous features
continous_features = [ feature for feature in numerical_feature if feature not in descrete_features]
continous_features

['Age', 'Fare']

In [8]:
## filling missing value of Age
train['Age'] = train.Age.fillna(train.Age.median())

## filling missing value of Embarked
train['Embarked'] = train.Embarked.fillna(train.Embarked.mode()[0])

## filling missing value of Cabin
train['Cabin_nan'] = np.where(train.Cabin.isnull(),1,0)
train['Cabin'] = train.Cabin.fillna('Missing')
train['Cabin'] = train.Cabin.astype(str).str[0] 


In [9]:
# converting the Name feature to Rare,Miss,Mrs
train['title'] = train.Name.str.extract(' ([A-Za-z]+)\.',expand=False)
train['title'] = train['title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['title'] = train['title'].replace('Mlle', 'Miss')
train['title'] = train['title'].replace('Ms', 'Miss')
train['title'] = train['title'].replace('Mme', 'Mrs')

## categorical to numerical
train['Sex'] = train.Sex.map({'female':0,'male':1})

## title dummies
title_dumiess = pd.get_dummies(train.title,drop_first=True)
train = pd.concat([train ,title_dumiess],axis=1)

## Embarked dummies
Embarked_dummies = pd.get_dummies(train.Embarked,drop_first=True)
train = pd.concat([train,Embarked_dummies],axis=1)

## Cabin dummies
Cabin_dummies = pd.get_dummies(train.Cabin ,drop_first=True)
train = pd.concat([train,Cabin_dummies],axis=1)

# dropping the features
train = train.drop(['PassengerId','Name','Ticket','title','Embarked','Cabin'],axis=1)

In [10]:
age_mean = np.mean(train.Age)
age_std = np.std(train.Age)

lower = age_mean - 3*age_std
upper = age_mean + 3*age_std

Age_outlier_removed = [x for x in train.Age if x >= lower and x <= upper ]

for _ in range(0,7):
    Age_outlier_removed.append(train.Age.median())
    
train['Age'] = Age_outlier_removed

In [11]:
train['Fare'] = np.log1p(train.Fare)

fare_mean = np.mean(train.Fare)
fare_std = np.std(train.Fare)

lower = fare_mean - 3*fare_std
upper = fare_mean + 3*fare_std

Fare_outlier_removed = [x for x in train.Fare if x >= lower and x <= upper ]

for _ in range(0,18):
    Fare_outlier_removed.append(train.Fare.mean())
    
train['Fare'] = Fare_outlier_removed

In [12]:
train.to_csv('titanic6_Train.csv')

In [13]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_nan,Miss,Mr,...,Q,S,B,C,D,E,F,G,M,T
0,0,3,1,22.0,1,0,2.110213,1,0,1,...,0,1,0,0,0,0,0,0,1,0
1,1,1,0,38.0,1,0,4.280593,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,3,0,26.0,0,0,2.188856,1,1,0,...,0,1,0,0,0,0,0,0,1,0
3,1,1,0,35.0,1,0,3.990834,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,0,3,1,35.0,0,0,2.202765,1,0,1,...,0,1,0,0,0,0,0,0,1,0


# For Test data

In [14]:
test = pd.read_csv('test.csv')

In [15]:
## filling missing value of Age
test['Age'] = test.Age.fillna(test.Age.median())

## fare
test['Fare'] = test.Fare.fillna(test.Fare.mean())

## filling missing value of Cabin
test['Cabin_nan'] = np.where(test.Cabin.isnull(),1,0)
test['Cabin'] = test.Cabin.fillna('Missing')
test['Cabin'] = test.Cabin.astype(str).str[0]

In [16]:
# converting the Name feature to Rare,Miss,Mrs
test['title'] = test.Name.str.extract(' ([A-Za-z]+)\.',expand=False)
test['title'] = test['title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['title'] = test['title'].replace('Mlle', 'Miss')
test['title'] = test['title'].replace('Ms', 'Miss')
test['title'] = test['title'].replace('Mme', 'Mrs')

## categorical to numerical
test['Sex'] = test.Sex.map({'female':0,'male':1})

## title dummies
title_dumiess = pd.get_dummies(test.title,drop_first=True)
test = pd.concat([test ,title_dumiess],axis=1)

## Embarked dummies
Embarked_dummies = pd.get_dummies(test.Embarked,drop_first=True)
test = pd.concat([test,Embarked_dummies],axis=1)

## Cabin dummies
Cabin_dummies = pd.get_dummies(test.Cabin ,drop_first=True)
test = pd.concat([test,Cabin_dummies],axis=1)

# dropping the features
test = test.drop(['PassengerId','Name','Ticket','title','Embarked','Cabin'],axis=1)

In [17]:
age_mean = np.mean(test.Age)
age_std = np.std(test.Age)

lower = age_mean - 3*age_std
upper = age_mean + 3*age_std

Age_outlier_removed = [x for x in test.Age if x >= lower and x <= upper ]

Age_outlier_removed.append(test.Age.median())
    
test['Age'] = Age_outlier_removed

In [22]:
test['Fare'] = np.log1p(test.Fare)

fare_mean = np.mean(test.Fare)
fare_std = np.std(test.Fare)

lower = fare_mean - 3*fare_std
upper = fare_mean + 3*fare_std

Fare_outlier_removed = [x for x in test.Fare if x >= lower and x <= upper ]


Fare_outlier_removed.append(test.Fare.mean())
Fare_outlier_removed.append(test.Fare.mean())
    
test['Fare'] = Fare_outlier_removed

In [23]:
test.to_csv('titanic6_Test.csv')

In [24]:
test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_nan,Miss,Mr,Mrs,Rare,Q,S,B,C,D,E,F,G,M
0,3,1,34.5,0,0,0.768381,1,0,1,0,0,1,0,0,0,0,0,0,0,1
1,3,0,47.0,1,0,0.753653,1,0,0,1,0,0,1,0,0,0,0,0,0,1
2,2,1,62.0,0,0,0.795089,1,0,1,0,0,1,0,0,0,0,0,0,0,1
3,3,1,27.0,0,0,0.781275,1,0,1,0,0,0,1,0,0,0,0,0,0,1
4,3,0,22.0,1,1,0.822976,1,0,0,1,0,0,1,0,0,0,0,0,0,1


## Building the Ml model

In [25]:
X = train.drop(['Survived','T'],axis=1)
y = train.Survived

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [26]:
from sklearn.metrics import accuracy_score,classification_report
def training_model(estimater ,X_train,y_train,X_test,y_test):
    model = estimater
    model.fit(X_train,y_train)
    score = model.score(X_train,y_train)
    print(f"\nTrain Accurcy: {score}")
    y_pred = model.predict(X_test)
    print(f"\nTest Accurcy: {accuracy_score(y_test,y_pred)}")
    print(f"\nClassification Report:\n {classification_report(y_test,y_pred)}")

In [27]:
from sklearn.linear_model import LogisticRegression
training_model(LogisticRegression(C=4, penalty='l1', solver='saga'),X_train,y_train,X_test,y_test)


Train Accurcy: 0.800561797752809

Test Accurcy: 0.8156424581005587

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85       111
           1       0.78      0.72      0.75        68

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179



In [28]:
from sklearn.ensemble import RandomForestClassifier
training_model(RandomForestClassifier(max_depth=5, max_features=6, n_estimators=80),X_train,y_train,X_test,y_test)


Train Accurcy: 0.8567415730337079

Test Accurcy: 0.8268156424581006

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       111
           1       0.78      0.76      0.77        68

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179



In [29]:
from sklearn.tree import DecisionTreeClassifier
training_model(DecisionTreeClassifier(criterion='gini',max_depth=7, min_samples_leaf=7),X_train,y_train,X_test,y_test)


Train Accurcy: 0.8623595505617978

Test Accurcy: 0.8100558659217877

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.90      0.85       111
           1       0.80      0.66      0.73        68

    accuracy                           0.81       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.81      0.81       179



In [30]:
from sklearn.svm import SVC
training_model(SVC(C=70, gamma=0.01),X_train,y_train,X_test,y_test)


Train Accurcy: 0.8679775280898876

Test Accurcy: 0.8156424581005587

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85       111
           1       0.75      0.76      0.76        68

    accuracy                           0.82       179
   macro avg       0.80      0.81      0.80       179
weighted avg       0.82      0.82      0.82       179



In [31]:
from xgboost import XGBClassifier
training_model(XGBClassifier(n_estimators=180,gamma=5,max_depth=7,),X_train,y_train,X_test,y_test)


Train Accurcy: 0.8455056179775281

Test Accurcy: 0.8212290502793296

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       111
           1       0.80      0.71      0.75        68

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [39]:
xgb_clf = XGBClassifier(n_estimators=180,gamma=5,max_depth=7)
xgb_clf.fit(X,y)
y_predicted = xgb_clf.predict(test)



In [37]:
rf_clf = RandomForestClassifier(max_depth=5, max_features=6, n_estimators=80)
rf_clf.fit(X,y)
y_predicted = rf_clf.predict(test)

In [40]:
pred = pd.DataFrame(y_predicted)
sub_df = pd.read_csv('gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)
datasets.columns=['PassengerId','Survived']
datasets.to_csv('titanic6_xgb.csv',index=False)