# Importing the Libraries

In [1]:
import numpy
import pandas as pd


In [2]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Importing the DataSet

In [3]:
train_data = pd.read_csv(r"C:\Users\Rupankar\Desktop\Kaggle\Titanic Problem\Data\train.csv")
test_data = pd.read_csv(r"C:\Users\Rupankar\Desktop\Kaggle\Titanic Problem\Data\test.csv")
data=[train_data , test_data]


In [4]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [5]:
women = train_data.loc[train_data.Sex=='female']["Survived"]
rate=sum(women)/len(women)

print(rate)

0.7420382165605095


In [6]:
men = train_data.loc[train_data.Sex =='male']["Survived"]
rate_men = sum(men)/len(men)

print(rate_men)

0.18890814558058924


# Missing Values



In [7]:
age_ref=pd.DataFrame(data= [train_data.groupby('Pclass')['Age'].mean()], columns = train_data['Pclass'].unique())
age_ref

Unnamed: 0,3,1,2
Age,25.14062,38.233441,29.87763


In [8]:
def fill_age(pclass,age):
    if pd.isnull(age):
        return age_ref[pclass]
    else:
        return age
for d in data:
    d['Age']= train_data.apply(lambda x: fill_age(x['Pclass'],x['Age']),axis=1)
for d in data:
    print(d.isnull().sum())
    

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [9]:
def fill_fare(fare):
    if pd.isnull(fare):
        return train_data['Fare'].mean()
    else:
        return fare
    
def fill_embark(embarked):
    if pd.isnull(embarked):
        return train_data['Embarked'].mode().iloc[0]
    else:
        return embarked
for d in data:
    d['Fare']=train_data.apply(lambda x : fill_fare(x['Fare']),axis=1)
    d['Embarked']=train_data.apply(lambda x: fill_embark(x['Embarked']),axis=1)

for d in data:
    print(d.isnull().sum())
    

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [10]:
for d in data:
    d.drop(['Cabin'],axis=1,inplace=True)

In [11]:
for d in data:
    print(d.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


# Feature Scaling


In [12]:
title_list=list()
for d in data:
    for title in d['Name']:
        title=title.split('.')[0].split(',')[1]
        title_list.append(title)
    d['Title']=title_list
    title_list=list()

In [13]:
for d in data:
    print(d['Title'].value_counts())


 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Col               2
 Major             2
 Mlle              2
 Jonkheer          1
 Capt              1
 Sir               1
 Mme               1
 Don               1
 Ms                1
 the Countess      1
 Lady              1
Name: Title, dtype: int64
 Mr        240
 Miss       78
 Mrs        72
 Master     21
 Rev         2
 Col         2
 Dona        1
 Ms          1
 Dr          1
Name: Title, dtype: int64


In [14]:
train_data['Title']=train_data['Title'].replace([' Dr',' Rev',' Col',' Mlle',' Major',' Sir',' Lady',' Capt',' Mme',' Ms',' Don',' the Countess',' Jonkheer'],' Others')
train_data['Title'].value_counts()


 Mr        517
 Miss      182
 Mrs       125
 Master     40
 Others     27
Name: Title, dtype: int64

In [15]:
test_data['Title']=test_data['Title'].replace([' Col',' Rev',' Dr',' Dona',' Ms'], ' Others')
test_data['Title'].value_counts()

 Mr        240
 Miss       78
 Mrs        72
 Master     21
 Others      7
Name: Title, dtype: int64

In [16]:
def get_size(df):
    if df['SibSp']+df['Parch']+1==1:
        return 'Single'
    if df['SibSp']+df['Parch']+1>1:
        return 'Small'
    if df['SibSp']+df['Parch']+1>4:
        return 'Big'
for d in data:
    d['FamilySize']=d.apply(get_size,axis=1)
    
for d in data:
    d['IsAlone']=1
    d['IsAlone'].loc[d['FamilySize']!='Single']=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Data PreProcessing 

In [17]:
sex = pd.get_dummies(train_data['Sex'])
embark = pd.get_dummies(train_data['Embarked'])
title = pd.get_dummies(train_data['Title'])
Pclass = pd.get_dummies(train_data['Pclass'])
FamilySize = pd.get_dummies(train_data['FamilySize'])

sex2 = pd.get_dummies(test_data['Sex'])
embark2 = pd.get_dummies(test_data['Embarked'])
title2 = pd.get_dummies(test_data['Title'])
Pclass2 = pd.get_dummies(test_data['Pclass'])
FamilySize2 = pd.get_dummies(test_data['FamilySize'])

for d in data:
    d.drop(['Sex','Embarked','Name','Ticket','Title','FamilySize'],axis=1,inplace=True)
    
train_data = pd.concat([sex,embark,train_data,title,FamilySize],axis=1)
test_data = pd.concat([sex2,embark2,test_data,title2,FamilySize2],axis=1)

In [18]:
X = train_data.drop('Survived',axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [19]:
scaler = MinMaxScaler()

scaler.fit(X_train)

scaler.transform(X_train)
scaler.transform(X_test)
scaler.transform(test_data)

array([[0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [20]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)


In [21]:
print(classification_report(y_test,y_pred))
print('\n')
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.95      0.86        99
           1       0.92      0.69      0.79        80

    accuracy                           0.83       179
   macro avg       0.85      0.82      0.82       179
weighted avg       0.85      0.83      0.83       179



[[94  5]
 [25 55]]


In [22]:
predictions = model.predict(test_data)
pred_list = [int(x) for x in predictions]

test2 = pd.read_csv(r"C:\Users\Rupankar\Desktop\Kaggle\Titanic Problem\Data\test.csv")
output = pd.DataFrame({'PassengerId': test2['PassengerId'], 'Survived': pred_list})
output.to_csv('MySubmission3.csv', index=False)

In [23]:
test_data


Unnamed: 0,female,male,C,Q,S,PassengerId,Pclass,Age,SibSp,Parch,Fare,IsAlone,Master,Miss,Mr,Mrs,Others,Single,Small
0,0,1,0,0,1,892,3,22,0,0,7.2500,1,0,0,1,0,0,1,0
1,1,0,1,0,0,893,3,38,1,0,71.2833,0,0,0,0,1,0,0,1
2,0,1,0,0,1,894,2,26,0,0,7.9250,1,0,0,1,0,0,1,0
3,0,1,0,0,1,895,3,35,0,0,53.1000,1,0,0,1,0,0,1,0
4,1,0,0,0,1,896,3,35,1,1,8.0500,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,1,0,0,1,1305,3,[29.87763005780347],0,0,0.0000,1,0,0,1,0,0,1,0
414,1,0,0,0,1,1306,1,44,0,0,7.9250,1,0,0,0,0,1,1,0
415,0,1,0,0,1,1307,3,[25.14061971830986],0,0,8.0500,1,0,0,1,0,0,1,0
416,0,1,0,0,1,1308,3,34,0,0,32.5000,1,0,0,1,0,0,1,0
