# Titanic Project

# Data Description
  a. survived - Survival (0 = No; 1 = Yes)
  b. pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
  c. name - Name
  d. sex - Sex
  e. age - Age
  f. sibsp - Number of Siblings/Spouses Aboard
  g. parch - Number of Parents/Children Aboard
  h. ticket - Ticket Number
  i. fare - Passenger Fare
  j. cabin - Cabin
  k. embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
  l. boat - Lifeboat (if survived)
  m. body - Body number (if did not survive and body was recovered)

In [1]:
#import libraries into Python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
#Read test and train data into Python
data1 = pd.read_csv('titanic_train.csv')
data3 = pd.read_csv('titanic_test.csv')

In [3]:
data1.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13.0,,,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,,,Croatia,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,,,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,4.0,,"Cornwall / Akron, OH",1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,,,"Barre, Co Washington, VT",0


In [4]:
data3.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C,B,,"Haverford, PA"
1,1150,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S,,,
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S,,,"Montreal, PQ"
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,,,"Finland Sudbury, ON"
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,,S,,,


In [5]:
# Drop unnecessary columns
data2 = data1.drop(['boat','body','home.dest'], axis = 1)
data3 = data3.drop(['boat','body','home.dest'], axis = 1)

In [6]:
data2.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,0


# Missing Values

In [58]:
# Look for missing values in each Column
data2.isnull().sum()/len(data2)*100

passenger_id     0.000000
pclass           0.000000
name             0.000000
sex              0.000000
age              0.000000
sibsp            0.000000
parch            0.000000
ticket           0.000000
fare             0.000000
cabin           77.529412
embarked         0.000000
survived         0.000000
Tittle           0.000000
dtype: float64

In [8]:
data2.shape

(850, 12)

In [9]:
data2[data2.embarked.isnull()]

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
109,168,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,1


In [10]:
data2.embarked.value_counts()

S    589
C    176
Q     84
Name: embarked, dtype: int64

In [11]:
pd.crosstab(data2[data2.survived != -88].survived, data2[data2.survived != -88].embarked)

embarked,C,Q,S
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,85,55,397
1,91,29,192


In [13]:
data2.groupby(['pclass','embarked']).fare.median()

pclass  embarked
1       C           79.2000
        Q           90.0000
        S           52.0000
2       C           19.7896
        Q           12.3500
        S           14.2500
3       C            7.2292
        Q            7.7500
        S            8.0500
Name: fare, dtype: float64

In [14]:
# Missing value in embarked is replaced by 'C' since we considered Fare and Class of the passenger
data2.embarked.fillna("c",inplace = True)

In [15]:
data3.embarked.fillna("c",inplace = True)

In [16]:
data2.isnull().sum()

passenger_id      0
pclass            0
name              0
sex               0
age             174
sibsp             0
parch             0
ticket            0
fare              1
cabin           659
embarked          0
survived          0
dtype: int64

In [17]:
data2[data2.fare.isnull()]

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
416,1225,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,0


In [18]:
median_fare = data2.loc[(data2.pclass == 3) & (data2.embarked == 'S'), 'fare'].median()
median_fare

8.05

In [19]:
median_fare1 = data3.loc[(data3.pclass == 3) & (data3.embarked == 'S'), 'fare'].median()
median_fare1

8.05

In [20]:
# Missing value in Fare is replaced by median fare of people in class 3 who embarked from Southamton.
data2.fare.fillna(median_fare, inplace = True)

In [21]:
data3.fare.fillna(median_fare, inplace = True)

In [22]:
data2.isnull().sum()

passenger_id      0
pclass            0
name              0
sex               0
age             174
sibsp             0
parch             0
ticket            0
fare              0
cabin           659
embarked          0
survived          0
dtype: int64

In [23]:
data2.name

0                                     Smyth, Miss. Julia
1                                        Cacic, Mr. Luka
2      Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...
3                  Hocking, Mrs. Elizabeth (Eliza Needs)
4                                        Veal, Mr. James
                             ...                        
845                          Hipkins, Mr. William Edward
846                              Kent, Mr. Edward Austin
847                  Kantor, Mrs. Sinai (Miriam Sternin)
848                            Peacock, Miss. Treasteall
849                                Greenberg, Mr. Samuel
Name: name, Length: 850, dtype: object

In [24]:
# Creating a function to split title from name to handle missing values in age
def Gettittle(name):
    first_name = name.split(',')[1]
    tittle = first_name.split('.')[0]
    tittle = tittle.strip().lower()
    return tittle


In [25]:
data2.name.map( lambda x : Gettittle(x))
data3.name.map( lambda x : Gettittle(x))

0        mr
1        mr
2        mr
3        mr
4        mr
       ... 
454      mr
455      mr
456    miss
457      mr
458    miss
Name: name, Length: 459, dtype: object

In [26]:
data3.name.map( lambda x : Gettittle(x))

0        mr
1        mr
2        mr
3        mr
4        mr
       ... 
454      mr
455      mr
456    miss
457      mr
458    miss
Name: name, Length: 459, dtype: object

In [27]:
data2.name.map(lambda x : Gettittle(x)).unique()
data3.name.map(lambda x : Gettittle(x)).unique()

array(['mr', 'master', 'miss', 'mrs', 'dr', 'mlle', 'rev', 'ms', 'col',
       'don', 'lady', 'dona'], dtype=object)

In [28]:
data3.name.map(lambda x : Gettittle(x)).unique()

array(['mr', 'master', 'miss', 'mrs', 'dr', 'mlle', 'rev', 'ms', 'col',
       'don', 'lady', 'dona'], dtype=object)

In [29]:
def Gettittle2(name) :
    tittle_group = { 'mr' : 'Mr',
               'mrs' : 'Mrs',
              'miss' : 'Miss',
             'master' : 'Master',
              'dr' : 'Officer',
              'ms' : 'Mrs',
              'mme' : 'Mrs',
              'col' : 'Officer',
              'sir' : 'Sir',
                'lady' : 'Lady',
              'rev' : 'Sir',
              'major' : 'Officer',
              'jonkheer' : 'Sir',
              'capt' : 'Officer',
              'the countess' : 'Lady',
              'mlle' : 'Miss',
                   'dona' : 'Lady',
                    'don' : 'Mr'}
    
    first_name = name.split(',')[1]
    tittle = first_name.split('.')[0]
    tittle = tittle.strip().lower()
    return tittle_group[tittle]

In [30]:
# Creating a new column Tittle so each missing value in age can be replaced by mean age with the same tittle
data3['Tittle'] = data3.name.map(lambda x : Gettittle2(x))

In [31]:
data2['Tittle'] = data2.name.map(lambda x : Gettittle2(x))

In [32]:
data3.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,Tittle
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C,Mr
1,1150,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S,Mr
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S,Mr
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,Mr
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,,S,Mr


In [33]:
tittle_age_median = data3.groupby('Tittle').age.transform('median')
data3.age.fillna(tittle_age_median, inplace = True)

In [34]:
tittle_age_median = data2.groupby('Tittle').age.transform('median')
data2.age.fillna(tittle_age_median, inplace = True)

In [35]:
data3.isnull().sum()

passenger_id      0
pclass            0
name              0
sex               0
age               0
sibsp             0
parch             0
ticket            0
fare              0
cabin           355
embarked          0
Tittle            0
dtype: int64

In [36]:
data2.cabin.value_counts()

G6                 4
C22 C26            4
B57 B59 B63 B66    4
D                  4
B96 B98            4
                  ..
F4                 1
B52 B54 B56        1
D56                1
B26                1
D38                1
Name: cabin, Length: 135, dtype: int64

In [37]:
data3.drop(['cabin'], axis = 1)

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,Tittle
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C,Mr
1,1150,3,"Risien, Mr. Samuel Beard",male,30.0,0,0,364498,14.5000,S,Mr
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0000,S,Mr
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.1250,S,Mr
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...
454,1194,3,"Sdycoff, Mr. Todor",male,30.0,0,0,349222,7.8958,S,Mr
455,403,2,"Eitemiller, Mr. George Floyd",male,23.0,0,0,29751,13.0000,S,Mr
456,108,1,"Fleming, Miss. Margaret",female,21.0,0,0,17421,110.8833,C,Miss
457,510,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.5000,S,Mr


In [38]:
Embarked = pd.get_dummies(data2['embarked'],drop_first = True)

In [39]:
Sex = pd.get_dummies(data2['sex'],drop_first = True)

In [40]:
train = pd.concat([data2, Sex, Embarked], axis = 1)

In [41]:
train.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived,Tittle,male,Q,S,c
0,1216,3,"Smyth, Miss. Julia",female,22.0,0,0,335432,7.7333,,Q,1,Miss,0,1,0,0
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,0,Mr,1,0,1,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,0,Mrs,0,0,1,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,1,Mrs,0,0,1,0
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,0,Mr,1,0,1,0


In [42]:
# Droping unnecessary colums after making appropriate changes to the data
train = train.drop(['passenger_id', 'name', 'sex', 'ticket', 'cabin', 'embarked', 'Tittle'], axis = 1)

In [43]:
# Data is now ready to run predictive models
data3.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,Tittle
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C,Mr
1,1150,3,"Risien, Mr. Samuel Beard",male,30.0,0,0,364498,14.5,,S,Mr
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S,Mr
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,Mr
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,,S,Mr


In [44]:
Sex_test = pd.get_dummies(data3['sex'],drop_first = True)

In [45]:
Embarked_test = pd.get_dummies(data3['embarked'],drop_first = True)

In [46]:
test = pd.concat([data3, Sex_test, Embarked_test], axis = 1)

In [47]:
test.head()

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,Tittle,male,Q,S,c
0,295,1,"Thayer, Mr. John Borland Jr",male,17.0,0,2,17421,110.8833,C70,C,Mr,1,0,0,0
1,1150,3,"Risien, Mr. Samuel Beard",male,30.0,0,0,364498,14.5,,S,Mr,1,0,1,0
2,89,1,"Davidson, Mr. Thornton",male,31.0,1,0,F.C. 12750,52.0,B71,S,Mr,1,0,1,0
3,1063,3,"Nirva, Mr. Iisakki Antino Aijo",male,41.0,0,0,SOTON/O2 3101272,7.125,,S,Mr,1,0,1,0
4,1020,3,"Minkoff, Mr. Lazar",male,21.0,0,0,349211,7.8958,,S,Mr,1,0,1,0


In [48]:
x = train.drop(['survived'], axis = 1)

In [49]:
x.head()
y = train['survived']

In [50]:
y.head()

0    1
1    0
2    0
3    1
4    0
Name: survived, dtype: int64

# Logistic Regression

In [51]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)
Log_reg = LogisticRegression()
Log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
predicts = Log_reg.predict(X_test)

In [53]:
accuracy = confusion_matrix(y_test,predicts)

In [54]:
accuracy

array([[127,  30],
       [ 31,  67]], dtype=int64)

In [55]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [60]:
predicts

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1], dtype=int64)

In [59]:
accuracy = accuracy_score(y_test,predicts)
print('The accuracy of the model with Logistic Regression is ',accuracy)

The accuracy of the model with Logistic Regression is  0.7607843137254902
