Titanic dataset is one of the datasets available in sklearn.
You are given:

1. A Training dataset csv file with X train and Y train data

2. A X test File and you have to predict and submit predictions for this file.

Your task is to:

1. Use Logistic Regression and come with predictions.

Read Instructions carefully -

1. Use Logistic Regression as a training algorithm and submit results predicted.

2. Files are in csv format.

3. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions.

4. Your score is based on number of accurate predictions.

In [1]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import numpy as np 
import pandas as pd 

In [10]:
train_data = pd.read_csv("train_titanic.csv") 
test_data = pd.read_csv("test_titanic.csv") 
train_data.shape , test_data.shape

((668, 11), (223, 10))

In [25]:
print("------ TRAIN DATA NULL VALUES ------")
print(train_data.isnull().sum() )
print()
print("------ TEST DATA NULL VALUES ------")
print(test_data.isnull().sum())

------ TRAIN DATA NULL VALUES ------
Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

------ TEST DATA NULL VALUES ------
Pclass        0
Name          0
Sex           0
Age          45
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       173
Embarked      1
dtype: int64


REMOVING AND ADJUSTING THE VALUES SO THAT DATASET BECOMES CONSISTENT

In [90]:
train = train_data.copy() 
test = test_data.copy() 

def clean(points):
    points['Age'].fillna(points["Age"].median(skipna = True) , inplace = True)
    points["Embarked"].fillna(points["Embarked"].value_counts().idxmax(), inplace = True )
    points.drop("Cabin" , axis = 1, inplace = True)
    
    # WE DONT WANT NAME , SEX , TICKET 
    points.drop("Name" , axis = 1, inplace = True)
    points.drop("Ticket" , axis = 1, inplace = True)
    
    # CHANGING FEATURES BASED ON SEX , EMBARKED , PCLASS 
    points = pd.get_dummies(points , columns = ["Pclass" , "Embarked" , "Sex"])
    

    points['IsMinor'] = np.where(points['Age'] <= 16, 1, 0)
    points['TravelAlone'] = np.where((points["SibSp"] + points["Parch"]) > 0, 0, 1)

    points.drop('SibSp', axis = 1, inplace = True)
    points.drop('Parch', axis = 1, inplace = True)

    points.drop('Sex_female', axis = 1, inplace = True)
    
    return points 

test = clean(test) 
train = clean(train)
train.shape , test.shape
print(train.head())
print(test.head())


y_train = train["Survived"] 
train.drop("Survived" , axis = 1 ,inplace = True)

    Age    Fare  Survived  Pclass_1  Pclass_2  Pclass_3  Embarked_C  \
0  29.0  26.000         1         0         1         0           0   
1  29.0   8.050         0         0         0         1           0   
2  39.0  26.000         0         0         1         0           0   
3  29.0  21.075         0         0         0         1           0   
4  25.0   7.050         0         0         0         1           0   

   Embarked_Q  Embarked_S  Sex_male  IsMinor  TravelAlone  
0           0           1         0        0            0  
1           0           1         1        0            1  
2           0           1         1        0            1  
3           0           1         0        0            0  
4           0           1         1        0            1  
    Age     Fare  Pclass_1  Pclass_2  Pclass_3  Embarked_C  Embarked_Q  \
0   8.0  36.7500         0         1         0           0           0   
1  49.0  25.9292         1         0         0           0       

In [96]:

x_train = train 

y_train.shape , x_train.shape

print(x_train.head())

    Age    Fare  Pclass_1  Pclass_2  Pclass_3  Embarked_C  Embarked_Q  \
0  29.0  26.000         0         1         0           0           0   
1  29.0   8.050         0         0         1           0           0   
2  39.0  26.000         0         1         0           0           0   
3  29.0  21.075         0         0         1           0           0   
4  25.0   7.050         0         0         1           0           0   

   Embarked_S  Sex_male  IsMinor  TravelAlone  
0           1         0        0            0  
1           1         1        0            1  
2           1         1        0            1  
3           1         0        0            0  
4           1         1        0            1  


In [97]:
train = np.array(x_train)
test = np.array(test)
train.shape , test.shape

((668, 11), (223, 11))

In [98]:
x_train , y_train

(      Age     Fare  Pclass_1  Pclass_2  Pclass_3  Embarked_C  Embarked_Q  \
 0    29.0  26.0000         0         1         0           0           0   
 1    29.0   8.0500         0         0         1           0           0   
 2    39.0  26.0000         0         1         0           0           0   
 3    29.0  21.0750         0         0         1           0           0   
 4    25.0   7.0500         0         0         1           0           0   
 ..    ...      ...       ...       ...       ...         ...         ...   
 663  17.0  10.5000         0         1         0           0           0   
 664  29.0   7.7500         0         0         1           0           1   
 665  32.0  56.4958         0         0         1           0           0   
 666  22.0   9.8375         0         0         1           0           0   
 667  29.0  15.5000         0         0         1           0           1   
 
      Embarked_S  Sex_male  IsMinor  TravelAlone  
 0             1       

In [99]:
clf = LogisticRegression(solver = 'liblinear' , max_iter = 10000 , penalty = 'l2')
clf.fit(x_train , y_train)

LogisticRegression(max_iter=10000, solver='liblinear')

In [100]:
y_pred = clf.predict(test)
y_pred

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0], dtype=int64)

In [104]:
clf.score(x_train , y_train) , clf.score(test , y_pred)

(0.7964071856287425, 1.0)

In [105]:
clf.predict_proba(test)

array([[0.53581579, 0.46418421],
       [0.14290154, 0.85709846],
       [0.8534993 , 0.1465007 ],
       [0.22338453, 0.77661547],
       [0.58445499, 0.41554501],
       [0.62938649, 0.37061351],
       [0.79314348, 0.20685652],
       [0.85349947, 0.14650053],
       [0.42017493, 0.57982507],
       [0.90408668, 0.09591332],
       [0.21074789, 0.78925211],
       [0.90409382, 0.09590618],
       [0.05798977, 0.94201023],
       [0.44432234, 0.55567766],
       [0.62975808, 0.37024192],
       [0.74743966, 0.25256034],
       [0.89334609, 0.10665391],
       [0.78726313, 0.21273687],
       [0.89521653, 0.10478347],
       [0.30933919, 0.69066081],
       [0.39485382, 0.60514618],
       [0.86538394, 0.13461606],
       [0.16599474, 0.83400526],
       [0.22267798, 0.77732202],
       [0.90409982, 0.09590018],
       [0.54065356, 0.45934644],
       [0.73535699, 0.26464301],
       [0.65867452, 0.34132548],
       [0.16207854, 0.83792146],
       [0.28224153, 0.71775847],
       [0.

In [106]:
predictions = np.savetxt("prediction.csv" , y_pred , delimiter = ",")


In [107]:
pred = np.genfromtxt("prediction.csv" , delimiter = ",")
pred

array([0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0.])