### Reading data from csv using pandas

In [27]:
import pandas as pd
data = pd.read_csv("train.csv")

In [28]:
#Priniting data

data

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


### Creating a copy of data for all operations to avoid changes in main data

In [29]:
df= data.copy()

### Check number of NULL enteries in data

In [30]:
df.isnull().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

### Dropping extra columns

In [31]:
# Dropping Name column as it adds no value to result, similarly ticket number
# Dropping cabin as most of the fields are empty

df.drop(['Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,female,29.0,1,0,26.0,S,1
1,3,male,,0,0,8.05,S,0
2,2,male,39.0,0,0,26.0,S,0
3,3,female,29.0,0,4,21.075,S,0
4,3,male,25.0,0,0,7.05,S,0


### Converting string values to integer

In [32]:
genders = {"male": 0, "female": 1}
data1 = [df]

for dataset in data1:
    dataset['Sex'] = dataset['Sex'].map(genders)
    
    
embarked = {"S": 0, "Q": 1, "C": 2}
data11 = [df]

for dataset in data11:
    dataset['Embarked'] = dataset['Embarked'].map(embarked)


In [33]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,1,29.0,1,0,26.0,0.0,1
1,3,0,,0,0,8.05,0.0,0
2,2,0,39.0,0,0,26.0,0.0,0
3,3,1,29.0,0,4,21.075,0.0,0
4,3,0,25.0,0,0,7.05,0.0,0


In [34]:
df.isnull().sum()

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Fare          0
Embarked      1
Survived      0
dtype: int64

### Removing NULL values by inserting mean values in age and 0 in Embarked column

In [35]:
mean = df["Age"].mean()
std = df["Age"].std() 
df.Age.fillna(mean,inplace=True)
df.Embarked.fillna(0,inplace=True)

In [36]:
df.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [37]:
df.shape

(668, 8)

### Divide data into X and Y

In [38]:
x1= df.iloc[:,:7]
y1= df.iloc[:,7:8]
print(x1.shape,y1.shape)

(668, 7) (668, 1)


### Preprocess data using Standard Scaler

In [39]:
from sklearn import preprocessing as pp

In [40]:
scaler=pp.StandardScaler()
scaler.fit(x1)
x1=scaler.transform(x1)
x1

array([[-3.56680918e-01,  1.33108386e+00, -5.49716380e-02, ...,
        -4.76767618e-01, -1.33914039e-01, -5.87892987e-01],
       [ 8.46666826e-01, -7.51267469e-01,  2.78774942e-16, ...,
        -4.76767618e-01, -5.30275902e-01, -5.87892987e-01],
       [-3.56680918e-01, -7.51267469e-01,  7.29710065e-01, ...,
        -4.76767618e-01, -1.33914039e-01, -5.87892987e-01],
       ...,
       [ 8.46666826e-01, -7.51267469e-01,  1.80432873e-01, ...,
        -4.76767618e-01,  5.39477166e-01, -5.87892987e-01],
       [ 8.46666826e-01,  1.33108386e+00, -6.04248830e-01, ...,
        -4.76767618e-01, -4.90805327e-01, -5.87892987e-01],
       [ 8.46666826e-01,  1.33108386e+00,  2.78774942e-16, ...,
        -4.76767618e-01, -3.65769168e-01,  6.54868137e-01]])

### Divide data into test and train

In [41]:
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(x1,y1)
print(X_train.shape, Y_test.shape)

(501, 7) (167, 1)


### Use inbuilt Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

In [43]:
clf = LogisticRegression(C = 2)
clf.fit(X_train, Y_train['Survived'])

In [44]:
Y_pred = clf.predict(X_test)
Y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

### Score of model

In [45]:
clf.score(X_test, Y_test)

0.7844311377245509

In [46]:
check = Y_pred - Y_test['Survived']
print(check.shape)
check

(167,)


101   -1
503    0
119    0
94     0
504   -1
      ..
67     0
55     0
531    0
343   -1
185    0
Name: Survived, Length: 167, dtype: int64

In [47]:
print("Number of 0 i.e. correct predictions = ", check[check == 0].count())
print("Number of 1 i.e. wrong predictions = ", check[check == 1].count())
print("Number of -1 i.e. wrong predictions = ", check[check == -1].count())

Number of 0 i.e. correct predictions =  131
Number of 1 i.e. wrong predictions =  13
Number of -1 i.e. wrong predictions =  23
