##Logistic Regression

In [28]:

# Logistic Regression on Dataset
import pandas as pd
from random import seed
from random import randrange
from csv import reader
from math import exp


###Function Building

In [29]:

# Load a CSV file
def load_csv(filename):
  file = open(filename, "r")
  lines = reader(file)
  dataset = list(lines)
  return dataset


In [30]:
# Load dataset
filename = 'titanic.csv'
dataset = load_csv(filename)
print('Loaded data file {0} with {1} rows and {2} columns'.format(filename, len(dataset),
len(dataset[0])))

Loaded data file titanic.csv with 712 rows and 11 columns


In [31]:
# Convert string column to float
def str_column_to_float(dataset, column):
  for row in dataset:
    row[column] = float(row[column].strip())

In [32]:
# Find the min and max values for each column
def dataset_minmax(dataset):
  minmax = list()
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min, value_max])
  return minmax

In [33]:

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
  for row in dataset:
    for i in range(len(row)):
     row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

In [34]:

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
  dataset_split = list()
  dataset_copy = list(dataset)
  fold_size = int(len(dataset) / n_folds)
  for _ in range(n_folds):
    fold = list()
    while len(fold) < fold_size:
     index = randrange(len(dataset_copy))
     fold.append(dataset_copy.pop(index))
    dataset_split.append(fold)
  return dataset_split

In [35]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  correct = 0
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
     correct += 1
  return correct / float(len(actual)) * 100.0

In [36]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
  folds = cross_validation_split(dataset, n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
     row_copy = list(row)
     test_set.append(row_copy)
     row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores


In [37]:
# Make a prediction with coefficients
def predict(row, coefficients):
  yhat = coefficients[0]
  for i in range(len(row)-1):
    yhat += coefficients[i + 1] * row[i]
  return 1.0 / (1.0 + exp(-yhat))

In [38]:
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
  coef = [0.0 for i in range(len(train[0]))]
  for _ in range(n_epoch):
    for row in train:
     yhat = predict(row, coef)
     error = row[-1] - yhat
     coef[0] = coef[0] + l_rate * error * yhat * (1.0 - yhat)
     for i in range(len(row)-1):
       coef[i + 1] = coef[i + 1] + l_rate * error * yhat * (1.0 - yhat) * row[i]
  return coef

In [39]:
# Logistic Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
  predictions = list()
  coef = coefficients_sgd(train, l_rate, n_epoch)
  for row in test:
    yhat = predict(row, coef)
    yhat = round(yhat)
    predictions.append(yhat)
  return(predictions)

###Testing and Evaluating Algorithm

In [40]:
# Test the logistic regression algorithm on the dataset
seed(1)
# load and prepare data
filename = 'titanic.csv' 
dataset = load_csv(filename)
for i in range(len(dataset[0])):
  str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5
l_rate = 0.0001
n_epoch = 1000
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


Scores: [73.23943661971832, 80.28169014084507, 75.35211267605634, 78.87323943661971, 81.69014084507043]
Mean Accuracy: 77.887%


##Support Vector Machine

In [41]:
# importing numpy library
import numpy as np

In [42]:
class SVM_classifier():


  # initiating the hyperparameters
  def __init__(self, learning_rate, no_of_iterations, lambda_parameter):

    self.learning_rate = learning_rate
    self.no_of_iterations = no_of_iterations
    self.lambda_parameter = lambda_parameter


  
  # fitting the dataset to SVM Classifier
  def fit(self, X, Y):

    # m  --> number of Data points --> number of rows
    # n  --> number of input features --> number of columns
    self.m, self.n = X.shape

    # initiating the weight value and bias value

    self.w = np.zeros(self.n)

    self.b = 0

    self.X = X

    self.Y = Y

    # implementing Gradient Descent algorithm for Optimization

    for i in range(self.no_of_iterations):
      self.update_weights()



  # function for updating the weight and bias value
  def update_weights(self):

    # label encoding
    y_label = np.where(self.Y <= 0, -1, 1)



    # gradients ( dw, db)
    for index, x_i in enumerate(self.X):

      condition = y_label[index] * (np.dot(x_i, self.w) - self.b) >= 1

      if (condition == True):

        dw = 2 * self.lambda_parameter * self.w
        db = 0

      else:

        dw = 2 * self.lambda_parameter * self.w - np.dot(x_i, y_label[index])
        db = y_label[index]


      self.w = self.w - self.learning_rate * dw

      self.b = self.b - self.learning_rate * db



  # predict the label for a given input value
  def predict(self, X):

    output = np.dot(X, self.w) - self.b
    
    predicted_labels = np.sign(output)

    y_hat = np.where(predicted_labels <= -1, 0, 1)

    return y_hat  


In [43]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [44]:
# loading the data from csv file to pandas dataframe
df= pd.read_csv('/content/titanic_real.csv')

In [45]:
# print the first 5 rows of the dataframe
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [46]:
# number of rows and columns in the dataset
df.shape

(891, 12)

In [47]:
# getting the statistical measures of the dataset
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [48]:
df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [49]:
df = df.drop(['Name', 'Ticket','Cabin'], axis=1)

In [50]:
df.dropna()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
885,886,0,3,female,39.0,0,5,29.1250,Q
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [51]:

# encoding 'Sex' column
df.replace({'Sex':{'female':0,'male':1}},inplace=True)

# encoding 'Embarked' column
df.replace({'Embarked':{'C':0,'Q':1,'S':2}},inplace=True)

0 --> Not survived

1 --> Survived

In [52]:
# separating the features and target

features = df.drop(columns='Survived', axis=1)

target = df['Survived']


In [53]:
print(features)

     PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0              1       3    1  22.0      1      0   7.2500       2.0
1              2       1    0  38.0      1      0  71.2833       0.0
2              3       3    0  26.0      0      0   7.9250       2.0
3              4       1    0  35.0      1      0  53.1000       2.0
4              5       3    1  35.0      0      0   8.0500       2.0
..           ...     ...  ...   ...    ...    ...      ...       ...
886          887       2    1  27.0      0      0  13.0000       2.0
887          888       1    0  19.0      0      0  30.0000       2.0
888          889       3    0   NaN      1      2  23.4500       2.0
889          890       1    1  26.0      0      0  30.0000       0.0
890          891       3    1  32.0      0      0   7.7500       1.0

[891 rows x 8 columns]


In [54]:
print(target)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


###Data Standardization

In [55]:
scaler = StandardScaler()

In [56]:
scaler.fit(features)

StandardScaler()

In [57]:
standardized_data = scaler.transform(features)

In [58]:
print(standardized_data)

[[-1.73010796  0.82737724  0.73769513 ... -0.47367361 -0.50244517
   0.58683958]
 [-1.72622007 -1.56610693 -1.35557354 ... -0.47367361  0.78684529
  -1.93955453]
 [-1.72233219  0.82737724 -1.35557354 ... -0.47367361 -0.48885426
   0.58683958]
 ...
 [ 1.72233219  0.82737724 -1.35557354 ...  2.00893337 -0.17626324
   0.58683958]
 [ 1.72622007 -1.56610693  0.73769513 ... -0.47367361 -0.04438104
  -1.93955453]
 [ 1.73010796  0.82737724  0.73769513 ... -0.47367361 -0.49237783
  -0.67635748]]


In [59]:
features = standardized_data
target = df['Survived']

In [60]:
print(features)
print(target)

[[-1.73010796  0.82737724  0.73769513 ... -0.47367361 -0.50244517
   0.58683958]
 [-1.72622007 -1.56610693 -1.35557354 ... -0.47367361  0.78684529
  -1.93955453]
 [-1.72233219  0.82737724 -1.35557354 ... -0.47367361 -0.48885426
   0.58683958]
 ...
 [ 1.72233219  0.82737724 -1.35557354 ...  2.00893337 -0.17626324
   0.58683958]
 [ 1.72622007 -1.56610693  0.73769513 ... -0.47367361 -0.04438104
  -1.93955453]
 [ 1.73010796  0.82737724  0.73769513 ... -0.47367361 -0.49237783
  -0.67635748]]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


###Test Train Split

In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state = 2)

In [62]:
print(features.shape, X_train.shape, X_test.shape)

(891, 8) (712, 8) (179, 8)


###Training Model

In [63]:
classifier = SVM_classifier(learning_rate=0.0001, no_of_iterations=1000, lambda_parameter=0.01)

In [64]:
# training the SVM classifier with training data
classifier.fit(X_train, Y_train)

###Model Evaluation

In [65]:
# accuracy on training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [66]:
print('Accuracy score on training data = ', training_data_accuracy)

Accuracy score on training data =  0.3693820224719101


In [67]:
# accuracy on training data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [68]:
print('Accuracy score on test data = ', test_data_accuracy)

Accuracy score on test data =  0.441340782122905


###Building Predictive System

In [69]:
input_data = (1,3,1,22.0,1,0,7,2)

# change the input data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardizing the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The person has not survived')

else:
  print('The person has survived')

[[-1.73010796  0.82737724  0.73769513 -0.53037664  0.43279337 -0.47367361
  -0.50747884  0.58683958]]
[1]
The person has survived


