# Logistic Regression Exercise

Create a Logistic Regression model that predicts which passengers survived the sinking of the Titanic, based on features like age and class.

Data source: [Kaggle](https://www.kaggle.com/c/titanic)

```py

import codecademylib3_seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the passenger data
passengers = pd.read_csv('passengers.csv')

# Encode the Sex column numerically: female == 1, male == 0
passengers['Sex'] = passengers['Sex'].map({'male': 0, 'female': 1})

# Replace missing ages with the (rounded) mean age.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on passengers['Age'] acts on an intermediate object,
# which pandas warns about and which leaves the DataFrame untouched when
# copy-on-write is enabled.
mean_age = round(passengers['Age'].mean())
passengers['Age'] = passengers['Age'].fillna(mean_age)

# Indicator columns for passenger class: 1 if the passenger travelled in
# that class, 0 otherwise. Third class is the implicit baseline (both 0).
passengers['FirstClass'] = (passengers['Pclass'] == 1).astype(int)
passengers['SecondClass'] = (passengers['Pclass'] == 2).astype(int)
print(passengers)

# Feature matrix: the four columns the model learns from (raw, unscaled).
feature_columns = ['Sex', 'Age', 'FirstClass', 'SecondClass']
features = passengers[feature_columns]

# Target vector: the Survived column holds the labels to predict.
survival = passengers.Survived

# Split into training and test sets.
# NOTE: no random_state is passed, so the split (and therefore every score
# and coefficient recorded in the comments further down) can differ from
# one run to the next.
splits = train_test_split(features, survival)
train_features, test_features, train_labels, test_labels = splits

# Standardise the features to mean 0 and standard deviation 1.
# sklearn's LogisticRegression applies regularization, so the features
# should be on a comparable scale. The scaler is fitted on the training
# data only and then reused, unchanged, for the test data.
scaler = StandardScaler().fit(train_features)
train_features_scaled = scaler.transform(train_features)
test_features_scaled = scaler.transform(test_features)

# Fit a logistic regression classifier on the four scaled training features
# so it learns to separate survivors from non-survivors.
model = LogisticRegression()
model.fit(train_features_scaled, train_labels)

# Report accuracy (the fraction of correct classifications), first on the
# training data and then on the held-out test data.
# Recorded output from one run:
#   train: 0.7979041916167665  (~80% accuracy)
#   test:  0.8026905829596412
scored_splits = (
    (train_features_scaled, train_labels),
    (test_features_scaled, test_labels),
)
for split_features, split_labels in scored_splits:
    print(model.score(split_features, split_labels))

# Inspect the learned coefficients to see how much each of the four
# features contributed to the survival prediction.
# Column order: Sex, Age, FirstClass, SecondClass.
print(model.coef_)
# [[ 1.24283291 -0.48829518  1.03291324  0.54043375]]
# Gender and FirstClass were the most significant factors

# Sample passengers. Array order matches the training features:
# 'Sex' (0 == male, 1 == female), 'Age', 'FirstClass', 'SecondClass'.
# All values must be floats. Third class is encoded as both class flags 0.
Jack = np.array([0.0, 20.0, 0.0, 0.0])  # male, 3rd class
Rose = np.array([1.0, 17.0, 1.0, 0.0])  # female, 1st class
# Second class requires SecondClass == 1.0. Sex is 0.0 so that this row
# matches the recorded scaled output below (third row: Sex column equal to
# Jack's, SecondClass column positive).
You = np.array([0.0, 48.0, 0.0, 1.0])  # male, 2nd class

# Combine the passenger arrays into a single 2-D NumPy array (one row each)
sample_passengers = np.array([Jack, Rose, You])

# The model was trained on scaled features, so the samples must go through
# the same fitted scaler before prediction.
sample_passengers_scaled = scaler.transform(sample_passengers)
print(sample_passengers_scaled)
# [[-0.71267151 -0.73013432 -0.56813051 -0.51492865]
#  [ 1.40317101 -0.95824651  1.76015894 -0.51492865]
#  [-0.71267151  1.39891283 -0.56813051  1.94201662]]
print('--------------------------------------------')
# Make survival predictions!
print(model.predict(sample_passengers_scaled))
# [0 1 0] => only Rose made it!

# predict_proba() shows the probabilities behind those predictions.
# Column 1 is the probability of perishing, column 2 the probability of
# surviving the sinking.
print(model.predict_proba(sample_passengers_scaled))
# [[0.88988293 0.11011707]
#  [0.06308681 0.93691319]
#  [0.83347911 0.16652089]]

```