# Explore here

In [26]:
#import libraries
import pandas as pd
import numpy as np

import warnings
# Suppress every warning for the rest of the kernel session to keep cell output clean.
# NOTE(review): with no category/module filter this also hides library warnings from
# later cells (e.g. anything raised during model fitting) — consider narrowing it.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Load the Titanic train/test CSVs plus the submission file, then stack them into one frame.
train_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/machine-learning-content/master/assets/titanic_train.csv")
test_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/machine-learning-content/master/assets/titanic_test.csv")
test_survived_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/machine-learning-content/master/assets/gender_submission.csv")

# Attach a "Survived" column to the test rows (column assignment aligns on the row index).
# NOTE(review): gender_submission.csv looks like a sample submission (gender-based guesses)
# rather than ground-truth labels — confirm before treating these rows as real outcomes.
test_data["Survived"] = test_survived_data["Survived"]

# ignore_index=True renumbers the stacked rows 0..n-1 in one step, so no leftover
# "index" column has to be created by reset_index() and then dropped again.
total_data = pd.concat([train_data, test_data], ignore_index=True)
total_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Feature Engineering, condensed to one cell:

In [28]:
# Feature engineering as a single pipeline; each step returns a new frame.
# NOTE: the result is assigned back to `total_data`, so this cell cannot be re-run
# without first re-running the load cell (the original columns are gone afterwards).
# NOTE(review): the Age median is computed over all rows before the train/test split,
# so held-out rows influence the imputed value — confirm this is acceptable.
total_data = (
    total_data
    # Categorical dtype makes get_dummies one-hot encode these two numeric columns too.
    .astype({'Survived': 'category', 'Pclass': 'category'})
    .drop(columns=['Cabin'])
    # Impute missing ages with the median age.
    .assign(Age=lambda frame: frame['Age'].fillna(frame['Age'].median()))
    # Discard any row that still has a missing value, then renumber rows from 0.
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['Name', 'Ticket', 'Parch', 'PassengerId'])
    # drop_first=True drops the first level of each encoded column; the target
    # therefore ends up as the single column 'Survived_1'.
    .pipe(pd.get_dummies, dtype='int', drop_first=True)
)

## Build a Classification Model

1. Split the data into Predictors (X) and target (y)
2. Split the data into Training and Testing sets
3. Train a Logistic Regression model on the training datasets
4. Make predictions using the testing dataset

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [30]:
# Separate the target (y) from the predictors (X).
target_column = 'Survived_1'
X = total_data.drop(columns=[target_column])
y = total_data[target_column]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [31]:
# Fit a logistic regression with default settings on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
# NOTE(review): warnings are silenced globally in this notebook, so a convergence
# warning from the solver would not be visible here — verify the fit converged.
model = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for each split, used by the evaluation cells that follow.
train_preds, test_preds = model.predict(X_train), model.predict(X_test)

In [34]:
# Fraction of training rows whose predicted label equals the true label.
accuracy_score(y_true=y_train, y_pred=train_preds)

0.8553639846743295

In [36]:
# Fraction of held-out test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=test_preds)

0.8473282442748091

In [37]:
# Training-split confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_train, y_pred=train_preds)

array([[578,  63],
       [ 88, 315]])

### How do we get the True Positives, False Positives, etc.?

In [41]:
# ravel() flattens the 2x2 matrix row by row, which for two classes gives the counts
# in the order: true negatives, false positives, false negatives, true positives.
tn, fp, fn, tp = confusion_matrix(y_true=y_train, y_pred=train_preds).ravel()

In [50]:
# Manual metric calculations from the TRAINING-split confusion-matrix counts.
tn, fp, fn, tp = confusion_matrix(y_true=y_train, y_pred=train_preds).ravel()

n_total = tn + fp + fn + tp      # every training row
n_predicted_pos = tp + fp        # rows the model labelled as class 1
n_actual_pos = tp + fn           # rows that truly are class 1

acc = (tp + tn) / n_total        # share of correct predictions
precision = tp / n_predicted_pos # of predicted positives, how many are right
recall = tp / n_actual_pos       # of actual positives, how many were found

print(f'Accuracy Score: {acc}')
print(f'Precision Score: {precision}')
print(f'Recall Score: {recall}')

Accuracy Score: 0.8553639846743295
Precision Score: 0.8333333333333334
Recall Score: 0.7816377171215881


In [51]:
# Per-class precision / recall / f1 for the held-out TEST split.
# NOTE: the manual calculations earlier in this notebook use the TRAINING split, so these
# numbers are not expected to match them; to cross-check the manual values, build the
# report from y_train and train_preds instead.

print(classification_report(y_true=y_test, y_pred=test_preds))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       173
           1       0.77      0.79      0.78        89

    accuracy                           0.85       262
   macro avg       0.83      0.83      0.83       262
weighted avg       0.85      0.85      0.85       262

