In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import multiprocessing

%matplotlib inline

np.random.seed(42)

## Perform Preprocessing

In [17]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
y_train = train_data['Survived']
train_data.drop(['Survived', 'Cabin', 'Ticket', 'Name'], axis=1, inplace=True)

In [18]:
#Reset index to PassengerId
train_data.set_index('PassengerId', inplace=True)

In [19]:
#Create list of attributes for preprocessing pipeline
num_attribs = ['Parch', 'Age', 'SibSp', 'Fare']
cat_attribs = ['Pclass', 'Embarked', 'Sex']

In [20]:
#Create pipeline for preprocessing
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),   
])

cat_pipeline = Pipeline([
    ("one_hot_encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="median")),
])

In [21]:
#Create preprocessing pipeline
preprocess_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

X_train = preprocess_pipeline.fit_transform(train_data[num_attribs + cat_attribs])

X_train





array([[-0.47367361, -0.56573646,  0.43279337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47367361,  0.66386103,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.47367361, -0.25833709, -0.4745452 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 2.00893337, -0.1046374 ,  0.43279337, ...,  0.        ,
         1.        ,  0.        ],
       [-0.47367361, -0.25833709, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ],
       [-0.47367361,  0.20276197, -0.4745452 , ...,  0.        ,
         0.        ,  1.        ]])

In [22]:
# pd.to_pickle(X_train, 'data/X_train.pkl')
# pd.to_pickle(y_train, 'data/y_train.pkl')


_______________________________________________________________________________________________________________________________

## Build Models

In [23]:
#Import cross validation and optimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

#Import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from multiprocessing import cpu_count

In [24]:
#Import warnings
import warnings

#Ignore warnings
warnings.filterwarnings('ignore')

In [25]:
#Experiment tracking
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [26]:


#Begin logging
mlflow.sklearn.autolog()

### Random Forest Classifier

In [27]:
#Set Experiment name
mlflow.set_experiment('Random Forest Classifier')

<Experiment: artifact_location='file:///home/ty/code/data_science/analysis/Titanic-Analysis/mlruns/431820994769440421', creation_time=1688500931603, experiment_id='431820994769440421', last_update_time=1688500931603, lifecycle_stage='active', name='Random Forest Classifier', tags={}>

In [33]:
mlflow.set_tag('Random Forest Classifier', 'Random Forest Classifier')
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_train)


#Set run name
mlflow.set_tag('run_1', 'RandomForestClassifier')



### Gradient Boosting Classifier

In [29]:
#Create Gradient Boosting Classifier
# gb_clf = GradientBoostingClassifier()

# #Fit model
# gb_clf.fit(X_train, y_train)

# #Predict on training set
# y_pred = gb_clf.predict(X_train)

### SGD Classifier

In [30]:
# #Make SGD Classifier
# sgd_clf = SGDClassifier()

# #Fit model
# sgd_clf.fit(X_train, y_train)

# #Predict on training set
# y_pred = sgd_clf.predict(X_train)
