# Modeling after Preprocessing

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


In [None]:
# Load the train/test feature and target tables written out by the
# preprocessing step (read in the order: X_train, y_train, X_test, y_test).
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    (
        '../raw_data/X_train_preproc',
        '../raw_data/y_train_preproc',
        '../raw_data/X_test_preproc',
        '../raw_data/y_test_preproc',
    ),
)

In [None]:
# Baseline score: the share of the training target that belongs to
# class 0 (outs), taken from the normalized class frequencies.
class_proportions = y_train.value_counts(normalize=True)
baseline_score = class_proportions[0]

In [None]:
# Balance the training set with SMOTE oversampling; the original
# X_train / y_train are left untouched and the balanced copies get new names.
oversampler = SMOTE()
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

In [None]:
# Sanity-check the rebalancing: print shapes and class proportions of the
# target before resampling, then the same figures after resampling.
print(X_train.shape, y_train.shape, y_train.value_counts(normalize=True))
# The second line must report the *resampled* target's class split; using
# y_train here would just repeat the pre-resampling proportions.
print(
    X_train_resampled.shape,
    y_train_resampled.shape,
    y_train_resampled.value_counts(normalize=True),
)

In [None]:
# Create a default-configured logistic regression classifier and train it
# on the SMOTE-balanced training data.
log_model = LogisticRegression()
log_model.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Permutation feature importance of the fitted model, evaluated on the
# resampled training data with 25 shuffles per feature.
permutation_score = permutation_importance(
    log_model, X_train_resampled, y_train_resampled, n_repeats=25
)

# Build the table column-by-column so the scores keep their float dtype;
# stacking feature names and scores into a single array would coerce the
# score column to object dtype.
importance_df = pd.DataFrame({
    'feature': X_train_resampled.columns,
    'score decrease': permutation_score.importances_mean,
})

# Largest mean score decrease first; left as the trailing expression so the
# notebook renders the sorted table.
importance_df.sort_values(by="score decrease", ascending=False)