# Preliminary Model Build for Mockup

In [1]:
# Import Warnings and Dependencies
import warnings
warnings.filterwarnings('ignore')
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib
from pathlib import Path
from collections import Counter



In [2]:
# import sklearn functions as needed 2022.11.1 Load then remove
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

### Data to Load

In [3]:
# Load the cleaned MSA labor-force / unemployment extract.
# The path is assembled from separate parts instead of one backslash-separated
# string: "\D" and "\M" are invalid escape sequences in a normal string
# literal, and a backslash is only a path separator on Windows.
# TODO: replace this CSV with summary data pulled from the SQL database.
file_path = Path("..") / "Data" / "Model_Data" / "MSA_Labor_Force_Unemployment_Cleaned.csv"
housing_data_df = pd.read_csv(file_path)
# Open question: should null values/rows be dropped or replaced here, or is
# all cleaning expected to happen upstream in SQL?
housing_data_df.head(10)


Unnamed: 0,Region ID,Year,Month,Civilian Labor Force,Employment,Unemployment,Unemployment Rate
0,394355,2018,3,1179232,1143501,35731,3.0
1,394355,2018,4,1183111,1147281,35830,3.0
2,394355,2018,5,1185971,1150176,35795,3.0
3,394355,2018,6,1188164,1152543,35621,3.0
4,394355,2018,7,1190096,1154811,35285,3.0
5,394355,2018,8,1192227,1157195,35032,2.9
6,394355,2018,9,1194833,1159797,35036,2.9
7,394355,2018,10,1197988,1162774,35214,2.9
8,394355,2018,11,1201566,1166183,35383,2.9
9,394355,2018,12,1205099,1169731,35368,2.9


### Split the Data Into Training and Testing

In [4]:
# Separate the prediction target from the candidate feature columns.
# The feature list is still TBD; for now every non-target column is kept.
# NOTE(review): the frame preview shows an "Unnamed: 0" column (looks like a
# saved row index) plus "Unemployment" and "Civilian Labor Force", from which
# the rate is presumably derived -- confirm whether these belong in X.
target_column = "Unemployment Rate"
y = housing_data_df[target_column]
X = housing_data_df.drop(columns=[target_column])
X.head(10)

Unnamed: 0,Region ID,Year,Month,Civilian Labor Force,Employment,Unemployment
0,394355,2018,3,1179232,1143501,35731
1,394355,2018,4,1183111,1147281,35830
2,394355,2018,5,1185971,1150176,35795
3,394355,2018,6,1188164,1152543,35621
4,394355,2018,7,1190096,1154811,35285
5,394355,2018,8,1192227,1157195,35032
6,394355,2018,9,1194833,1159797,35036
7,394355,2018,10,1197988,1162774,35214
8,394355,2018,11,1201566,1166183,35383
9,394355,2018,12,1205099,1169731,35368


In [5]:
# The target is continuous, but the classifiers below need discrete labels,
# so each distinct unemployment-rate value is encoded as an integer class id.
from sklearn import preprocessing
from sklearn import utils

# Fit the encoder on the target first, then encode it as a separate step
lab = preprocessing.LabelEncoder().fit(y)
y_transformed = lab.transform(y)
print(y_transformed)

[ 5  5  5  5  5  4  4  4  4  4  4  3  3  2  2  2  2  2  2  2  1  1  0  0
 16 60 57 50 47 35 32 25 27 26 25 24 22 21 19 17 14 12 11  9  9  9  8  7
  6  5  4  4  4  4 13 13 12 12 12 12 12 12 12 12 11 11 11 11 11 10 10 10
 10 10  9  9  8  8 18 66 61 55 51 38 37 31 29 28 28 28 27 26 24 23 21 19
 17 16 15 15 13 12 11 10 10  9  9  9 17 16 15 15 15 15 16 16 17 18 18 18
 17 16 14 13 12 11 11 11 10 10 11 11 33 69 67 63 61 56 54 48 49 46 43 41
 39 38 37 41 36 34 30 26 25 25 23 21 20 19 19 19 19 20 21 20 19 19 18 17
 17 17 17 16 15 14 13 13 12 12 13 13 13 13 13 12 12 11 33 65 64 59 58 53
 52 49 47 46 45 44 42 40 38 36 35 34 32 31 30 31 29 28 25 23 22 21 20 20
  2  2  2  2  3  3  3  3  3  2  2  1  1  1  0  0  1  1  2  2  3  3  4  5
 68 62 56 54 41 35 30 26 23 20 19 17 16 14 12 10  8  6  5  5  5  4  3  2
  2  2  2  2  2]


In [6]:
# Hold out a test set (default split proportions, fixed seed for repeatability)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    random_state=42,
)

# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Determine the Model Type
The model type will be chosen by comparing balanced accuracy scores and imbalanced classification reports.

Model 1: Naive Random Oversampling

In [7]:
# Balance the training classes with naive random oversampling
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X=X_train_scaled, y=y_train)
# Show the number of rows per class after resampling
Counter(y_resampled)

Counter({13: 14,
         10: 14,
         2: 14,
         12: 14,
         1: 14,
         15: 14,
         4: 14,
         29: 14,
         16: 14,
         47: 14,
         19: 14,
         25: 14,
         37: 14,
         54: 14,
         24: 14,
         46: 14,
         11: 14,
         18: 14,
         3: 14,
         36: 14,
         5: 14,
         17: 14,
         39: 14,
         8: 14,
         48: 14,
         43: 14,
         49: 14,
         9: 14,
         35: 14,
         31: 14,
         23: 14,
         41: 14,
         21: 14,
         33: 14,
         32: 14,
         61: 14,
         57: 14,
         38: 14,
         45: 14,
         22: 14,
         50: 14,
         14: 14,
         27: 14,
         40: 14,
         63: 14,
         20: 14,
         0: 14,
         30: 14,
         55: 14,
         7: 14,
         52: 14,
         69: 14,
         42: 14,
         26: 14,
         28: 14,
         68: 14,
         67: 14,
         6: 14,
         59: 14,
       

In [8]:
# Fit a logistic regression classifier on the randomly oversampled training set
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=200,
    random_state=42,
).fit(X_resampled, y_resampled)
# Echo the fitted estimator as the cell output
classifier

LogisticRegression(max_iter=200, random_state=42)

In [9]:
# Score the classifier on the held-out test set using balanced accuracy
from sklearn.metrics import balanced_accuracy_score

y_pred = classifier.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.08267195767195767

In [10]:
# Confusion matrix of true vs. predicted class ids on the test set
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 2, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Build the per-class imbalanced classification report, then print it
from imblearn.metrics import classification_report_imbalanced

imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      0.97      0.00      0.00      0.00         1
          2       0.00      0.00      1.00      0.00      0.00      0.00         7
          3       0.67      1.00      0.98      0.80      0.99      0.99         2
          4       0.00      0.00      1.00      0.00      0.00      0.00         4
          5       0.00      0.00      0.89      0.00      0.00      0.00         2
          6       0.00      0.00      0.94      0.00      0.00      0.00         0
          7       0.00      0.00      0.99      0.00      0.00      0.00         0
          8       0.50      0.50      0.98      0.50      0.70      0.47         2
          9       0.00      0.00      0.98      0.00      0.00      0.00         2
         10       0.00      0.00      1.00      0.00      0.00      0.00         1
         11       0.25      0.14      0.95      0.18      0.37      0.12         7
   

Model 2: SMOTE Oversampling

In [None]:
# Oversample the scaled training data with SMOTE (synthetic minority samples)
from imblearn.over_sampling import SMOTE

# NOTE(review): SMOTE's default k_neighbors is 5, so a class with fewer than
# 6 training rows will presumably make fit_resample raise. The encoded labels
# printed earlier suggest several such classes -- confirm before relying on
# this cell.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
# Show the number of rows per class after resampling
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the SMOTE-resampled data.
# Hyperparameters match the random-oversampling classifier (max_iter=200,
# random_state=42) so the two resampling strategies are compared on equal
# terms. The previous settings left max_iter at its default and used a
# different seed; with warnings globally silenced in this notebook, any
# non-convergence from the lower iteration cap would also go unnoticed.
model = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
model.fit(X_resampled, y_resampled)

In [None]:
# Score the SMOTE-trained model on the held-out test set (balanced accuracy)
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of true vs. predicted class ids for the SMOTE-trained model
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:

# Build the imbalanced classification report for the SMOTE-trained model, then print it
imbalanced_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(imbalanced_report)

### Additional tests will be run and summarized to find the model with the highest accuracy