# Preliminary Model Build for Mockup

In [2]:
# Import Warnings and Dependencies
import warnings
warnings.filterwarnings('ignore')
# Import Dependencies
import pandas as pd
import numpy as np
import matplotlib
from pathlib import Path
from collections import Counter



In [3]:
# import sklearn functions as needed 2022.11.1 Load then remove
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

### Data to Load

In [4]:
# Load in Data
file_path = Path("..\Data\Model_Data\combined_data.csv") 
# Pulled in cleaned combined_data from SQL database directly
housing_data_df = pd.read_csv(file_path)
housing_data_df


Unnamed: 0,region_id,region_desc,week,sales_inventory,civilian_labor_force,employment,unemployment,unemployment_rate,med_house_price,rent_index,sales_count
0,394463,"Chicago, IL",3/31/2018,38581,4963614.0,4753730.0,209884.0,4.2,284600,1610.46,9053
1,394463,"Chicago, IL",4/30/2018,42253,4960889.0,4756797.0,204092.0,4.1,294600,1622.52,11208
2,394463,"Chicago, IL",5/31/2018,45757,4959982.0,4759532.0,200450.0,4.0,300600,1634.70,13708
3,394463,"Chicago, IL",6/30/2018,47492,4961285.0,4762416.0,198869.0,4.0,302267,1645.26,15524
4,394463,"Chicago, IL",7/31/2018,48984,4964193.0,4765720.0,198473.0,4.0,301967,1651.54,14358
...,...,...,...,...,...,...,...,...,...,...,...
270,394902,"Nashville, TN",5/31/2022,4590,1138543.0,1107909.0,30634.0,2.7,479967,1860.25,3382
271,394902,"Nashville, TN",6/30/2022,5436,1139523.0,1108733.0,30790.0,2.7,494966,1884.61,3700
272,394902,"Nashville, TN",7/31/2022,6341,1137422.0,1106593.0,30829.0,2.7,498300,1910.61,3041
273,394902,"Nashville, TN",8/31/2022,7106,1134574.0,1103911.0,30663.0,2.7,499633,1923.84,2859


In [5]:
# Check data types
housing_data_df.dtypes

region_id                 int64
region_desc              object
week                     object
sales_inventory           int64
civilian_labor_force    float64
employment              float64
unemployment            float64
unemployment_rate       float64
med_house_price           int64
rent_index              float64
sales_count               int64
dtype: object

In [None]:
# Need to remove latest data with 9/30/2022 since unemployment wasn't included
#housing_data_df = housing_data_df.loc[(housing_data_df["week"] == "9/30/2022")]
#housing_data_df

In [6]:
housing_data_cleaned = housing_data_df.drop(housing_data_df.index[[54,109,164,219,274]])
housing_data_cleaned.shape

(270, 11)

### Split the Data Into Training and Testing

In [7]:
# Create our features
y = housing_data_df["med_house_price"]
X = housing_data_df.drop(columns=["med_house_price","region_desc","week"]) # TBD based on factors to pull in
X.head(10) #Confirm Out has med_house_price dropped? =Yes

Unnamed: 0,region_id,sales_inventory,civilian_labor_force,employment,unemployment,unemployment_rate,rent_index,sales_count
0,394463,38581,4963614.0,4753730.0,209884.0,4.2,1610.46,9053
1,394463,42253,4960889.0,4756797.0,204092.0,4.1,1622.52,11208
2,394463,45757,4959982.0,4759532.0,200450.0,4.0,1634.7,13708
3,394463,47492,4961285.0,4762416.0,198869.0,4.0,1645.26,15524
4,394463,48984,4964193.0,4765720.0,198473.0,4.0,1651.54,14358
5,394463,49782,4969483.0,4770051.0,199432.0,4.0,1650.94,13634
6,394463,49630,4976756.0,4775171.0,201585.0,4.1,1643.64,10809
7,394463,48916,4984436.0,4779818.0,204618.0,4.1,1632.04,11088
8,394463,46495,4992719.0,4784079.0,208640.0,4.2,1623.91,9566
9,394463,42304,5000345.0,4787426.0,212919.0,4.3,1618.47,8519


In [8]:
# Import sklearn model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
# Create the scaler instance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
#Fitting the scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Determine the Model Type
Will need to evaluate the type based on accuracy and balance reports

Naive Random Oversampling

In [9]:
# Resample the training data with the RandomOversampler
# Implement Random Oversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({327233: 4,
         315929: 4,
         396400: 4,
         357633: 4,
         283333: 4,
         298998: 4,
         302133: 4,
         430000: 4,
         323300: 4,
         301132: 4,
         305800: 4,
         294467: 4,
         354967: 4,
         497667: 4,
         311667: 4,
         384850: 4,
         300833: 4,
         299633: 4,
         333333: 4,
         316583: 4,
         347047: 4,
         306300: 4,
         351633: 4,
         284333: 4,
         351667: 4,
         302467: 4,
         348633: 4,
         363000: 4,
         469560: 4,
         347048: 4,
         498300: 4,
         381267: 4,
         299300: 4,
         499633: 4,
         300600: 4,
         286000: 4,
         358000: 4,
         318000: 4,
         345000: 4,
         307948: 4,
         413423: 4,
         362665: 4,
         327633: 4,
         289316: 4,
         307667: 4,
         332933: 4,
         589667: 4,
         328133: 4,
         294667: 4,
         419817: 4,


In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=42)
# Fit the model
classifier.fit(X_resampled,y_resampled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Calculated the balanced accuracy score
y_pred = classifier.predict(X_test_scaled)

from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,y_pred))

Model 2: SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=42, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test_scaled)
balanced_accuracy_score(y_test,y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test,y_pred)

In [None]:

# Print the imbalanced classification report
print(classification_report_imbalanced(y_test,y_pred))

### Additional Tests will be summarized to find the highest accuracy