In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv('../Resources/final_data.csv')
df.head()

Unnamed: 0,county,year,violent_crime,murder,rape,robbery,aggravated_assault,property_crime,burglary,larceny_theft,...,frm_15,points_15,median_hh_income,median_hh_inc_moe,poverty_count,poverty_count_moe,poverty_rate,poverty_rate_moe,county_fips,price
0,Atlantic,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,62678,2822,29057,4251,1.6,11.3,1,196067.42
1,Bergen,2019,2,0,2,0,0,46,2,44,...,3.391731,0.475,107971,3025,52980,7662,0.8,5.7,3,494018.42
2,Burlington,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,88443,3233,24961,4374,1.0,5.7,5,238593.67
3,Camden,2019,2,0,0,0,2,44,4,38,...,3.391731,0.475,73168,2374,53641,7048,1.4,10.7,7,181980.75
4,Cape May,2019,0,0,0,0,0,0,0,0,...,3.391731,0.475,66565,4753,8853,1981,2.2,9.8,9,389294.58


In [3]:
df.dropna(how='any', inplace = True)

In [None]:
df.dtypes

In [4]:
county_list = df['county'].unique().tolist()
county_list

['Atlantic',
 'Bergen',
 'Burlington',
 'Camden',
 'Cape May',
 'Cumberland',
 'Essex',
 'Gloucester',
 'Hudson',
 'Hunterdon',
 'Mercer',
 'Middlesex',
 'Monmouth',
 'Morris',
 'Ocean',
 'Passaic',
 'Salem',
 'Somerset',
 'Sussex',
 'Union',
 'Warren']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

In [None]:
county_name

In [6]:
for i in range(len(county_list)):
    county_name = county_list[i]
    df_filtered = df[df['county']==county_list[i]]
    X = df_filtered.drop(["price", 'county', 'county_fips'], axis=1)
    y = df_filtered["price"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    lin_reg_model = LinearRegression()
    lin_reg_model.fit(X, y)
    filename = f'../Models/NJ_lin_reg_{county_name}.sav'
    joblib.dump(lin_reg_model, filename)

In [None]:
df = df[df['county']=='Atlantic']
df.head()

In [None]:
df.describe()

In [None]:
# df["county"] = df["county"].astype('category')
# df["county_cat"] = df["county"].cat.codes
# df.head()

# linear regression without scaling

In [None]:
X = df.drop(["price", 'county', 'county_fips'], axis=1)
y = df["price"]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.linear_model import LinearRegression
lin_reg_model = LinearRegression()

In [None]:
lin_reg_model.fit(X, y)
print(lin_reg_model)

In [None]:
print('Weight coefficients: ', lin_reg_model.coef_)
print('y-axis intercept: ', lin_reg_model.intercept_) 

In [None]:
X.columns

In [None]:
print(f"Training Data Score: {lin_reg_model.score(X_train, y_train)}")
print(f"Testing Data Score: {lin_reg_model.score(X_test, y_test)}")

In [None]:
predictions = lin_reg_model.predict(X_test)
print(f"Predicted Labels: {predictions[:5]}")
print(f"Actual Labels: {list(y_test[:5])}")

In [None]:
best_array = pd.DataFrame({'year':2021,'violent_crime':df['violent_crime'].min(), 'murder':df['murder'].min(), 'rape':df['rape'].min(),
            'robbery':df['robbery'].min(), 'aggravated_assault':df['aggravated_assault'].min(), 'property_crime': df['property_crime'].min(),
            'burglary':df['burglary'].min(), 'larceny_theft':df['larceny_theft'].min(), 'motor_vehicle_theft':df['motor_vehicle_theft'].min(),
            'arson':df['arson'].min(), 'frm_30':df['frm_30'].min(), 'points_30': df['points_30'].min(),
           'frm_15':df['frm_15'].min(), 'points_15':df['points_15'].min(), 'median_hh_income': df['median_hh_income'].max(),
            'median_hh_inc_moe': df['median_hh_inc_moe'].min(),               
           'poverty_count': df['poverty_count'].min(),'poverty_count_moe': df['poverty_count_moe'].min(), 'poverty_rate':df['poverty_rate'].min(),
            'poverty_rate_moe':df['poverty_rate_moe'].min()},[0])

In [None]:
b_predictions = lin_reg_model.predict(best_array)
predictions

In [None]:
import joblib
filename = '../Models/NJ_lin_reg_Atlantic.sav'
joblib.dump(lin_reg_model, filename)

# linear regression with scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

X = df.drop(["price", 'county'], axis=1)
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# print(X.shape, y.shape)
# YOUR CODE HERE
# y_train.values.reshape(-1,1)
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1,1))

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1,1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1,1))

In [None]:
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)
plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
# YOUR CODE HERE
predictions = model.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
from sklearn.linear_model import Lasso
reg_lasso = Lasso(alpha=.01)
reg_lasso.fit(X_train_scaled, y_train_scaled)
# YOUR CODE HERE
predictions_lasso = reg_lasso.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions_lasso)
r2 = model.score(X_test_scaled, y_test_scaled)
# YOUR CODE HERE

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
from sklearn.linear_model import Ridge
reg_ridge = Ridge(alpha=.01)
reg_ridge.fit(X_train_scaled, y_train_scaled)
# YOUR CODE HERE
predictions_ridge = reg_ridge.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions_ridge)
r2 = model.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

In [None]:
from sklearn.linear_model import ElasticNet
reg_elasticnet = ElasticNet(alpha=.01)
reg_elasticnet.fit(X_train_scaled, y_train_scaled)
# YOUR CODE HERE
predictions_elasticnet = reg_elasticnet.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions_elasticnet)
r2 = model.score(X_test_scaled, y_test_scaled)
# YOUR CODE HERE

print(f"MSE: {MSE}, R2: {r2}")

# Observation
* It seems like our accuracy drops with scaling the data 