In [1]:
import pandas as pd
import numpy as np

In [2]:
file = pd.read_csv('./usa_blank.csv')
df = pd.DataFrame(file)

In [3]:
df.set_index('zip', inplace=True)


In [4]:
#assign random 'yes' outcome to the Outcome column
df['Outcome'] = np.where((df['Rent Change'] > df['Rent Change'].median()+200), 1, 0)

### Machine Learning Part

In [5]:
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [6]:
X = df.copy()
X.drop('Outcome', axis=1)
y = df['Outcome']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)
Counter(y_train)

Counter({1: 109, 0: 716})

In [8]:
scaler = StandardScaler()

In [9]:
# Scaling the data.
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
rf_model = RandomForestClassifier(n_estimators = 20, random_state = 1)

In [11]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
predictions = rf_model.predict(X_test_scaled)

In [13]:
# Create a DataFrame from the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,240,0
Actual 1,0,36


In [14]:
acc = accuracy_score(y_test, predictions)
acc

1.0

In [15]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,240,0
Actual 1,0,36


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       240
           1       1.00      1.00      1.00        36

    accuracy                           1.00       276
   macro avg       1.00      1.00      1.00       276
weighted avg       1.00      1.00      1.00       276



In [16]:
rf_model = rf_model.feature_importances_
rf_model

array([0.00914849, 0.01284033, 0.00071419, 0.00995009, 0.03176754,
       0.00969938, 0.03602121, 0.07386435, 0.41061348, 0.00180612,
       0.00166794, 0.00208296, 0.        , 0.00173015, 0.00173209,
       0.        , 0.0014145 , 0.0051249 , 0.38982227])

In [17]:
sorted(zip(rf_model, X.columns), reverse=True)

[(0.4106134757437413, 'Rent Change'),
 (0.3898222743887092, 'Outcome'),
 (0.07386434908999943, 'Median Rent 2014'),
 (0.036021213208124875, 'Median Rent 2000'),
 (0.03176754094595925, 'Median Income 2014'),
 (0.012840332606018686, 'Average Education Index 2014'),
 (0.009950094991253803, 'Median Income 2000'),
 (0.009699377773448172, 'Income Change'),
 (0.00914849154710444, 'Average Education Index 2000'),
 (0.005124901941439243, 'White Population % Change'),
 (0.002082960044443866, 'Take Public Transp % Change'),
 (0.001806118261850289, 'Take Public Transp % in 2000'),
 (0.001732086847010307, 'Total Population % Change'),
 (0.0017301548879904017, 'Total Population % in 2014'),
 (0.0016679376321252534, 'Take Public Transp % in 2014'),
 (0.0014144971582294014, 'White Population % in 2014'),
 (0.0007141929325521514, 'Index Change'),
 (0.0, 'White Population % in 2000'),
 (0.0, 'Total Population % in 2000')]

In [18]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=100,
                                random_state=1)

In [19]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)


In [21]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9601449275362319


In [22]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[234   6]
 [  5  31]]


In [23]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.97      0.98       240
           1       0.84      0.86      0.85        36

    accuracy                           0.96       276
   macro avg       0.91      0.92      0.91       276
weighted avg       0.96      0.96      0.96       276

