In [1]:
pip install m2cgen

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
import m2cgen as m2c 
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the per-ZIP load/demographics table used by the rest of the notebook.
# The path is relative, so it resolves against the notebook's working directory.
joined_zips_grb_df = pd.read_csv("app_development/zips_load_demographics.csv")

In [4]:
# Keep only the rows in which every column has a value, then preview the result.
# Note that the surviving rows keep their original index labels.
joined_zips_grb_df = joined_zips_grb_df.dropna(axis=0, how="any")
joined_zips_grb_df.head(5)


Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0


In [5]:
len(joined_zips_grb_df)

37

In [6]:
# Derived feature: residents per housing unit for each row.
joined_zips_grb_df["POPULATION_PER_UNIT"] = joined_zips_grb_df["TOTAL_POPULATION"].div(
    joined_zips_grb_df["TOTAL_HOUSING_UNITS"]
)
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778


In [7]:
# Target quantity: collected load weight divided by the population of the row.
joined_zips_grb_df["LOAD_PER_CAPITA"] = joined_zips_grb_df["load_weight"].div(
    joined_zips_grb_df["TOTAL_POPULATION"]
)
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455


In [8]:
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455


In [9]:
# Demographic columns used as model inputs (this order is the model's feature order).
features = ['TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'MEDIAN_HOUSEHOLD_INCOME', 'TOTAL_HOUSING_UNITS']
x = joined_zips_grb_df[features]

# Standardize every feature column; `ss` is kept so the same scaling can be
# applied to new inputs later.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# performed further down, so test rows influence the scaling statistics.
ss = StandardScaler()
ss.fit(x)
X = ss.transform(x)

In [10]:
def binned_trash(number_of_bins, dataSeries):
    """Assign each value of ``dataSeries`` to one of ``number_of_bins`` equal-width bins.

    The interval [min, max] of the series is cut into ``number_of_bins``
    closed intervals of equal width, numbered from 0 upwards.

    Parameters
    ----------
    number_of_bins : int
        How many equal-width bins to create.
    dataSeries : pandas.Series
        Numeric values to bin.

    Returns
    -------
    tuple of (pandas.Series, dict)
        * A Series of bin numbers with a fresh 0..k-1 index (NOT the index of
          ``dataSeries``). It holds exactly one entry per value that falls in
          a bin; values that compare false against every bin (e.g. NaN) are
          left out, so callers must track positions themselves.
        * A dict mapping bin number -> ``[lower_bound, upper_bound]``.

    Notes
    -----
    A value lying exactly on the edge shared by two bins is assigned to the
    lower-numbered bin only. Without that rule such a value would be emitted
    once per matching bin, making the output longer than the input and
    shifting every later label.
    """
    min_value = dataSeries.min()
    max_value = dataSeries.max()
    span = max_value - min_value

    bins = {}
    for bin_number in range(number_of_bins):
        lower = min_value + (float(bin_number) / float(number_of_bins)) * span
        upper = min_value + (float(bin_number + 1) / float(number_of_bins)) * span
        bins[bin_number] = [lower, upper]

    if bins:
        # min + 1.0 * (max - min) can round to just below max; pin the last
        # edge to the true maximum so the largest value is never dropped.
        bins[number_of_bins - 1][1] = max_value

    out = []
    for value in dataSeries:
        for bin_number, (lower, upper) in bins.items():
            if lower <= value <= upper:
                out.append(bin_number)
                # First match wins: never label the same value twice.
                break

    return pd.Series(out), bins

In [11]:
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455


In [12]:
# Cut LOAD_PER_CAPITA into 3 equal-width bins (numbered 0, 1, 2); `bins` holds
# the [lower, upper] bounds of each bin.
binned_load_per_capita, bins = binned_trash(3, joined_zips_grb_df['LOAD_PER_CAPITA'])

In [13]:
binned_load_per_capita.size

37

In [14]:
binned_load_per_capita

0     0
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     1
9     1
10    0
11    2
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    0
20    2
21    0
22    1
23    1
24    0
25    1
26    1
27    1
28    0
29    0
30    0
31    0
32    1
33    1
34    1
35    0
36    1
dtype: int64

In [15]:
bins

{0: [33.47234643445517, 143.48823805133247],
 1: [143.48823805133247, 253.5041296682098],
 2: [253.5041296682098, 363.5200212850871]}

In [16]:
# Wrap the standardized feature array in a DataFrame so the rows carry
# index labels through the train/test split.
X = pd.DataFrame(data=X)

In [17]:
X

Unnamed: 0,0,1,2,3
0,0.03907,-0.671284,-0.69958,-0.658071
1,-0.439705,-0.702146,-0.190288,-0.712127
2,-1.315476,-0.944613,1.471007,-0.760363
3,-0.300767,-0.274505,-0.654474,-0.260696
4,-0.442005,-0.145474,0.972133,-0.075713
5,1.341734,2.069068,-0.208199,2.18573
6,0.389121,-0.367585,-2.070681,-0.184589
7,-1.006213,-1.052631,-1.109266,-1.05805
8,-1.357482,-1.230911,-0.098357,-1.244411
9,0.395547,0.313358,-0.817025,0.342335


In [18]:
# Hold out part of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    binned_load_per_capita,
    random_state=1,
)

In [19]:
# Random forest classifier model begins here; the fixed seed makes the fitted
# forest repeatable between runs.
rf = RandomForestClassifier(random_state=1)

In [20]:
# Fit the forest on the training split only (standardized features -> bin number).
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [21]:
# Generate JavaScript source code for the fitted forest.
# NOTE(review): the forest was trained on standardized features, so whatever
# calls the exported code presumably has to apply the same scaling first — confirm.
model_to_javascript = m2c.export_to_javascript(rf)

In [22]:
# Save the generated JavaScript source to model.js in the working directory.
with open("./model.js", mode="w") as js_file:
    js_file.write(model_to_javascript)

In [23]:
print(y_train)

30    0
17    1
28    0
34    1
31    0
26    1
4     1
14    0
10    0
33    1
23    1
32    1
20    2
18    0
6     0
13    0
7     0
36    1
1     0
16    0
0     0
15    0
5     0
11    2
9     1
8     1
12    0
dtype: int64


In [24]:
X_test

Unnamed: 0,0,1,2,3
2,-1.315476,-0.944613,1.471007,-0.760363
29,-0.836971,-0.516479,-0.651696,-0.445525
3,-0.300767,-0.274505,-0.654474,-0.260696
22,1.427911,0.613282,-0.909613,0.565754
25,-0.430776,-0.696893,0.275689,-0.748419
27,0.685261,0.756102,0.744042,0.640941
21,1.690026,1.666051,-1.202914,1.812855
35,1.305207,1.442626,-0.801965,1.458203
19,-1.293559,-1.245357,1.297966,-1.261102
24,0.063963,0.039208,2.345976,0.058277


In [25]:
# Predicted bin numbers for the held-out rows.
y_pred = rf.predict(X_test)

In [26]:
print(y_pred)

[1 0 0 0 0 1 0 0 1 1]


In [27]:
# Fraction of held-out rows whose predicted bin equals the actual bin.
score = accuracy_score(y_test, y_pred)

In [28]:
print(score)

0.4


In [29]:
print(y_test)

2     0
29    0
3     1
22    1
25    1
27    1
21    0
35    0
19    0
24    0
dtype: int64


In [30]:
X_test

Unnamed: 0,0,1,2,3
2,-1.315476,-0.944613,1.471007,-0.760363
29,-0.836971,-0.516479,-0.651696,-0.445525
3,-0.300767,-0.274505,-0.654474,-0.260696
22,1.427911,0.613282,-0.909613,0.565754
25,-0.430776,-0.696893,0.275689,-0.748419
27,0.685261,0.756102,0.744042,0.640941
21,1.690026,1.666051,-1.202914,1.812855
35,1.305207,1.442626,-0.801965,1.458203
19,-1.293559,-1.245357,1.297966,-1.261102
24,0.063963,0.039208,2.345976,0.058277


In [31]:
# Predict a bin for every row of X (training and held-out rows alike).
all_pred = rf.predict(X)

In [32]:
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455


In [33]:
# Human-readable name for each bin number.
bin_names = {0: "LOW", 1: "MEDIUM", 2: "HIGH"}

joined_zips_grb_df["PREDICTED_BIN"] = all_pred
joined_zips_grb_df["ACTUAL_BIN"] = float("nan")
# Start from the model's label for every row; rows with a measured
# LOAD_PER_CAPITA are overwritten with their actual bin name below. Building
# the column from strings avoids writing text into an integer-typed column.
joined_zips_grb_df["FINAL_OUTPUT"] = [bin_names[pred] for pred in all_pred]

# Address the target columns by name instead of hard-coded positions.
actual_bin_col = joined_zips_grb_df.columns.get_loc("ACTUAL_BIN")
final_output_col = joined_zips_grb_df.columns.get_loc("FINAL_OUTPUT")

# binned_load_per_capita has one entry per non-null LOAD_PER_CAPITA value and
# its own 0..k-1 index, so it needs a separate running counter.
my_index = 0
# Iterate by row POSITION: after dropna() the frame's index labels need not be
# contiguous, so they cannot be used with positional indexing.
for position, load_per_capita in enumerate(joined_zips_grb_df["LOAD_PER_CAPITA"]):
    if pd.notnull(load_per_capita):
        print(load_per_capita)
        actual_bin = binned_load_per_capita.iloc[my_index]
        joined_zips_grb_df.iat[position, actual_bin_col] = actual_bin
        joined_zips_grb_df.iat[position, final_output_col] = bin_names[actual_bin]
        my_index += 1
    else:
        # No measured load: FINAL_OUTPUT keeps the predicted label set above.
        print("NA")

47.51254480286738
39.641725098453556
58.21714285714286
163.54782304598706
176.61285356936693
109.33924254633361
48.57324263038549
110.4491042018157
202.8232699200441
180.42345492915285
45.90424032537349
363.5200212850871
109.20465434633813
83.52081106245043
33.47234643445517
79.41475844261738
77.65364379267884
147.13739206470652
113.96841341592723
85.71777045981217
316.55985340002974
50.21431166000726
162.46737694086553
178.1340405014465
71.56360549030705
205.04915529254558
177.6826339994302
200.26806931353963
100.55075102412381
124.75884674560172
120.60486651561422
86.59823147564445
181.1714309665521
156.60737734272413
174.43451515025563
135.63323179887598
162.60663846870744


In [34]:
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA,PREDICTED_BIN,ACTUAL_BIN,FINAL_OUTPUT
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545,0,0.0,LOW
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725,0,0.0,LOW
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143,1,0.0,LOW
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823,0,1.0,MEDIUM
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854,1,1.0,MEDIUM
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243,0,0.0,LOW
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243,0,0.0,LOW
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104,0,0.0,LOW
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327,1,1.0,MEDIUM
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455,1,1.0,MEDIUM


In [35]:
# One hand-written sample row with four raw (unscaled) values.
# NOTE(review): presumably ordered like `features` (TOTAL_POPULATION,
# TOTAL_HOUSEHOLDS, MEDIAN_HOUSEHOLD_INCOME, TOTAL_HOUSING_UNITS) — confirm.
x_dashboard = [[50000, 300000, 80000, 81500]]

In [36]:
# The forest was fit on standardized features, so raw dashboard inputs must go
# through the same fitted scaler (`ss`) before prediction; feeding raw values
# puts every feature far outside the range the trees were trained on.
x_dashboard_scaled = ss.transform(pd.DataFrame(x_dashboard, columns=features))
y_pred_dashboard = rf.predict(x_dashboard_scaled)

In [37]:
y_pred_dashboard

array([1])