In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
import m2cgen as m2c 
import warnings                        # To ignore any warnings
warnings.filterwarnings("ignore")

In [2]:
joined_zips_grb_df = pd.read_csv("zips_load_demographics.csv")

In [3]:
all_zips_df = pd.read_csv("full_zip_codes.csv")

In [4]:
all_zips_df.head()

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78701,7875,4855,112201,173289,6576
1,78702,22876,8937,54053,73919,9839
2,78703,20788,9723,98553,166396,11047
3,78704,47158,23213,66262,91223,25815
4,78705,33075,8370,15309,41357,10336


In [5]:
len(all_zips_df)

45

In [6]:
len(joined_zips_grb_df)

37

In [7]:
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0


In [8]:
joined_zips_grb_df = joined_zips_grb_df.dropna()
joined_zips_grb_df.head()


Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0


In [9]:
len(joined_zips_grb_df)

37

In [10]:
joined_zips_grb_df["POPULATION_PER_UNIT"] = joined_zips_grb_df.TOTAL_POPULATION/joined_zips_grb_df.TOTAL_HOUSING_UNITS
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778


In [11]:
joined_zips_grb_df["LOAD_PER_CAPITA"] = joined_zips_grb_df.load_weight/joined_zips_grb_df.TOTAL_POPULATION
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854


In [12]:
#features = ['TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'MEDIAN_HOUSEHOLD_INCOME', 'TOTAL_HOUSING_UNITS', 'POPULATION_PER_UNIT']
features = ['TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'MEDIAN_HOUSEHOLD_INCOME', 'TOTAL_HOUSING_UNITS']
# Separating out the features
x = joined_zips_grb_df.loc[:, features]
# Separating out the target
#y = joined_zips_grb_df.loc[:,['load_weight']]
# Standardizing the features
ss = StandardScaler().fit(x)
X = ss.transform(x)

In [13]:
# Apply principal component analysis
#pca = PCA(n_components=3)
#pca.fit(X)
#principalComponents = pca.fit(x)
#PCA_df = pd.DataFrame(data = principalComponents, columns = ['PRINCIPAL_COMPONENT_1', 'PRINCIPAL_COMPONENT_2'])

In [14]:
#pca.components_

In [15]:
#pd.DataFrame(pca.components_, columns=features)

In [16]:
#X_reduced = pca.transform(X)

In [17]:
#X_reduced

In [18]:
#knn = KNeighborsRegressor(n_neighbors=3, weights= 'uniform')

In [19]:
#all_zips_df["POPULATION_PER_UNIT"] = all_zips_df.TOTAL_POPULATION/all_zips_df.TOTAL_HOUSING_UNITS
#all_zips_df.head()

In [20]:
#T = all_zips_df.loc[:, features]

In [21]:
#T_reduced = pca.transform(ss.transform(T))

In [22]:
# Split data into training and testing sets
#X_train,X_test,y_train,y_test = train_test_split(X_reduced,joined_zips_grb_df["LOAD_PER_CAPITA"],random_state=1)

In [23]:
# Use K-nearest-neighbors method to fit data
#model = knn.fit(X_train, y_train)

In [24]:
#y_pred = model.predict(X_test)

In [25]:
#score = mean_absolute_error(y_pred,y_test)

In [26]:
#print(score)

In [27]:
#percent_error = ((abs(y_pred - y_test)/abs(y_test)).mean())*100.

In [28]:
#print(percent_error)

In [29]:
#pca.explained_variance_ratio_

In [30]:
#all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = model.predict(T_reduced)

In [31]:
#all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'].astype(int)

In [32]:
#all_zips_df["ACTUAL_TRASH_OUTPUT_PER_CAPITA"] = joined_zips_grb_df['LOAD_PER_CAPITA']

In [33]:
#all_zips_df

In [34]:
# Drop outliers
#all_zips_df = all_zips_df.drop([9,23,24,28])

In [35]:
#all_zips_df['DIFFERENCE'] = abs(all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] - all_zips_df['ACTUAL_TRASH_OUTPUT_PER_CAPITA'])

KeyError: 'PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'

In [65]:
#all_zips_df

In [36]:
def binned_trash(number_of_bins, dataSeries):
    bins = {}
    out = []
    min_value = dataSeries.min()
    max_value = dataSeries.max()
    mean_value = dataSeries.mean()

    for bin in range(0,number_of_bins):
        b_float_lower = float(bin)/float(number_of_bins)
        b_float_upper = float(bin+1.)/float(number_of_bins)
        bins[bin] = [min_value + b_float_lower * (max_value - min_value), min_value + b_float_upper*(max_value - min_value)]
    

    for row in dataSeries:
        for bin, bin_range in bins.items():
            if row <= bin_range[1] and row >= bin_range[0]:
                out.append(bin)

    return pd.Series(out), bins

In [40]:
joined_zips_grb_df

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
1,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
2,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
3,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
4,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854
5,78704,5156220.0,47158.0,23213.0,66262.0,91223.0,25815.0,1.826767,109.339243
6,78705,1606560.0,33075.0,8370.0,15309.0,41357.0,10336.0,3.199981,48.573243
7,78721,1374760.0,12447.0,4197.0,41611.0,52873.0,4632.0,2.687176,110.449104
8,78722,1471280.0,7254.0,3111.0,69267.0,95443.0,3415.0,2.124158,202.82327
9,78723,5984646.0,33170.0,12518.0,49606.0,71897.0,13777.0,2.407636,180.423455


In [41]:
binned_load_per_capita, bins = binned_trash(3, joined_zips_grb_df['LOAD_PER_CAPITA'])

In [42]:
binned_load_per_capita.size

37

In [40]:
binned_load_per_capita

0     0
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     1
9     1
10    0
11    2
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    0
20    2
21    0
22    1
23    1
24    0
25    1
26    1
27    1
28    0
29    0
30    0
31    0
32    1
33    1
34    1
35    0
36    1
dtype: int64

In [43]:
bins

{0: [33.47234643445517, 143.48823805133247],
 1: [143.48823805133247, 253.5041296682098],
 2: [253.5041296682098, 363.5200212850871]}

In [44]:
#X = pd.DataFrame(X_reduced)
X = pd.DataFrame(X)

In [45]:
X.head()

Unnamed: 0,0,1,2,3
0,0.03907,-0.671284,-0.69958,-0.658071
1,-0.439705,-0.702146,-0.190288,-0.712127
2,-1.315476,-0.944613,1.471007,-0.760363
3,-0.300767,-0.274505,-0.654474,-0.260696
4,-0.442005,-0.145474,0.972133,-0.075713


In [46]:
# Split data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X, binned_load_per_capita,random_state=1)

In [47]:
#RANDOM FORREST CLASSIFIER MODEL BEGINS
rf = RandomForestClassifier(random_state=1)

In [48]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [49]:
model_to_javascript = m2c.export_to_javascript(rf)

In [50]:
model_to_javascript

                   } else {\n                            if ((input[3]) <= (0.9757629483938217)) {\n                                var85 = [1.0, 0.0, 0.0];\n                            } else {\n                                if ((input[0]) <= (1.1621085405349731)) {\n                                    var85 = [0.0, 1.0, 0.0];\n                                } else {\n                                    var85 = [1.0, 0.0, 0.0];\n                                }\n                            }\n                        }\n                    }\n                } else {\n                    var85 = [0.0, 1.0, 0.0];\n                }\n            } else {\n                var85 = [0.0, 0.0, 1.0];\n            }\n        }\n    }\n    var var86;\n    if ((input[0]) <= (0.3923342525959015)) {\n        if ((input[0]) <= (-1.2627143263816833)) {\n            if ((input[2]) <= (-0.22802868857979774)) {\n                var86 = [0.0, 0.0, 1.0];\n            } else {\n                var86 =

In [51]:
print(y_train)

30    0
17    1
28    0
34    1
31    0
26    1
4     1
14    0
10    0
33    1
23    1
32    1
20    2
18    0
6     0
13    0
7     0
36    1
1     0
16    0
0     0
15    0
5     0
11    2
9     1
8     1
12    0
dtype: int64


In [52]:
X_test

Unnamed: 0,0,1,2,3
2,-1.315476,-0.944613,1.471007,-0.760363
29,-0.836971,-0.516479,-0.651696,-0.445525
3,-0.300767,-0.274505,-0.654474,-0.260696
22,1.427911,0.613282,-0.909613,0.565754
25,-0.430776,-0.696893,0.275689,-0.748419
27,0.685261,0.756102,0.744042,0.640941
21,1.690026,1.666051,-1.202914,1.812855
35,1.305207,1.442626,-0.801965,1.458203
19,-1.293559,-1.245357,1.297966,-1.261102
24,0.063963,0.039208,2.345976,0.058277


In [53]:
y_pred = rf.predict(X_test)

In [54]:
print(y_pred)

[1 0 0 0 0 1 0 0 1 1]


In [55]:
score = accuracy_score(y_test, y_pred)

In [56]:
print(score)

0.4


In [57]:
print(y_test)

2     0
29    0
3     1
22    1
25    1
27    1
21    0
35    0
19    0
24    0
dtype: int64


In [61]:
X_test

Unnamed: 0,0,1,2,3
2,-1.315476,-0.944613,1.471007,-0.760363
29,-0.836971,-0.516479,-0.651696,-0.445525
3,-0.300767,-0.274505,-0.654474,-0.260696
22,1.427911,0.613282,-0.909613,0.565754
25,-0.430776,-0.696893,0.275689,-0.748419
27,0.685261,0.756102,0.744042,0.640941
21,1.690026,1.666051,-1.202914,1.812855
35,1.305207,1.442626,-0.801965,1.458203
19,-1.293559,-1.245357,1.297966,-1.261102
24,0.063963,0.039208,2.345976,0.058277


In [62]:
#T_reduced

NameError: name 'T_reduced' is not defined

In [63]:
all_pred = rf.predict(T_reduced)

NameError: name 'T_reduced' is not defined

In [57]:
all_zips_df["PREDICTED_BIN"] = all_pred
#print(len(all_zips_df))
all_zips_df["ACTUAL_BIN"] = [float("nan")]*len(all_zips_df)
all_zips_df["FINAL_OUTPUT"] = all_pred
bin_names = {0: "LOW", 1: "MEDIUM", 2: "HIGH"}
len(binned_load_per_capita)
print(binned_load_per_capita)
# Index tracking non Nan rows because binned_load_per_capita removes them from indexing
my_index = 0
for index,row in all_zips_df.iterrows():
    if not pd.isna(row["ACTUAL_TRASH_OUTPUT_PER_CAPITA"]):
        print(row["ACTUAL_TRASH_OUTPUT_PER_CAPITA"])
        # Goes to column 11 and assigns bin number
        all_zips_df.iloc[[index],[11]] = binned_load_per_capita[my_index]

        all_zips_df.iloc[[index],[12]] = bin_names[binned_load_per_capita[my_index]]
        my_index += 1
    else:
        all_zips_df.iloc[[index],[12]] = bin_names[all_pred[index]]

        print("NA")

0     0
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     1
9     1
10    0
11    2
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    0
20    2
21    0
22    1
23    1
24    0
25    1
26    1
27    1
28    0
29    0
30    0
31    0
32    1
33    1
34    1
35    0
36    1
dtype: int64
47.51254480286738
39.641725098453556
58.21714285714286
163.54782304598706
176.61285356936693
109.33924254633361
48.57324263038549
110.4491042018157
202.8232699200441
180.42345492915285
45.90424032537349
363.5200212850871
109.20465434633813
83.52081106245043
33.47234643445517
79.41475844261738
77.65364379267884
147.13739206470652
113.96841341592723
85.71777045981217
316.55985340002974
50.21431166000726
162.46737694086553
178.1340405014465
71.56360549030705
205.04915529254558
177.6826339994302
200.26806931353963
100.55075102412381
124.75884674560172
120.60486651561422
86.59823147564445
181.1714309665521
156.60737734272413
174.43451515025563
135.63323179887598
162.60663846870744
NA
NA


In [64]:
#all_zips_df

In [72]:
x_dashboard = [[50000, 300000, 80000, 81500]]

In [73]:
y_pred_dashboard = rf.predict(x_dashboard)

In [74]:
y_pred_dashboard

array([1])