In [99]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor

In [100]:
joined_zips_grb_df = pd.read_csv("zips_load_demographics.csv")
all_zips_df = pd.read_csv("full_zip_codes.csv")

In [101]:
all_zips_df.head()

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78701,7875,4855,112201,173289,6576
1,78702,22876,8937,54053,73919,9839
2,78703,20788,9723,98553,166396,11047
3,78704,47158,23213,66262,91223,25815
4,78705,33075,8370,15309,41357,10336


In [102]:
len(all_zips_df)

45

In [103]:
len(joined_zips_grb_df)

40

In [104]:
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
1,78652,908280.0,,,,,
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0


In [105]:
joined_zips_grb_df = joined_zips_grb_df.dropna()
joined_zips_grb_df.head()


Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0


In [106]:
len(joined_zips_grb_df)

37

In [107]:
joined_zips_grb_df["POPULATION_PER_UNIT"] = joined_zips_grb_df.TOTAL_POPULATION/joined_zips_grb_df.TOTAL_HOUSING_UNITS
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778


In [108]:
joined_zips_grb_df["LOAD_PER_CAPITA"] = joined_zips_grb_df.load_weight/joined_zips_grb_df.TOTAL_POPULATION
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854


In [109]:
features = ['TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'MEDIAN_HOUSEHOLD_INCOME', 'TOTAL_HOUSING_UNITS', 'POPULATION_PER_UNIT']
# Separating out the features
x = joined_zips_grb_df.loc[:, features]
# Separating out the target
#y = joined_zips_grb_df.loc[:,['load_weight']]
# Standardizing the features
ss = StandardScaler().fit(x)
X = ss.transform(x)

In [110]:
# Apply principal component analysis
pca = PCA(n_components=3)
pca.fit(X)
#principalComponents = pca.fit(x)
#PCA_df = pd.DataFrame(data = principalComponents, columns = ['PRINCIPAL_COMPONENT_1', 'PRINCIPAL_COMPONENT_2'])

PCA(n_components=3)

In [111]:
pca.components_

array([[ 0.55593522,  0.56556387, -0.21785227,  0.56580689, -0.05896181],
       [ 0.11745993, -0.1295423 , -0.60106462, -0.13735661,  0.76764343],
       [ 0.22954666,  0.08363969,  0.76852846,  0.04817768,  0.58936886]])

In [112]:
pd.DataFrame(pca.components_, columns=features)

Unnamed: 0,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,0.555935,0.565564,-0.217852,0.565807,-0.058962
1,0.11746,-0.129542,-0.601065,-0.137357,0.767643
2,0.229547,0.08364,0.768528,0.048178,0.589369


In [113]:
X_reduced = pca.transform(X)

In [114]:
X_reduced

array([[-0.73158994,  2.60376743,  0.92002373],
       [-1.06675618,  1.08120218,  0.29680464],
       [-1.88215531, -2.55756196, -0.62736463],
       [-0.31556893,  0.27562414, -0.72562083],
       [-0.52273962, -1.38660683,  0.03127252],
       [ 3.26401178, -1.14280248, -0.23185058],
       [ 0.27202382,  2.44481355, -0.71134065],
       [-1.53917529,  1.18774575, -0.9480121 ],
       [-2.09990307, -0.20735542, -0.88592741],
       [ 0.77166421,  0.41275853, -0.52294738],
       [-0.8242857 ,  2.51480006,  0.46202719],
       [-2.24036205,  1.06590475, -0.26114191],
       [-1.55819984, -0.23382654,  0.15401188],
       [ 0.51458134, -0.66143755, -0.25112066],
       [-0.18435257, -0.2710019 , -1.03964945],
       [ 0.46895903, -0.48286348, -0.19207015],
       [-2.25688392, -0.94712793,  0.36834957],
       [ 0.25963362, -1.01854877,  0.09387302],
       [-1.17971632, -0.94553709,  0.37902506],
       [-2.4247659 , -0.53250698,  0.58561502],
       [-1.78073519, -0.54804291,  2.671

In [115]:
knn = KNeighborsRegressor(n_neighbors=3, weights= 'uniform')

In [116]:
all_zips_df["POPULATION_PER_UNIT"] = all_zips_df.TOTAL_POPULATION/all_zips_df.TOTAL_HOUSING_UNITS
all_zips_df.head()

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78701,7875,4855,112201,173289,6576,1.197536
1,78702,22876,8937,54053,73919,9839,2.325033
2,78703,20788,9723,98553,166396,11047,1.881778
3,78704,47158,23213,66262,91223,25815,1.826767
4,78705,33075,8370,15309,41357,10336,3.199981


In [117]:
T = all_zips_df.loc[:, features]

In [118]:
T_reduced = pca.transform(ss.transform(T))

In [119]:
# Split data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X_reduced,joined_zips_grb_df["LOAD_PER_CAPITA"],random_state=1)

In [120]:
# Use K-nearest-neighbors method to fit data
model = knn.fit(X_train, y_train)

In [121]:
y_pred = model.predict(X_test)

In [122]:
score = mean_absolute_error(y_pred,y_test)

In [123]:
print(score)

50.6587630539775


In [124]:
percent_error = ((abs(y_pred - y_test)/abs(y_test)).mean())*100.

In [125]:
print(percent_error)

54.56697927876088


In [126]:
pca.explained_variance_ratio_

array([0.60654758, 0.25745225, 0.13239989])

In [127]:
all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = model.predict(T_reduced)

In [128]:
all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'].astype(int)

In [129]:
all_zips_df["ACTUAL_TRASH_OUTPUT_PER_CAPITA"] = joined_zips_grb_df['LOAD_PER_CAPITA']

In [130]:
all_zips_df

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS),ACTUAL_TRASH_OUTPUT_PER_CAPITA
0,78701,7875,4855,112201,173289,6576,1.197536,136,47.512545
1,78702,22876,8937,54053,73919,9839,2.325033,109,
2,78703,20788,9723,98553,166396,11047,1.881778,145,39.641725
3,78704,47158,23213,66262,91223,25815,1.826767,150,58.217143
4,78705,33075,8370,15309,41357,10336,3.199981,47,163.547823
5,78721,12447,4197,41611,52873,4632,2.687176,198,176.612854
6,78722,7254,3111,69267,95443,3415,2.124158,156,109.339243
7,78723,33170,12518,49606,71897,13777,2.407636,144,48.573243
8,78724,24833,6360,45158,55252,6754,3.676784,44,
9,78725,7517,2544,62172,66225,2598,2.89338,171,


In [131]:
# Drop outliers
#all_zips_df = all_zips_df.drop([9,23,24,28])

In [132]:
all_zips_df['DIFFERENCE'] = abs(all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] - all_zips_df['ACTUAL_TRASH_OUTPUT_PER_CAPITA'])

In [133]:
all_zips_df

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS),ACTUAL_TRASH_OUTPUT_PER_CAPITA,DIFFERENCE
0,78701,7875,4855,112201,173289,6576,1.197536,136,47.512545,88.487455
1,78702,22876,8937,54053,73919,9839,2.325033,109,,
2,78703,20788,9723,98553,166396,11047,1.881778,145,39.641725,105.358275
3,78704,47158,23213,66262,91223,25815,1.826767,150,58.217143,91.782857
4,78705,33075,8370,15309,41357,10336,3.199981,47,163.547823,116.547823
5,78721,12447,4197,41611,52873,4632,2.687176,198,176.612854,21.387146
6,78722,7254,3111,69267,95443,3415,2.124158,156,109.339243,46.660757
7,78723,33170,12518,49606,71897,13777,2.407636,144,48.573243,95.426757
8,78724,24833,6360,45158,55252,6754,3.676784,44,,
9,78725,7517,2544,62172,66225,2598,2.89338,171,,


In [134]:
def binned_trash(number_of_bins, dataSeries):
    bins = {}
    out = []
    min_value = dataSeries.min()
    max_value = dataSeries.max()
    mean_value = dataSeries.mean()

    for bin in range(0,number_of_bins):
        b_float_lower = float(bin)/float(number_of_bins)
        b_float_upper = float(bin+1.)/float(number_of_bins)
        bins[bin] = [min_value + b_float_lower * (max_value - min_value), min_value + b_float_upper*(max_value - min_value)]
    

    for row in dataSeries:
        for bin, bin_range in bins.items():
            if row <= bin_range[1] and row >= bin_range[0]:
                out.append(bin)

    return pd.Series(out), bins

In [135]:
binned_load_per_capita, bins = binned_trash(3, all_zips_df['ACTUAL_TRASH_OUTPUT_PER_CAPITA'])

In [136]:
binned_load_per_capita.size

37

In [137]:
binned_load_per_capita

0     0
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     1
9     1
10    0
11    2
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    0
20    2
21    0
22    1
23    1
24    0
25    1
26    1
27    1
28    0
29    0
30    0
31    0
32    1
33    1
34    1
35    0
36    1
dtype: int64

In [138]:
bins

{0: [33.47234643445517, 143.48823805133247],
 1: [143.48823805133247, 253.5041296682098],
 2: [253.5041296682098, 363.5200212850871]}

In [139]:
X = pd.DataFrame(X_reduced)

In [140]:
X.head()

Unnamed: 0,0,1,2
0,-0.73159,2.603767,0.920024
1,-1.066756,1.081202,0.296805
2,-1.882155,-2.557562,-0.627365
3,-0.315569,0.275624,-0.725621
4,-0.52274,-1.386607,0.031273


In [153]:
# Split data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X, binned_load_per_capita,random_state=1)

In [142]:
rf = RandomForestClassifier(random_state=1)

In [143]:
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [144]:
print(y_train)

30    0
17    1
28    0
34    1
31    0
26    1
4     1
14    0
10    0
33    1
23    1
32    1
20    2
18    0
6     0
13    0
7     0
36    1
1     0
16    0
0     0
15    0
5     0
11    2
9     1
8     1
12    0
dtype: int64


In [145]:
y_pred = rf.predict(X_test)

In [146]:
print(y_pred)

[1 0 1 0 0 1 1 1 0 1]


In [147]:
score = accuracy_score(y_test, y_pred)

In [148]:
print(score)

0.4


In [149]:
print(y_test)

2     0
29    0
3     1
22    1
25    1
27    1
21    0
35    0
19    0
24    0
dtype: int64


In [150]:
all_pred = rf.predict(T_reduced)

In [151]:
all_zips_df["PREDICTED_BIN"] = all_pred
#print(len(all_zips_df))
all_zips_df["ACTUAL_BIN"] = [float("nan")]*len(all_zips_df)
all_zips_df["FINAL_OUTPUT"] = all_pred
bin_names = {0: "LOW", 1: "MEDIUM", 2: "HIGH"}
len(binned_load_per_capita)
print(binned_load_per_capita)
# Index tracking non Nan rows because binned_load_per_capita removes them from indexing
my_index = 0
for index,row in all_zips_df.iterrows():
    if not pd.isna(row["ACTUAL_TRASH_OUTPUT_PER_CAPITA"]):
        print(row["ACTUAL_TRASH_OUTPUT_PER_CAPITA"])
        # Goes to column 11 and assigns bin number
        all_zips_df.iloc[[index],[11]] = binned_load_per_capita[my_index]

        all_zips_df.iloc[[index],[12]] = bin_names[binned_load_per_capita[my_index]]
        my_index += 1
    else:
        all_zips_df.iloc[[index],[12]] = bin_names[all_pred[index]]

        print("NA")

0     0
1     0
2     0
3     1
4     1
5     0
6     0
7     0
8     1
9     1
10    0
11    2
12    0
13    0
14    0
15    0
16    0
17    1
18    0
19    0
20    2
21    0
22    1
23    1
24    0
25    1
26    1
27    1
28    0
29    0
30    0
31    0
32    1
33    1
34    1
35    0
36    1
dtype: int64
47.51254480286738
NA
39.641725098453556
58.21714285714286
163.54782304598706
176.61285356936693
109.33924254633361
48.57324263038549
NA
NA
110.4491042018157
202.8232699200441
180.42345492915285
45.90424032537349
363.5200212850871
109.20465434633813
83.52081106245043
33.47234643445517
79.41475844261738
77.65364379267884
147.13739206470652
113.96841341592723
85.71777045981217
316.55985340002974
50.21431166000726
162.46737694086553
178.1340405014465
71.56360549030705
205.04915529254558
177.6826339994302
200.26806931353963
100.55075102412381
124.75884674560172
120.60486651561422
86.59823147564445
181.1714309665521
156.60737734272413
174.43451515025563
135.63323179887598
162.606638468707

In [152]:
all_zips_df

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS),ACTUAL_TRASH_OUTPUT_PER_CAPITA,DIFFERENCE,PREDICTED_BIN,ACTUAL_BIN,FINAL_OUTPUT
0,78701,7875,4855,112201,173289,6576,1.197536,136,47.512545,88.487455,1,0.0,LOW
1,78702,22876,8937,54053,73919,9839,2.325033,109,,,1,,MEDIUM
2,78703,20788,9723,98553,166396,11047,1.881778,145,39.641725,105.358275,1,0.0,LOW
3,78704,47158,23213,66262,91223,25815,1.826767,150,58.217143,91.782857,0,0.0,LOW
4,78705,33075,8370,15309,41357,10336,3.199981,47,163.547823,116.547823,0,1.0,MEDIUM
5,78721,12447,4197,41611,52873,4632,2.687176,198,176.612854,21.387146,0,1.0,MEDIUM
6,78722,7254,3111,69267,95443,3415,2.124158,156,109.339243,46.660757,1,0.0,LOW
7,78723,33170,12518,49606,71897,13777,2.407636,144,48.573243,95.426757,1,0.0,LOW
8,78724,24833,6360,45158,55252,6754,3.676784,44,,,0,,LOW
9,78725,7517,2544,62172,66225,2598,2.89338,171,,,2,,HIGH
