In [433]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy import stats
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor

In [434]:
joined_zips_grb_df = pd.read_csv("zips_load_demographics.csv")
all_zips_df = pd.read_csv("full_zip_codes.csv")

In [435]:
all_zips_df.head()

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78701,7875,4855,112201,173289,6576
1,78702,22876,8937,54053,73919,9839
2,78703,20788,9723,98553,166396,11047
3,78704,47158,23213,66262,91223,25815
4,78705,33075,8370,15309,41357,10336


In [436]:
len(all_zips_df)

45

In [437]:
len(joined_zips_grb_df)

40

In [438]:
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
1,78652,908280.0,,,,,
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0


In [439]:
joined_zips_grb_df = joined_zips_grb_df.dropna()
joined_zips_grb_df.head()


Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0


In [440]:
len(joined_zips_grb_df)

37

In [441]:
joined_zips_grb_df["POPULATION_PER_UNIT"] = joined_zips_grb_df.TOTAL_POPULATION/joined_zips_grb_df.TOTAL_HOUSING_UNITS
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778


In [442]:
joined_zips_grb_df["LOAD_PER_CAPITA"] = joined_zips_grb_df.load_weight/joined_zips_grb_df.TOTAL_POPULATION
joined_zips_grb_df.head()

Unnamed: 0,ZIP,load_weight,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,LOAD_PER_CAPITA
0,78617,1325600.0,27900.0,6520.0,52819.0,69936.0,7244.0,3.851463,47.512545
2,78653,825420.0,20822.0,6332.0,66752.0,75678.0,6891.0,3.021622,39.641725
3,78701,458460.0,7875.0,4855.0,112201.0,173289.0,6576.0,1.197536,58.217143
4,78702,3741320.0,22876.0,8937.0,54053.0,73919.0,9839.0,2.325033,163.547823
5,78703,3671428.0,20788.0,9723.0,98553.0,166396.0,11047.0,1.881778,176.612854


In [443]:
features = ['TOTAL_POPULATION', 'TOTAL_HOUSEHOLDS', 'MEDIAN_HOUSEHOLD_INCOME', 'TOTAL_HOUSING_UNITS', 'POPULATION_PER_UNIT']
# Separating out the features
x = joined_zips_grb_df.loc[:, features]
# Separating out the target
#y = joined_zips_grb_df.loc[:,['load_weight']]
# Standardizing the features
ss = StandardScaler().fit(x)
X = ss.transform(x)

In [444]:
# Apply principal component analysis
pca = PCA(n_components=3)
pca.fit(X)
#principalComponents = pca.fit(x)
#PCA_df = pd.DataFrame(data = principalComponents, columns = ['PRINCIPAL_COMPONENT_1', 'PRINCIPAL_COMPONENT_2'])

PCA(n_components=3)

In [445]:
pca.components_

array([[ 0.55593522,  0.56556387, -0.21785227,  0.56580689, -0.05896181],
       [ 0.11745993, -0.1295423 , -0.60106462, -0.13735661,  0.76764343],
       [ 0.22954666,  0.08363969,  0.76852846,  0.04817768,  0.58936886]])

In [446]:
pca.singular_values_

array([10.59298364,  6.90135247,  4.94913936])

In [447]:
pd.DataFrame(pca.components_, columns=features)

Unnamed: 0,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,0.555935,0.565564,-0.217852,0.565807,-0.058962
1,0.11746,-0.129542,-0.601065,-0.137357,0.767643
2,0.229547,0.08364,0.768528,0.048178,0.589369


In [448]:
X_reduced = pca.transform(X)

In [449]:
X_reduced

array([[-0.73158994,  2.60376743,  0.92002373],
       [-1.06675618,  1.08120218,  0.29680464],
       [-1.88215531, -2.55756196, -0.62736463],
       [-0.31556893,  0.27562414, -0.72562083],
       [-0.52273962, -1.38660683,  0.03127252],
       [ 3.26401178, -1.14280248, -0.23185058],
       [ 0.27202382,  2.44481355, -0.71134065],
       [-1.53917529,  1.18774575, -0.9480121 ],
       [-2.09990307, -0.20735542, -0.88592741],
       [ 0.77166421,  0.41275853, -0.52294738],
       [-0.8242857 ,  2.51480006,  0.46202719],
       [-2.24036205,  1.06590475, -0.26114191],
       [-1.55819984, -0.23382654,  0.15401188],
       [ 0.51458134, -0.66143755, -0.25112066],
       [-0.18435257, -0.2710019 , -1.03964945],
       [ 0.46895903, -0.48286348, -0.19207015],
       [-2.25688392, -0.94712793,  0.36834957],
       [ 0.25963362, -1.01854877,  0.09387302],
       [-1.17971632, -0.94553709,  0.37902506],
       [-2.4247659 , -0.53250698,  0.58561502],
       [-1.78073519, -0.54804291,  2.671

In [450]:
knn = KNeighborsRegressor(n_neighbors=10)

In [451]:
all_zips_df["POPULATION_PER_UNIT"] = all_zips_df.TOTAL_POPULATION/all_zips_df.TOTAL_HOUSING_UNITS
all_zips_df.head()

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT
0,78701,7875,4855,112201,173289,6576,1.197536
1,78702,22876,8937,54053,73919,9839,2.325033
2,78703,20788,9723,98553,166396,11047,1.881778
3,78704,47158,23213,66262,91223,25815,1.826767
4,78705,33075,8370,15309,41357,10336,3.199981


In [452]:
T = all_zips_df.loc[:, features]

In [453]:
T_reduced = pca.transform(ss.transform(T))

In [454]:
# Split data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X_reduced,joined_zips_grb_df["LOAD_PER_CAPITA"],random_state=1)

In [455]:
model = knn.fit(X_train, y_train)

In [456]:
y_pred = model.predict(X_test)

In [457]:
score = mean_absolute_error(y_pred,y_test)

In [458]:
print(score)

57.24440053882357


In [459]:
percent_error = ((abs(y_pred - y_test)/abs(y_test)).mean())*100.

In [460]:
print(percent_error)

62.170732340163134


In [461]:
pca.explained_variance_ratio_

array([0.60654758, 0.25745225, 0.13239989])

In [462]:
all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = model.predict(T_reduced)

In [463]:
all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'] = all_zips_df['PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)'].astype(int)

In [464]:
all_zips_df

Unnamed: 0,ZIPCODE,TOTAL_POPULATION,TOTAL_HOUSEHOLDS,MEDIAN_HOUSEHOLD_INCOME,MEAN_HOUSEHOLD_INCOME,TOTAL_HOUSING_UNITS,POPULATION_PER_UNIT,PREDICTED_TRASH_OUTPUT_PER_CAPITA (TONS)
0,78701,7875,4855,112201,173289,6576,1.197536,127
1,78702,22876,8937,54053,73919,9839,2.325033,110
2,78703,20788,9723,98553,166396,11047,1.881778,117
3,78704,47158,23213,66262,91223,25815,1.826767,130
4,78705,33075,8370,15309,41357,10336,3.199981,98
5,78721,12447,4197,41611,52873,4632,2.687176,136
6,78722,7254,3111,69267,95443,3415,2.124158,132
7,78723,33170,12518,49606,71897,13777,2.407636,127
8,78724,24833,6360,45158,55252,6754,3.676784,124
9,78725,7517,2544,62172,66225,2598,2.89338,140
