In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

data = pd.read_csv("amazon.csv", encoding = "ISO-8859–1") 

In [4]:
data.head()

Unnamed: 0,year,state,month,number,date
0,1998,Acre,Janeiro,0.0,1998-01-01
1,1999,Acre,Janeiro,0.0,1999-01-01
2,2000,Acre,Janeiro,0.0,2000-01-01
3,2001,Acre,Janeiro,0.0,2001-01-01
4,2002,Acre,Janeiro,0.0,2002-01-01


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6454 entries, 0 to 6453
Data columns (total 5 columns):
year      6454 non-null int64
state     6454 non-null object
month     6454 non-null object
number    6454 non-null float64
date      6454 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 176.5+ KB


In [6]:
data.describe(include = 'all')

Unnamed: 0,year,state,month,number,date
count,6454.0,6454,6454,6454.0,6454
unique,,23,12,,20
top,,Rio,Janeiro,,2011-01-01
freq,,717,541,,324
mean,2007.461729,,,108.293163,
std,5.746654,,,190.812242,
min,1998.0,,,0.0,
25%,2002.0,,,3.0,
50%,2007.0,,,24.0,
75%,2012.0,,,113.0,


In [7]:
month_list = list(data.month.unique())
for row in data.index:
    if data.iloc[row,2] in month_list:
        mon = data.iloc[row,2]
        data.iloc[row,2] = month_list.index(mon)

In [8]:
data.head()

Unnamed: 0,year,state,month,number,date
0,1998,Acre,0,0.0,1998-01-01
1,1999,Acre,0,0.0,1999-01-01
2,2000,Acre,0,0.0,2000-01-01
3,2001,Acre,0,0.0,2001-01-01
4,2002,Acre,0,0.0,2002-01-01


In [9]:
datadummies = pd.get_dummies(data['state'])
datadummies.head()

Unnamed: 0,Acre,Alagoas,Amapa,Amazonas,Bahia,Ceara,Distrito Federal,Espirito Santo,Goias,Maranhao,...,Pará,Pernambuco,Piau,Rio,Rondonia,Roraima,Santa Catarina,Sao Paulo,Sergipe,Tocantins
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
data = data.join(datadummies)
data = data.drop(columns = ['state', 'date'])
data.head()

Unnamed: 0,year,month,number,Acre,Alagoas,Amapa,Amazonas,Bahia,Ceara,Distrito Federal,...,Pará,Pernambuco,Piau,Rio,Rondonia,Roraima,Santa Catarina,Sao Paulo,Sergipe,Tocantins
0,1998,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1999,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2000,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2001,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2002,0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#col_list = list(data.columns)
#del col_list[0:3]
#for column in col_list:
    #data[column] = data[column] * data['month']

In [12]:
import statsmodels.api as sm
X = data.iloc[:,[0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]]
Y = data['number']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 202)
X = sm.add_constant(X.values)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

lrm = LinearRegression()
lrm.fit(X_train, y_train)

y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

The number of observations in training set is 5163
The number of observations in test set is 1291
R-squared of the model in the training set is: 0.11958699918210314
-----Test set statistics-----
R-squared of the model in the test set is: 0.09086349559385254
Mean absolute error of the prediction is: 113.38504448656359
Mean squared error of the prediction is: 29637.759651597287
Root mean squared error of the prediction is: 172.15620712480072
Mean absolute percentage error of the prediction is: inf


In [23]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(lrm, X_test, y_test, cv=5)
print("Unweighted Accuracy:(+/- %0.2f)" % ( score.std() * 2))

Unweighted Accuracy:(+/- 0.05)


In [14]:
from sklearn import neighbors
X = data.iloc[:,[0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]]
Y = data['number']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 202)
#X = sm.add_constant(X.values)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)

y_preds_train = knn.predict(X_train)
y_preds_test = knn.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

The number of observations in training set is 5163
The number of observations in test set is 1291
R-squared of the model in the training set is: 0.3268268516517582
-----Test set statistics-----
R-squared of the model in the test set is: 0.1880768132584849
Mean absolute error of the prediction is: 102.24852068164215
Mean squared error of the prediction is: 26468.61516128694
Root mean squared error of the prediction is: 162.69177963648605
Mean absolute percentage error of the prediction is: inf


In [24]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(knn, X_test, y_test, cv=5)
print("Unweighted Accuracy: (+/- %0.2f)" % ( score.std() * 2))

Unweighted Accuracy: (+/- 0.19)


In [16]:
from sklearn import neighbors
X = data.iloc[:,[0,1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]]
Y = data['number']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 202)
#X = sm.add_constant(X.values)

print("The number of observations in training set is {}".format(X_train.shape[0]))
print("The number of observations in test set is {}".format(X_test.shape[0]))

knn_w = neighbors.KNeighborsRegressor(n_neighbors=10,weights='distance')
knn_w.fit(X_train, y_train)

y_preds_train = knn_w.predict(X_train)
y_preds_test = knn_w.predict(X_test)

print("R-squared of the model in the training set is: {}".format(knn_w.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(knn_w.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

The number of observations in training set is 5163
The number of observations in test set is 1291
R-squared of the model in the training set is: 0.9043632915465158
-----Test set statistics-----
R-squared of the model in the test set is: -0.0400366966439929
Mean absolute error of the prediction is: 110.20881708200174
Mean squared error of the prediction is: 33905.0929036344
Root mean squared error of the prediction is: 184.133356303616
Mean absolute percentage error of the prediction is: inf


In [25]:
score_w = cross_val_score(knn_w, X_test, y_test, cv=5)
print("Unweighted Accuracy:(+/- %0.2f)" % ( score_w.std() * 2))

Unweighted Accuracy:(+/- 0.25)


While it may be the most popular form of regression, OLS is not the only technique that can be used in regression modeling. KNN regression can be a powerful tool that has its uses. OLS Regression creates a mathematical formula that correlates features to the target variable. The coefficients imprint the features with importance. The higher the coefficient, the more impactful it is to the target variable. This creates a powerful model that can map out data's behavior powerfully and quickly. On the otherhand, KNN Regression predicts based on data that is 'close' or similar to it. Typically thought of as a classifier, KNN regression can create extremely accurate models based on the same principle. However, KNN regression can birth highly overfitted models. When data is fairly noisy, KNN regression will fall into the overfitting trap more often than an OLS model.  
  
  
When it comes to the training data, it is clear to see that the weighted KNN regression model is better than the unweighted KNN model and far superior than the OLS regression model. The R-squared value for the OLS is at a small 11.9%. For the test set, it drops to 9%. Just like the R-squared values, the variance is small as well. When looking at the unweighted KNN model, we see a decent improvement in the R-squared stats to 33% and 19%. The error statistics drop slightly as well. While the accuracy of the model does increase, we see that the variance does as well to +/- 0.19. This means that this model seems to be overfitting more-so than the last one. The final model shows the downside of KNN regression. Adding the weight to the function skyrockets the training set R-squared value. However, you can see that it performed poorly on the test set with a -0.04% value. This plus the variance shows the extreme overfitting that can occur within this type of model. While it may work best for the training set, this weighted KNN regression model is a poor choice. 