## Regression Models

In [2]:
import sys
import pandas as pd
import numpy as np
from math import sqrt
import joblib
import seaborn as sns
from geopy.distance import distance
from math import cos, radians, sin, sqrt

from math import sqrt
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor

In [4]:
# Load the pre-computed feature table used by every model below.
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code -- only point this at an artifact from a trusted source.
DATASET_PATH = 'medium_sample_0_w_features.joblib'
dataset = joblib.load(DATASET_PATH)

In [5]:
def validate_prediction(y, y_pred):
    """Summarise the regression error of a set of predictions.

    Parameters
    ----------
    y
        True target values.
    y_pred
        Predicted values, compared against ``y`` by the metric functions.

    Returns
    -------
    str
        A one-line report ``"mse = ... & mae = ... & rmse = ..."`` where
        rmse is the square root of the mean squared error.
    """
    squared_error = mean_squared_error(y, y_pred)
    metrics = {
        "mse": squared_error,
        "mae": mean_absolute_error(y, y_pred),
        # RMSE is derived from the MSE rather than recomputed from the data.
        "rmse": sqrt(squared_error),
    }
    return "mse = {mse} & mae = {mae} & rmse = {rmse}".format(**metrics)

In [6]:
# Columns are selected by position: column 1 becomes the target and every
# column from index 2 onward becomes a feature; column 0 is not used.
# NOTE(review): assumes `dataset` is a DataFrame laid out this way -- confirm.
y = dataset.iloc[:, 1]
X = dataset.iloc[:, 2:]

# Hold out 30% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_validate, y_train, y_validate = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

## Linear Regression

In [6]:
# Ordinary least-squares baseline.
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. False was its default behaviour, so omitting
# the argument fits the same model on old and new releases alike.
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

# Score on the held-out split (the original cell ran this prediction twice).
y_pred = LR_model.predict(X_validate)
validate_prediction(y_validate, y_pred)

'mse = 28.820740698449335 & mae = 2.46616601228351 & rmse = 5.368495198698546'

## Ridge Regression

In [39]:
# Ridge regression with a very small L2 penalty (alpha=1e-4). fit() returns
# the fitted estimator, so construction and training are chained.
ridge_model = Ridge(alpha=1e-4).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = ridge_model.predict(X_validate)
validate_prediction(y_validate, y_pred)

  overwrite_a=True).T


'mse = 28.808505787905066 & mae = 2.466409254699667 & rmse = 5.367355567493648'

## Gradient Boosting 

In [45]:
# Hyper-parameters for the gradient-boosted trees.
# The original passed 'loss': 'ls'; that alias was renamed 'squared_error' in
# scikit-learn 1.0 and removed in 1.2. Squared error is the default loss on
# every release, so the key is simply omitted to stay version-independent.
# random_state is pinned so a re-run reproduces the same model.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'random_state': 123}

# NOTE: `clf` is a regressor despite the name; the name is kept unchanged in
# case other cells refer to it.
clf = GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validate)
validate_prediction(y_validate, y_pred)

'mse = 13.572717818549428 & mae = 1.9110558158265978 & rmse = 3.684116965915907'

## Random Forest Regression

In [7]:
# Base forest: 8 candidate features are considered at each split.
# random_state is pinned because bootstrap sampling and feature sub-sampling
# are random; without it the grid search and the reported metrics change on
# every run.
rf_m = RandomForestRegressor(max_features=8, random_state=123)

# Candidate forest sizes and tree depths explored by the search.
grid_param = {
    "n_estimators": [60, 64, 70],
    "max_depth": [5, 10, 13]}

# GridSearchCV uses its default cross-validation and the estimator's default
# score, then refits the best configuration on the full training split.
gd_sr = GridSearchCV(estimator=rf_m, param_grid=grid_param)
gd_sr.fit(X_train, y_train)

# predict() delegates to the refit best estimator.
y_pred = gd_sr.predict(X_validate)
validate_prediction(y_validate, y_pred)



'mse = 12.132785393394009 & mae = 1.7774125110428651 & rmse = 3.4832148072425864'

## KNN Regression

In [8]:
def apply_knn(x_train, y_train, x_test, neighbors):
    """Fit one k-nearest-neighbours regressor per candidate ``k`` and predict.

    Parameters
    ----------
    x_train, y_train
        Training features and targets used to fit every regressor.
    x_test
        Features to predict on with each fitted regressor.
    neighbors
        Iterable of ``n_neighbors`` values; one model is trained per value.

    Returns
    -------
    tuple of (list, list)
        ``(test_predictions, train_predictions)``: each list holds one
        prediction result per entry of ``neighbors``, in the same order.
    """
    test_predictions, train_predictions = [], []
    for n_neighbors in neighbors:
        regressor = KNeighborsRegressor(algorithm='auto', n_neighbors=n_neighbors)
        regressor.fit(x_train, y_train)
        # Predict the test inputs first, then the training inputs.
        test_predictions += [regressor.predict(x_test)]
        train_predictions += [regressor.predict(x_train)]
    return test_predictions, train_predictions

In [9]:
# Candidate neighbourhood sizes for the search.
grid_param = {"n_neighbors": [1, 3, 5, 7, 10]}

knn = KNeighborsRegressor(algorithm='auto')

# fit() returns the fitted search object, so building and running the
# search are chained into one statement.
gd_sr = GridSearchCV(estimator=knn, param_grid=grid_param).fit(X_train, y_train)

# Evaluate the selected model on the held-out split.
y_pred = gd_sr.predict(X_validate)
validate_prediction(y_validate, y_pred)



'mse = 14.762664350326814 & mae = 1.9738253004386563 & rmse = 3.842221278157573'