# Sklearn Discrete MinMaxNorm

In [1]:
import pandas as pd
# Load the dataset used by every cell below. The path is relative, so the
# notebook expects the CSV one directory above its working directory.
# NOTE(review): the file name suggests the values are already min-max normalised — confirm.
discreteDf = pd.read_csv('../discrete_minmaxnorm_dataset.csv')

In [2]:
# Feature matrix: the six population-group columns used as predictors.
X = discreteDf.loc[:, [
    'WHITE',
    'BLACK_OR_AFRICAN_AMERICAN',
    'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
    'ASIAN',
    'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
    'HISPANIC_OR_LATINO',
]]

# Regression target.
y = discreteDf.loc[:, 'GrowthRate']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import time

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Per-model registries (four independent dicts), filled in by the model cells below.
models, rmseValues, trainingTimes, predictionTimes = {}, {}, {}, {}

In [4]:
def processModel(model, modelKey):
    """Fit ``model``, predict on the held-out split, and report RMSE and timings.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and prints progress messages plus a final summary to stdout.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` (whose return value is used for
        prediction) and ``predict(X)``.
    modelKey
        Label used to identify the model in the printed messages.

    Returns
    -------
    tuple
        ``(rmse, trainingTime, predictionTime)``; both times are elapsed seconds.
    """
    # Train Model
    print('----- ----- -----')
    print('Training model', modelKey)
    # perf_counter() is monotonic and high-resolution, unlike the wall clock
    # time.time(), so millisecond-scale durations are measured reliably.
    trainingStartTime = time.perf_counter()
    trainedModel = model.fit(X_train, y_train)
    trainingTime = time.perf_counter() - trainingStartTime
    print('Finished training in ' + str(trainingTime) + 's')

    # Predict
    print('Predicting on model', modelKey)
    predictionStartTime = time.perf_counter()
    modelPredictions = trainedModel.predict(X_test)
    predictionTime = time.perf_counter() - predictionStartTime
    print('Finished predictions in ' + str(predictionTime) + 's')

    # Score: root-mean-squared error against the held-out targets (lower is better)
    print('Getting RMSE on model', modelKey)
    rmse = root_mean_squared_error(y_test, modelPredictions)

    # Finished
    print()
    print('Finished processing model', modelKey)
    print('RMSE: ' + str(rmse))
    print('Training time: ' + str(trainingTime) + 's')
    print('Prediction time: ' + str(predictionTime) + 's')
    print('----- ----- -----')

    return (rmse, trainingTime, predictionTime)

In [5]:
# Linear Regression: register the estimator, then train, time and score it.
from sklearn.linear_model import LinearRegression

model = 'Linear Regression'
models[model] = LinearRegression()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Linear Regression
Finished training in 0.051481008529663086s
Predicting on model Linear Regression
Finished predictions in 0.0024912357330322266s
Getting RMSE on model Linear Regression

Finished processing model Linear Regression
RMSE: 0.022881842900936138
Training time: 0.051481008529663086s
Prediction time: 0.0024912357330322266s
----- ----- -----


In [6]:
# Ridge Regression: register the estimator, then train, time and score it.
from sklearn.linear_model import Ridge

model = 'Ridge Regression'
models[model] = Ridge()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Regression
Finished training in 0.011728286743164062s
Predicting on model Ridge Regression
Finished predictions in 0.001772165298461914s
Getting RMSE on model Ridge Regression

Finished processing model Ridge Regression
RMSE: 0.02288129782806974
Training time: 0.011728286743164062s
Prediction time: 0.001772165298461914s
----- ----- -----


In [7]:
# Ridge with built-in cross-validation: register the estimator, then train, time and score it.
from sklearn.linear_model import RidgeCV

model = 'Ridge Cross Validation Regression'
models[model] = RidgeCV()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Cross Validation Regression
Finished training in 0.08806180953979492s
Predicting on model Ridge Cross Validation Regression
Finished predictions in 0.0016129016876220703s
Getting RMSE on model Ridge Cross Validation Regression

Finished processing model Ridge Cross Validation Regression
RMSE: 0.022880803189170625
Training time: 0.08806180953979492s
Prediction time: 0.0016129016876220703s
----- ----- -----


In [8]:
# Huber Regression: register the estimator, then train, time and score it.
from sklearn.linear_model import HuberRegressor

model = 'Huber Regression (Outlier Robust)'
models[model] = HuberRegressor()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Huber Regression (Outlier Robust)
Finished training in 1.270254135131836s
Predicting on model Huber Regression (Outlier Robust)
Finished predictions in 0.001338958740234375s
Getting RMSE on model Huber Regression (Outlier Robust)

Finished processing model Huber Regression (Outlier Robust)
RMSE: 0.022921626446803102
Training time: 1.270254135131836s
Prediction time: 0.001338958740234375s
----- ----- -----


In [9]:
# Set Model Properties
model = 'Decision Tree Regression'
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so its fit (and therefore its RMSE) is repeatable across runs;
# the same seed value is used for the train/test split.
models[model] = DecisionTreeRegressor(random_state=0)
rmseValues[model], trainingTimes[model], predictionTimes[model] = processModel(models[model], model)

----- ----- -----
Training model Decision Tree Regression
Finished training in 2.0188839435577393s
Predicting on model Decision Tree Regression
Finished predictions in 0.06740808486938477s
Getting RMSE on model Decision Tree Regression

Finished processing model Decision Tree Regression
RMSE: 0.03179612646544581
Training time: 2.0188839435577393s
Prediction time: 0.06740808486938477s
----- ----- -----


In [10]:
# Assemble the per-model metrics into one table, best (lowest) RMSE first.
# Each column is built from its dict, so values are matched to rows by model
# name rather than by position; a model with no recorded metric shows NaN
# instead of silently shifting the other rows.
resultsDf = pd.DataFrame(
    {
        'RMSE': rmseValues,
        'TrainingTime': trainingTimes,
        'PredictionTime': predictionTimes,
    },
    index=list(models),
).sort_values(by='RMSE', ascending=True)

resultsDf

Unnamed: 0,RMSE,TrainingTime,PredictionTime
Ridge Cross Validation Regression,0.022881,0.088062,0.001613
Ridge Regression,0.022881,0.011728,0.001772
Linear Regression,0.022882,0.051481,0.002491
Huber Regression (Outlier Robust),0.022922,1.270254,0.001339
Decision Tree Regression,0.031796,2.018884,0.067408
