# Sklearn Decimal NoNorm

In [1]:
import pandas as pd

# Load the dataset used by every later cell. The path is relative to the
# notebook's working directory.
decimalDf = pd.read_csv(filepath_or_buffer='../decimal_nonorm_dataset.csv')

In [2]:
# Predictors: the six demographic columns, selected by name.
X = decimalDf.loc[:, [
    'WHITE',
    'BLACK_OR_AFRICAN_AMERICAN',
    'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
    'ASIAN',
    'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
    'HISPANIC_OR_LATINO',
]]

# Regression target.
y = decimalDf.loc[:, 'GrowthRate']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import time

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Per-model bookkeeping; the model cells below key each dict by model name.
models, rmseValues, trainingTimes, predictionTimes = {}, {}, {}, {}

In [4]:
def processModel(model, modelKey):
    """Fit ``model``, predict on the held-out split, and report RMSE and timings.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and prints progress messages as it goes.

    Parameters
    ----------
    model
        Estimator whose ``fit(X, y)`` returns an object exposing
        ``predict(X)``.
    modelKey
        Label used in the progress messages.

    Returns
    -------
    tuple
        ``(rmse, trainingTime, predictionTime)``; both times are elapsed
        seconds.
    """
    # Train Model
    # Intervals are measured with time.perf_counter(): it is monotonic and
    # high-resolution, whereas time.time() follows the wall clock and can be
    # too coarse (or jump) when timing millisecond-scale calls.
    print('----- ----- -----')
    print('Training model', modelKey)
    trainingStartTime = time.perf_counter()
    trainedModel = model.fit(X_train, y_train)
    trainingTime = time.perf_counter() - trainingStartTime
    print('Finished training in ' + str(trainingTime) + 's')

    # Predict
    print('Predicting on model', modelKey)
    predictionStartTime = time.perf_counter()
    modelPredictions = trainedModel.predict(X_test)
    predictionTime = time.perf_counter() - predictionStartTime
    print('Finished predictions in ' + str(predictionTime) + 's')

    # Score: root-mean-squared error on the held-out split (lower is better)
    print('Getting RMSE on model', modelKey)
    rmse = root_mean_squared_error(y_test, modelPredictions)

    # Finished
    print()
    print('Finished processing model', modelKey)
    print('RMSE: ' + str(rmse))
    print('Training time: ' + str(trainingTime) + 's')
    print('Prediction time: ' + str(predictionTime) + 's')
    print('----- ----- -----')

    return (rmse, trainingTime, predictionTime)

In [5]:
from sklearn.linear_model import LinearRegression

# Register a LinearRegression estimator (default settings) under its display
# name, then benchmark it and record the three metrics.
model = 'Linear Regression'
models[model] = LinearRegression()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Linear Regression
Finished training in 0.050477027893066406s
Predicting on model Linear Regression
Finished predictions in 0.0027823448181152344s
Getting RMSE on model Linear Regression

Finished processing model Linear Regression
RMSE: 30.047413614808296
Training time: 0.050477027893066406s
Prediction time: 0.0027823448181152344s
----- ----- -----


In [6]:
from sklearn.linear_model import Ridge

# Register a Ridge estimator (default settings) under its display name, then
# benchmark it and record the three metrics.
model = 'Ridge Regression'
models[model] = Ridge()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Regression
Finished training in 0.01647806167602539s
Predicting on model Ridge Regression
Finished predictions in 0.001971006393432617s
Getting RMSE on model Ridge Regression

Finished processing model Ridge Regression
RMSE: 30.047413614806636
Training time: 0.01647806167602539s
Prediction time: 0.001971006393432617s
----- ----- -----


In [7]:
from sklearn.linear_model import RidgeCV

# Register a RidgeCV estimator (default settings) under its display name,
# then benchmark it and record the three metrics.
model = 'Ridge Cross Validation Regression'
models[model] = RidgeCV()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Cross Validation Regression
Finished training in 0.13575100898742676s
Predicting on model Ridge Cross Validation Regression
Finished predictions in 0.002929210662841797s
Getting RMSE on model Ridge Cross Validation Regression

Finished processing model Ridge Cross Validation Regression
RMSE: 30.047413610206068
Training time: 0.13575100898742676s
Prediction time: 0.002929210662841797s
----- ----- -----


In [8]:
from sklearn.linear_model import HuberRegressor

# Register a HuberRegressor estimator (default settings) under its display
# name, then benchmark it and record the three metrics.
# NOTE(review): the saved output shows blank lines right after the
# "Training model" message, which may be a stripped warning (for example a
# convergence warning) -- confirm on re-run.
model = 'Huber Regression (Outlier Robust)'
models[model] = HuberRegressor()
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Huber Regression (Outlier Robust)


Finished training in 2.042593002319336s
Predicting on model Huber Regression (Outlier Robust)
Finished predictions in 0.0016219615936279297s
Getting RMSE on model Huber Regression (Outlier Robust)

Finished processing model Huber Regression (Outlier Robust)
RMSE: 40.935314620920714
Training time: 2.042593002319336s
Prediction time: 0.0016219615936279297s
----- ----- -----


In [9]:
# Set Model Properties
model = 'Decision Tree Regression'
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so a fresh Restart & Run All reproduces the same fit and RMSE;
# the seed matches the random_state=0 used for the train/test split. Without
# it, tie-breaking between equally good splits can differ from run to run.
models[model] = DecisionTreeRegressor(random_state=0)
rmseValues[model], trainingTimes[model], predictionTimes[model] = processModel(models[model], model)

----- ----- -----
Training model Decision Tree Regression
Finished training in 2.4787368774414062s
Predicting on model Decision Tree Regression
Finished predictions in 0.0771017074584961s
Getting RMSE on model Decision Tree Regression

Finished processing model Decision Tree Regression
RMSE: 43.13518133651018
Training time: 2.4787368774414062s
Prediction time: 0.0771017074584961s
----- ----- -----


In [10]:
# Summarise every benchmarked model in one table, best (lowest) RMSE first.
# Each metric dict is keyed by model name, so the columns are aligned by key
# rather than by position; this does not depend on the dicts sharing the same
# insertion order or length. A model registered in `models` that has no
# recorded metrics shows up as a NaN row.
resultsDf = (
    pd.DataFrame(
        {
            'RMSE': pd.Series(rmseValues, dtype='float64'),
            'TrainingTime': pd.Series(trainingTimes, dtype='float64'),
            'PredictionTime': pd.Series(predictionTimes, dtype='float64'),
        }
    )
    .reindex(list(models))
    .sort_values(by='RMSE', ascending=True)
)

resultsDf

Unnamed: 0,RMSE,TrainingTime,PredictionTime
Ridge Cross Validation Regression,30.047414,0.135751,0.002929
Ridge Regression,30.047414,0.016478,0.001971
Linear Regression,30.047414,0.050477,0.002782
Huber Regression (Outlier Robust),40.935315,2.042593,0.001622
Decision Tree Regression,43.135181,2.478737,0.077102
