# Sklearn Discrete MeanNorm

In [1]:
import pandas as pd
# Load the dataset CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
datasetPath = '../discrete_meannorm_dataset.csv'
discreteDf = pd.read_csv(datasetPath)

In [2]:
# Predictor columns, in the order they appear in the feature matrix.
featureColumns = [
    'WHITE',
    'BLACK_OR_AFRICAN_AMERICAN',
    'AMERICAN_INDIAN_AND_ALASKA_NATIVE',
    'ASIAN',
    'NATIVE_HAWAIIAN_AND_OTHER_PACIFIC_ISLANDER',
    'HISPANIC_OR_LATINO',
]

# Feature matrix (DataFrame) and regression target (Series).
X = discreteDf[featureColumns]
y = discreteDf['GrowthRate']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import time

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Per-model registries, each keyed by the model's display name.
models = dict()           # estimator instances
rmseValues = dict()       # RMSE on the held-out split
trainingTimes = dict()    # seconds spent in fit()
predictionTimes = dict()  # seconds spent in predict()

In [4]:
def processModel(model, modelKey):
    """Fit ``model``, predict on the held-out split, and report RMSE and timings.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and prints a progress log to stdout as it goes.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` whose return value exposes
        ``predict(X)``.
    modelKey
        Label used only in the printed log lines.

    Returns
    -------
    tuple
        ``(rmse, trainingTime, predictionTime)``; both times are elapsed
        seconds as floats.
    """
    # Train Model
    print('----- ----- -----')
    print('Training model', modelKey)
    # perf_counter() is monotonic and high-resolution. time.time() follows the
    # system wall clock, which can be adjusted mid-run and may be too coarse
    # for fits that take only a few milliseconds.
    trainingStartTime = time.perf_counter()
    trainedModel = model.fit(X_train, y_train)
    trainingTime = time.perf_counter() - trainingStartTime
    print('Finished training in ' + str(trainingTime) + 's')

    # Predict
    print('Predicting on model', modelKey)
    predictionStartTime = time.perf_counter()
    modelPredictions = trainedModel.predict(X_test)
    predictionTime = time.perf_counter() - predictionStartTime
    print('Finished predictions in ' + str(predictionTime) + 's')

    # Score the predictions against the held-out targets (lower is better).
    print('Getting RMSE on model', modelKey)
    rmse = root_mean_squared_error(y_test, modelPredictions)

    # Finished: repeat the key figures together as a per-model summary.
    print()
    print('Finished processing model', modelKey)
    print('RMSE: ' + str(rmse))
    print('Training time: ' + str(trainingTime) + 's')
    print('Prediction time: ' + str(predictionTime) + 's')
    print('----- ----- -----')

    return (rmse, trainingTime, predictionTime)

In [5]:
# Baseline: scikit-learn's LinearRegression with default settings.
from sklearn.linear_model import LinearRegression

model = 'Linear Regression'
models[model] = LinearRegression()
# processModel returns (rmse, trainingTime, predictionTime); file each value
# under this model's key so the results table can pick it up.
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Linear Regression
Finished training in 0.034026145935058594s
Predicting on model Linear Regression
Finished predictions in 0.0018110275268554688s
Getting RMSE on model Linear Regression

Finished processing model Linear Regression
RMSE: 1.0316379325980412
Training time: 0.034026145935058594s
Prediction time: 0.0018110275268554688s
----- ----- -----


In [6]:
# Ridge regression with the estimator's default settings.
from sklearn.linear_model import Ridge

model = 'Ridge Regression'
models[model] = Ridge()
# processModel returns (rmse, trainingTime, predictionTime); file each value
# under this model's key so the results table can pick it up.
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Regression
Finished training in 0.012167215347290039s
Predicting on model Ridge Regression
Finished predictions in 0.0020821094512939453s
Getting RMSE on model Ridge Regression

Finished processing model Ridge Regression
RMSE: 1.031637929731825
Training time: 0.012167215347290039s
Prediction time: 0.0020821094512939453s
----- ----- -----


In [7]:
# RidgeCV with default settings (the estimator's built-in cross-validation
# runs inside fit(), so it is included in the measured training time).
from sklearn.linear_model import RidgeCV

model = 'Ridge Cross Validation Regression'
models[model] = RidgeCV()
# processModel returns (rmse, trainingTime, predictionTime); file each value
# under this model's key so the results table can pick it up.
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Ridge Cross Validation Regression
Finished training in 0.0937349796295166s
Predicting on model Ridge Cross Validation Regression
Finished predictions in 0.0019392967224121094s
Getting RMSE on model Ridge Cross Validation Regression

Finished processing model Ridge Cross Validation Regression
RMSE: 1.0316379039540116
Training time: 0.0937349796295166s
Prediction time: 0.0019392967224121094s
----- ----- -----


In [8]:
# HuberRegressor with default settings, labelled as the outlier-robust option.
from sklearn.linear_model import HuberRegressor

model = 'Huber Regression (Outlier Robust)'
models[model] = HuberRegressor()
# processModel returns (rmse, trainingTime, predictionTime); file each value
# under this model's key so the results table can pick it up.
(
    rmseValues[model],
    trainingTimes[model],
    predictionTimes[model],
) = processModel(models[model], model)

----- ----- -----
Training model Huber Regression (Outlier Robust)
Finished training in 0.49686694145202637s
Predicting on model Huber Regression (Outlier Robust)
Finished predictions in 0.0013599395751953125s
Getting RMSE on model Huber Regression (Outlier Robust)

Finished processing model Huber Regression (Outlier Robust)
RMSE: 1.4055402601591849
Training time: 0.49686694145202637s
Prediction time: 0.0013599395751953125s
----- ----- -----


In [9]:
# Decision tree regressor. The tree breaks ties between equally good splits
# using its random state, so it is seeded (same seed as the train/test split)
# to make the reported RMSE reproducible under Restart & Run All.
from sklearn.tree import DecisionTreeRegressor

model = 'Decision Tree Regression'
models[model] = DecisionTreeRegressor(random_state=0)
# processModel returns (rmse, trainingTime, predictionTime); file each value
# under this model's key so the results table can pick it up.
rmseValues[model], trainingTimes[model], predictionTimes[model] = processModel(models[model], model)

----- ----- -----
Training model Decision Tree Regression
Finished training in 2.247148036956787s
Predicting on model Decision Tree Regression
Finished predictions in 0.05610179901123047s
Getting RMSE on model Decision Tree Regression

Finished processing model Decision Tree Regression
RMSE: 1.46698549417668
Training time: 2.247148036956787s
Prediction time: 0.05610179901123047s
----- ----- -----


In [10]:
# Assemble the summary table keyed by model name. Each metric dict becomes a
# Series so that rows are aligned on the model key rather than on dict
# iteration order. A model that was registered in `models` but has no recorded
# metrics (e.g. its processModel call raised) shows up as NaN instead of
# causing a length-mismatch error or shifting values onto the wrong row.
resultsDf = pd.DataFrame(
    {
        'RMSE': pd.Series(rmseValues, dtype=float),
        'TrainingTime': pd.Series(trainingTimes, dtype=float),
        'PredictionTime': pd.Series(predictionTimes, dtype=float),
    },
    index=list(models),
)
# Best (lowest) RMSE first; rows with missing RMSE sort to the end.
resultsDf = resultsDf.sort_values(by='RMSE', ascending=True)

resultsDf

Unnamed: 0,RMSE,TrainingTime,PredictionTime
Ridge Cross Validation Regression,1.031638,0.093735,0.001939
Ridge Regression,1.031638,0.012167,0.002082
Linear Regression,1.031638,0.034026,0.001811
Huber Regression (Outlier Robust),1.40554,0.496867,0.00136
Decision Tree Regression,1.466985,2.247148,0.056102
