In [1]:
# setup the environment
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence FutureWarning messages only; every other warning category is still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from random import randint
from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Load the cleaned dataset from disk and preview its first rows.
csv_path = '../clean_data/topfeatures4.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,carat_weight,total_sales_price,cut,color,clarity
0,0.09,200,9,1,7
1,0.09,200,9,1,7
2,0.09,200,9,1,7
3,0.09,200,9,1,7
4,0.09,200,9,1,7


In [3]:
# Separate the target (y) from the features (X).
# Everything except the price column is kept as a feature: the four "4 Cs" columns.
target_col = 'total_sales_price'
y = data[target_col]
X = data.drop(columns=target_col)

In [4]:
# Preview the feature matrix; the notebook renders only the first and last rows of a long frame.
X

Unnamed: 0,carat_weight,cut,color,clarity
0,0.09,9,1,7
1,0.09,9,1,7
2,0.09,9,1,7
3,0.09,9,1,7
4,0.09,9,1,7
...,...,...,...,...
213129,10.04,5,10,4
213130,10.65,9,1,4
213131,5.17,8,10,5
213132,18.07,9,1,4


In [5]:
# Inspect the target variable as a one-column table.
y.to_frame()

Unnamed: 0,total_sales_price
0,200
1,200
2,200
3,200
4,200
...,...
213129,1161102
213130,1210692
213131,1292500
213132,1315496


In [7]:
# Re-display the feature matrix. X is already a DataFrame, so this shows the same table as the bare `X` cell.
pd.DataFrame(X)

Unnamed: 0,carat_weight,cut,color,clarity
0,0.09,9,1,7
1,0.09,9,1,7
2,0.09,9,1,7
3,0.09,9,1,7
4,0.09,9,1,7
...,...,...,...,...
213129,10.04,5,10,4
213130,10.65,9,1,4
213131,5.17,8,10,5
213132,18.07,9,1,4


In [8]:
# Split into training (80%) and test (20%) sets.
# random_state pins the shuffle so the split is identical on every
# Restart & Run All; without it each run trains and tests on different rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [9]:
# Build one pipeline per candidate regressor: a StandardScaler followed by the model.
# The tree-based estimators are randomized, so they get a fixed random_state
# to make their fitted models reproducible between runs.

pipeline_lr=Pipeline([("scalar1",StandardScaler()),
                     ("lr_classifier",LinearRegression())])

pipeline_dt=Pipeline([("scalar2",StandardScaler()),
                     ("dt_classifier",DecisionTreeRegressor(random_state=42))])

pipeline_rf=Pipeline([("scalar3",StandardScaler()),
                     ("rf_classifier",RandomForestRegressor(random_state=42))])

# The step used to be named "rf_classifier" (copy-paste from the random-forest
# pipeline); it now carries its own prefix like the other pipelines.
pipeline_kn=Pipeline([("scalar4",StandardScaler()),
                     ("kn_classifier",KNeighborsRegressor())])

# List of all the pipelines (order must match the keys of pipe_dict below)
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn]

# Dictionary mapping each pipeline's position in `pipelines` to a display name
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest",3: "KNeighbors"}

# Fit every pipeline on the training split
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [10]:
# 10-fold cross-validation on the training split.
# The scorer is the *negative* RMSE, so values closer to zero are better.
cv_results_rms = []
for model_name, candidate in zip(pipe_dict.values(), pipelines):
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    cv_results_rms.append(fold_scores)
    print(f"{model_name}: {fold_scores.mean():f} ")

LinearRegression: -16319.587869 
DecisionTree: -14939.750095 
RandomForest: -11046.751090 
KNeighbors: -11575.479072 


In [11]:
# Report each fitted pipeline's score on the held-out test split
# (for regressors, `score` is the coefficient of determination R^2).
fitted_pipelines = {
    "LinearRegression": pipeline_lr,
    "DecisionTreeRegressor": pipeline_dt,
    "RandomForestRegressor": pipeline_rf,
    "KNeighborsRegressor": pipeline_kn,
}
for label, fitted in fitted_pipelines.items():
    print(f"{label}: {fitted.score(X_test, y_test)}")

LinearRegression: 0.5616752908752245
DecisionTreeRegressor: 0.7694413254027737
RandomForestRegressor: 0.824746211970707
KNeighborsRegressor: 0.7474507874959646


### Evaluating the best cross-validated model on the test set
### RandomForest has the best cross-validated score: its negative root mean squared error is the closest to zero of the four models.

In [12]:
# Predict on the held-out test features with the fitted random-forest pipeline,
# then show the predictions as a single-column table.
pred = pipeline_rf.predict(X_test)
pd.DataFrame(pred)

Unnamed: 0,0
0,811.296820
1,1602.599032
2,46725.473083
3,856.442686
4,797.910359
...,...
42622,1076.344667
42623,7362.749394
42624,2370.174500
42625,1349.162667


In [13]:
# Evaluate the random-forest predictions on the test split.
# R^2 and MSE are computed once and reused for the derived metrics.
n_obs = len(y_test)
n_feats = X_test.shape[1]
r2 = metrics.r2_score(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

print("R^2:", r2)
# Adjusted R^2 penalizes R^2 for the number of predictors used.
print("Adjusted R^2:", 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feats - 1))
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

R^2: 0.824746211970707
Adjusted R^2: 0.824729764709853
MAE: 1336.805123341481
MSE: 137276113.8854673
RMSE: 11716.488974324488


In [14]:
# Save the fitted random-forest pipeline (scaler + model) to disk.
import pickle
filename = 'finalized_model_topfeatures4.sav'
# The context manager closes the file handle as soon as the dump finishes
# (or fails), rather than leaving an open handle for the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(pipeline_rf, model_file)