In [8]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [9]:
# Read the table used for training (data.csv) and the table to predict on (test.csv).
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("./diamonds/data.csv", "./diamonds/test.csv")
)

In [10]:
# Preview the raw training frame (the notebook elides the middle rows).
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.20,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.50,5.39,16170
2,0.80,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.40,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.70,698
...,...,...,...,...,...,...,...,...,...,...
40450,1.11,Premium,H,SI1,62.8,61.0,6.63,6.56,4.14,5315
40451,0.73,Ideal,F,VS2,62.6,56.0,5.77,5.74,3.60,2762
40452,1.26,Very Good,I,VS1,59.2,60.0,7.09,7.02,4.18,6855
40453,0.72,Ideal,G,SI2,61.4,56.0,5.76,5.83,3.56,2297


In [13]:
# One-hot encode the string-valued columns (cut, color, clarity) of both frames;
# numeric columns pass through unchanged.
# NOTE(review): each frame is encoded independently, so the resulting dummy
# columns only match if both files contain the same set of category values.
df, df2 = map(pd.get_dummies, (df, df2))

In [14]:
# List the columns after encoding to see the generated dummy-column names.
df.columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'cut_Fair',
       'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D',
       'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J',
       'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1',
       'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')

In [15]:
# Feature matrix: every column except the target. Deriving it from the frame
# (rather than a hand-copied list of dummy-column names) keeps this cell valid
# whenever the encoding step produces a different set of columns.
X = df.drop(columns=["price"])
# Target vector: the price column.
y = df["price"]

In [16]:
# Fixed seed so the randomised estimators (tree, boosting, forest) produce the
# same fit on every run; LinearRegression and KNeighborsRegressor take no seed.
SEED = 42

# Candidate regressors, keyed by the name used when reporting results.
models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=SEED),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=SEED),
    "RandomForestRegressor": RandomForestRegressor(random_state=SEED),
}

# Fit every candidate on the full feature matrix; estimators are fitted in place
# inside the dict so later cells can call .predict on them.
for modelName, model in models.items():
    print(f"Training model: {modelName}")
    model.fit(X, y)

Training model: LinearRegression
Training model: DecisionTreeRegressor
Training model: KNeighborsRegressor
Training model: GradientBoostingRegressor
Training model: RandomForestRegressor




In [18]:
# Side-by-side predictions of every fitted model, one column per model.
# These are predictions on X, the same rows the models were fitted on, so they
# are in-sample values and not an estimate of held-out error.
# Stored under its own name so the encoded training frame `df` is not
# overwritten and earlier cells stay re-runnable.
train_predictions = pd.DataFrame(
    {modelName: model.predict(X) for modelName, model in models.items()}
)
# Ground-truth prices, re-indexed positionally so they line up with the
# 0..n-1 index of the prediction columns.
train_predictions["gt"] = y.reset_index(drop=True)
train_predictions

Unnamed: 0,LinearRegression,DecisionTreeRegressor,KNeighborsRegressor,GradientBoostingRegressor,RandomForestRegressor,gt
0,16557.964077,12831.0,15016.6,15842.267050,13775.8,12831
1,17657.731689,16170.0,13877.8,15423.688326,16003.0,16170
2,2761.016962,2797.0,2661.6,2762.866964,2795.3,2797
3,-3196.317795,630.0,731.2,221.130987,718.9,630
4,383.214890,698.0,649.0,791.572969,702.3,698
...,...,...,...,...,...,...
40450,5679.912046,5315.0,4943.6,4952.895801,5189.9,5315
40451,3807.613827,2762.0,2759.4,3097.960363,2848.5,2762
40452,7539.648581,6855.0,7202.4,7101.929944,6580.1,6855
40453,2008.660191,2297.0,2481.6,2418.006630,2374.9,2297


In [10]:
# Final model: a 1000-tree random forest fitted on the full training set.
# random_state pins the bootstrap and feature sampling so reruns reproduce the
# same predictions; n_jobs=-1 builds trees on all cores without changing them.
reg = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1).fit(X, y)

In [12]:
# Predict on the training features themselves (in-sample predictions).
predicted = reg.predict(X)

In [14]:
# R^2 of the predictions against y.
# NOTE(review): `predicted` was produced from X, the data the model was fitted
# on, so this is a training-set score rather than a held-out one.
r2_score(y, predicted)

0.9974493113739823

In [16]:
# Root-mean-squared error of the predictions, in the same units as y.
sqrt(mean_squared_error(y, predicted))

202.03225486919905

In [19]:
# Remove the identifier column so it is not fed to the model as a feature.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell safe
# to re-run: a second execution no longer raises KeyError once "id" is gone.
df2 = df2.drop(columns="id", errors="ignore")

In [20]:
# The test frame was one-hot encoded separately from the training frame, so its
# columns may be missing a category or be ordered differently. Align it to the
# exact training feature columns before predicting: absent columns are filled
# with 0 and any column the model was not trained on is discarded.
predicted = reg.predict(df2.reindex(columns=X.columns, fill_value=0))

In [21]:
# Inspect the array of predicted prices for the test rows.
predicted

array([ 1334.309,  6386.424,  1720.663, ..., 11223.286,   501.127,
        1234.057])

In [22]:
# Wrap the prediction array in a single-column frame (the column is labelled 0).
submit = pd.DataFrame(predicted)

In [25]:
# Turn the positional row index into a regular column named "index".
# NOTE(review): this numbers rows 0..n-1; confirm that matches the ids in test.csv.
submit = submit.reset_index()

In [29]:
# Give the columns their submission names: the former index becomes "id" and
# the prediction column (labelled 0) becomes "price".
column_names = {"index": "id", 0: "price"}
submit = submit.rename(columns=column_names)

In [30]:
# Preview the submission table.
submit

Unnamed: 0,id,price
0,0,1334.309
1,1,6386.424
2,2,1720.663
3,3,4062.387
4,4,1744.778
...,...,...
13480,13480,2299.859
13481,13481,2127.253
13482,13482,11223.286
13483,13483,501.127


In [31]:
# Make "id" the index so that to_csv writes it as the leading column instead of
# emitting a separate unnamed index column.
submit = submit.set_index("id")

In [34]:
# Write the submission file; the frame's index is written as the first column.
submit.to_csv("./predictions.csv")