In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
#import xgboost as xgb

In [2]:
# Read the training and test tables from CSV files given by relative path.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('diamonds_train.csv', 'diamonds_test.csv')
)

In [3]:
# Display the first five rows of the freshly loaded training table.
train_data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [4]:
# Add a `volume` column (x * y * z) to both tables, in place.
for frame in (train_data, test_data):
    frame['volume'] = frame['x'].mul(frame['y']).mul(frame['z'])

In [5]:
# Count missing values per column of the training table.
train_data.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
volume     0
dtype: int64

In [6]:
# Display the first five rows again, now including the `volume` column.
train_data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,197.096725
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,52.39575
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,113.43689
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,66.2688
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,168.429975


In [7]:
# A volume of exactly 0 comes from a 0 in x, y or z (volume = x * y * z), so it
# is treated as a placeholder and replaced by a typical value.
# The fill value is the training mean over the NON-ZERO volumes only: averaging
# the whole column would let the very zeros being replaced pull the fill value
# down.
# The training-set value is reused for the test table so both tables receive
# the same fill value.
mean = train_data.loc[train_data['volume'] != 0, 'volume'].mean()
train_data = train_data.replace({'volume': {0: mean}})
test_data = test_data.replace({'volume': {0: mean}})

In [8]:
# Integer codes for the three categorical columns (cut, color, clarity).
# Each list is written from the label that receives the largest code down to
# the label that receives 1 -- presumably best-to-worst grade; confirm against
# the dataset description.
cortes = dict(zip(['Ideal', 'Premium', 'Very Good', 'Good', 'Fair'], range(5, 0, -1)))
colores = dict(zip(['D', 'E', 'F', 'G', 'H', 'I', 'J'], range(7, 0, -1)))
claridad = dict(zip(['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1'], range(8, 0, -1)))

In [9]:
# Replace the text labels in cut / color / clarity with their integer codes,
# in both tables. Dict indexing is used on purpose: a label that is missing
# from the mapping raises KeyError instead of silently becoming NaN.
# Running this cell a second time therefore fails, because the columns then
# already hold integers.
ordinal_maps = {'cut': cortes, 'color': colores, 'clarity': claridad}
for column, codes in ordinal_maps.items():
    for frame in (train_data, test_data):
        frame[column] = frame[column].apply(lambda label, codes=codes: codes[label])

In [24]:
# Predictor matrix and target vector for training.
# The `volume` column created earlier is not part of this feature list.
feature_cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
X = train_data[feature_cols]
Y = train_data['price']

In [102]:
# Standardise the training features with a freshly fitted StandardScaler.
# The scaler is fitted on the raw columns of `train_data`, not on `X` itself,
# so this cell is idempotent: fitting on `X` would, on a second run, re-fit the
# scaler on already-scaled values and `sc.transform(...)` would then no longer
# map raw test rows into the space the model was trained on.
# The column list must stay in sync with the one used to build the test matrix.
sc = StandardScaler()
X = sc.fit_transform(
    train_data[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']]
)

In [96]:
# Disabled experiment, kept as a bare string literal: nothing inside it runs,
# and the cell merely evaluates to (and displays) the string.
# The text sketches a sweep of n_estimators over range(2000, 3000, 100) that
# fits on X/Y, predicts on X and plots one RMSE per setting.
# NOTE(review): it compares predictions on X against `testY`, a name that no
# visible cell defines, so it would need rework before being re-enabled.
'''
def numEstimator():
    rmse_lst=[]
    for i in range(2000, 3000, 100):
        model_f = GradientBoostingRegressor(n_estimators=i, max_depth=5, learning_rate=0.01, loss="ls")
        model_f.fit(X, Y)
        predict_f = model_f.predict(X)
        r2=model_f.score(X, Y)
        rmse=mean_squared_error(predict_f, testY)**0.5
        rmse_lst.append(rmse)
    plt.plot(rmse_lst)
    return plt.show()
'''

'\ndef numEstimator():\n    rmse_lst=[]\n    for i in range(2000, 3000, 100):\n        model_f = GradientBoostingRegressor(n_estimators=i, max_depth=5, learning_rate=0.01, loss="ls")\n        model_f.fit(X, Y)\n        predict_f = model_f.predict(X)\n        r2=model_f.score(X, Y)\n        rmse=mean_squared_error(predict_f, testY)**0.5\n        rmse_lst.append(rmse)\n    plt.plot(rmse_lst)\n    return plt.show()\n'

In [97]:
# Gradient-boosted regression trees with a fixed seed.
# NOTE(review): learning_rate=0.9 is far above scikit-learn's default of 0.1;
# combined with 1300 trees of depth 8 this looks prone to overfitting --
# confirm the setting against held-out data.
gbr_params = {
    'n_estimators': 1300,
    'max_depth': 8,
    'learning_rate': 0.9,
    'random_state': 42,
}
model_f = GradientBoostingRegressor(**gbr_params)

In [98]:
# Train on the full feature matrix; the fitted estimator is the cell's output.
model_f.fit(X, y=Y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.9, loss='ls', max_depth=8,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1300,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [99]:
# Predictions for X, i.e. for the same rows the model was fitted on.
predict_f = model_f.predict(X=X)

In [100]:
# Coefficient of determination on (X, Y). This is the data the model was
# fitted on, so the value is an in-sample score, not a generalisation estimate.
r2 = model_f.score(X=X, y=Y)
print(r2)

0.9999951577644166


In [101]:
# Root-mean-squared error of the predictions in `predict_f` against Y.
# Arguments are passed in the documented (y_true, y_pred) order; the value is
# unchanged because squared error is symmetric, but the call now matches the
# API and stays correct if swapped for an asymmetric metric.
# `predict_f` was computed on the training matrix, so this is a training RMSE.
rmse = mean_squared_error(Y, predict_f) ** 0.5
print(rmse)

8.785235201286756


In [19]:
# Build the test feature matrix with the same columns, in the same order, as
# the training matrix, scale it with the already-fitted scaler `sc`, and
# predict prices with the fitted model.
test_feature_cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
test_df = sc.transform(test_data[test_feature_cols])
final_predictions = model_f.predict(test_df)

In [20]:
# Pair each test row's `id` with its predicted price and write the result to
# 'submission.csv' without the DataFrame index.
submission_columns = {"id": test_data["id"], "price": final_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)