In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
#import xgboost as xgb

In [2]:
train_data = pd.read_csv('diamonds_train.csv')
test_data = pd.read_csv('diamonds_test.csv')

In [3]:
train_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95


In [4]:
train_data['volume'] = train_data['x'] * train_data['y'] * train_data['z']
test_data['volume'] = test_data['x'] * test_data['y'] * test_data['z']

In [5]:
train_data.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
volume     0
dtype: int64

In [6]:
train_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25,197.096725
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75,52.39575
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65,113.43689
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.0,66.2688
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95,168.429975


In [7]:
cortes = {'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1}
colores = {'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1}
claridad = {'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I1': 1}

In [8]:
train_data['cut'] = train_data['cut'].apply(lambda x: cortes[x])
test_data['cut'] = test_data['cut'].apply(lambda x: cortes[x])
train_data['color'] = train_data['color'].apply(lambda x: colores[x])
test_data['color'] = test_data['color'].apply(lambda x: colores[x])
train_data['clarity'] = train_data['clarity'].apply(lambda x: claridad[x])
test_data['clarity'] = test_data['clarity'].apply(lambda x: claridad[x])

In [9]:
train_data.clarity.value_counts()

3    9749
4    9248
2    6929
5    6066
6    3799
7    2774
8    1327
1     563
Name: clarity, dtype: int64

In [10]:
X = train_data[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'volume']]
Y = train_data['price']

In [11]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [12]:
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.25, random_state=42)

In [19]:
model_f = RandomForestRegressor(n_estimators=1000)

In [20]:
model_f.fit(trainX, trainY)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [21]:
predict_f = model_f.predict(testX)

In [22]:
model_f.score(testX, testY)

0.9817433038751202

In [23]:
test_df = test_data[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'volume']]
test_df = sc.transform(test_df)
final_predictions = model_f.predict(test_df)
test_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,volume
0,0,0.79,3,5,3,62.7,60.0,5.82,5.89,3.67,125.806866
1,1,1.2,5,1,5,61.0,57.0,6.81,6.89,4.18,196.129362
2,2,1.57,4,3,3,62.2,61.0,7.38,7.32,4.57,246.878712
3,3,0.9,3,5,3,63.8,54.0,6.09,6.13,3.9,145.59363
4,4,0.5,3,5,5,62.9,58.0,5.05,5.09,3.19,81.997355


In [24]:
submission = pd.DataFrame({
        "id": test_data["id"],
        "price": final_predictions
    })
submission.to_csv('submission.csv', index=False)