# **Diamonds price prediction**

In [None]:
import numpy as np
import pandas as pd


import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read both competition files: `diamonds` is the frame the models are fitted on,
# `diamonds_predict` is the frame predictions are generated for.
diamonds, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/diamonds/diamonds_train.csv',
                     '../input/diamonds/diamonds_predict.csv')
)

In [None]:
# Some rows have a dimension (x, y or z) equal to 0; remove every such row
# from the training frame in place.
has_zero_dim = (diamonds[['x', 'y', 'z']] == 0).any(axis=1)
diamonds.drop(index=diamonds.index[has_zero_dim], inplace=True)

In [None]:
# Remove hand-picked rows from the training frame by index label.
# NOTE(review): 14708 is presumably the row holding the maximum `y`, found
# with `diamonds[diamonds['y'] == diamonds['y'].max()]` (that lookup used to sit
# here but its result was discarded, so it has been removed). The notebook
# gives no reason for the other labels - confirm they are intended outliers.
OUTLIER_ROWS = [14708, 31083, 27893, 3468, 23609, 2884]
diamonds.drop(index=OUTLIER_ROWS, inplace=True)

In [None]:
# Feature engineering on the training frame:
#   sintetica = depth / table * carat
#   volumen   = x * y * z
# and the three descriptive columns are converted to the pandas `category` dtype.
diamonds['sintetica'] = diamonds['depth'].div(diamonds['table']).mul(diamonds['carat'])
diamonds['volumen'] = diamonds['x'].mul(diamonds['y']).mul(diamonds['z'])

cat_cols = ['cut', 'clarity', 'color']
diamonds[cat_cols] = diamonds[cat_cols].astype('category')

In [None]:
# Spot check: how many diamonds have carat == 1.21 (label lookup on the counts).
diamonds['carat'].value_counts().loc[1.21]

In [None]:
# Frequency of every distinct carat value.
diamonds.loc[:, 'carat'].value_counts()

In [None]:
# `bins` = number of diamonds that share the row's carat value.
# The counts are computed once and then looked up per row with `map`; the
# previous list comprehension recomputed `value_counts()` for every single row.
carat_counts = diamonds['carat'].value_counts()
diamonds['bins'] = diamonds['carat'].map(carat_counts)

In [None]:
# Treat each distinct count as a class label so it can be the classifier target.
diamonds['bins'] = diamonds['bins'].astype(pd.CategoricalDtype())

In [None]:
# Display the column dtypes to verify the conversions made in the cells above.
diamonds.dtypes

In [None]:
# Column roles for the `bins` classifier.
NUM_FEATS = [
    'carat',
    'depth',
    'table',
    'sintetica',
    'volumen',
]
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = [*NUM_FEATS, *CAT_FEATS]  # numeric columns first, then categorical
TARGET = 'bins'

In [None]:
# Numeric columns: fill gaps with the median, then scale with RobustScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, NUM_FEATS),
    ('cat', categorical_transformer, CAT_FEATS),
])

In [None]:
# A decision-tree classifier predicts the `bins` tag for each diamond.
# The final step is named 'regressor' although it holds a classifier; the name
# is kept because the hyper-parameter grid keys are prefixed with it.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeClassifier()),
])

In [None]:
# Hyper-parameter grid for the classifier pipeline.
# Keys use the `<step>__<param>` form, so both model and preprocessing
# parameters can be searched, e.g.
#     'preprocessor__num__imputer__strategy': ['mean', 'median'],
# `ccp_alpha` and `class_weight` are deliberately not searched.
# Only `splitter` actually varies; every other entry pins a single value.
param_grid = {
    'regressor__criterion': ['entropy'],
    'regressor__max_depth': [None],
    'regressor__max_features': [None],
    'regressor__max_leaf_nodes': [None],
    'regressor__min_impurity_decrease': [0.0],
    'regressor__min_samples_leaf': [1],
    'regressor__min_samples_split': [2],
    'regressor__min_weight_fraction_leaf': [0.0],
    # Fixed seed: with random_state=None the 'random' splitter (and tie
    # breaking in 'best') gives different trees on every run.
    'regressor__random_state': [42],
    'regressor__splitter': ['best', 'random'],
}

In [None]:
# 5-fold exhaustive search over `param_grid`, using the estimator's default
# scorer (scoring=None) and refitting the best candidate on all the data.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring=None,
    refit=True,
    return_train_score=True,
    error_score=np.nan,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
    verbose=10,
)

In [None]:
# Run the search: features are the FEATS columns, labels are the `bins` classes.
grid_search.fit(X=diamonds[FEATS], y=diamonds[TARGET])

In [None]:
# Mean cross-validated score of the best parameter combination found.
grid_search.best_score_

In [None]:
# 5-fold cross-validation of `model`, i.e. the pipeline with its default
# tree settings - not the refitted best estimator held by `grid_search`.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    cv=5,
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Per-fold scores from the cross-validation above.
scores

In [None]:
# Average score across the folds.
scores.mean()

**AÑADIMOS LA PREDICCIÓN DE LA ETIQUETA AL DIAMONDS PREDICT**

In [None]:
# Same feature engineering as on the training frame, applied to the frame to
# predict: sintetica = depth / table * carat, volumen = x * y * z, and the
# descriptive columns converted to the `category` dtype.
diamonds_predict['sintetica'] = (
    diamonds_predict['depth'].div(diamonds_predict['table']).mul(diamonds_predict['carat'])
)
diamonds_predict['volumen'] = (
    diamonds_predict['x'].mul(diamonds_predict['y']).mul(diamonds_predict['z'])
)

predict_cat_cols = ['cut', 'clarity', 'color']
diamonds_predict[predict_cat_cols] = diamonds_predict[predict_cat_cols].astype('category')

In [None]:
# Predict the `bins` class for every row to predict, using the refitted
# best estimator, and pair each prediction with the row's id.
y_pred = grid_search.predict(diamonds_predict[FEATS])
bins_pred_df = diamonds_predict[['id']].assign(bins=y_pred)

In [None]:
# Build a NEW frame holding the prediction rows plus the predicted `bins`.
# A plain `= diamonds_predict` would only alias the original frame, so later
# in-place edits (e.g. set_index) would silently change `diamonds_predict` too
# and break re-running the earlier cells.
# Note: `bins` is cast to int64 so it can be tried as a numeric feature.
diamonds_predict_with_bins = diamonds_predict.assign(
    bins=bins_pred_df['bins'].astype('int64')
)

In [None]:
# Move `id` out of the columns and into the index (in place).
diamonds_predict_with_bins.set_index(keys='id', inplace=True)

In [None]:
# Preview the first rows of the prediction frame, now indexed by id.
diamonds_predict_with_bins.head()

**AHORA EL REGRESSOR**

In [None]:
# Using LGBM for regression on the price feature

In [None]:
# Cast `bins` to int so it can be tried as a numeric feature; the same cast
# is applied to the prediction frame in an earlier cell - keep both in sync.
diamonds['bins'] = diamonds['bins'].astype(np.int64)

In [None]:
# Column roles for the price regressor. `bins` is used as a numeric feature
# here and 'table' is left out of the feature set.
NUM_FEATS = [
    'carat',
    'sintetica',
    'depth',
    'volumen',
    'bins',
]
CAT_FEATS = ['cut', 'color', 'clarity']
FEATS = [*NUM_FEATS, *CAT_FEATS]  # numeric columns first, then categorical
TARGET = 'price'

In [None]:
# Hold out part of the data (default split size) for early stopping/validation.
# A fixed seed makes the split - and every score derived from it - reproducible
# across kernel restarts.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)

In [None]:
# LightGBM regressor for the diamond price.
#
# Changes from the earlier definition (effective configuration is unchanged):
#   * `num_iterations=25000` and `n_estimators=512` were both passed; they are
#     aliases of the same setting, so only one could apply. The single
#     `n_estimators=25000` below keeps the larger budget.
#     NOTE(review): confirm 25000 (not 512) was the intended round limit.
#   * `task='predict'` was dropped - LightGBM documents `task` as a CLI-only
#     parameter.
# Values noted in the trailing comments are alternatives that were tried.
gbm = lgb.LGBMRegressor(
    boosting_type='gbdt',       # 'dart' was also tried
    objective='regression',     # huber and MAE objectives were tried and rejected
    metric='mae',               # rmse was also tried
    learning_rate=0.002,
    n_estimators=25000,         # upper bound on boosting rounds (15000 also tried)
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=20,
    max_depth=50,               # 7 also tried
    num_leaves=20,              # 10 also tried
    min_data_in_leaf=1,
    max_bin=1973,               # 256 / 512 / 1024 also tried
    extra_trees=True,
    path_smooth=0.1,
    verbose=0,
    n_jobs=-1,
)

In [None]:
# Fit the regressor on the training split, monitoring RMSE on the held-out
# split and stopping once it has not improved for 10 consecutive rounds.
# Early stopping is requested through the `lgb.early_stopping` callback: the
# `early_stopping_rounds` fit keyword is deprecated and no longer accepted by
# newer LightGBM releases, while the callback works in both.
# `fit` returns the estimator itself, so `model` and `gbm` are the same object.
# NOTE(review): the held-out split drives early stopping, so the score read
# from it afterwards is optimistic.
model = gbm.fit(
    diamonds_train[FEATS],
    diamonds_train[TARGET],
    eval_set=[(diamonds_test[FEATS], diamonds_test[TARGET])],
    eval_metric='rmse',
    categorical_feature=CAT_FEATS,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

In [None]:
# RMSE on the held-out split ('valid_0' is the first/only eval_set entry).
model.best_score_['valid_0']['rmse']

In [None]:
# 5-fold cross-validated RMSE of the regressor on the full training frame.
# scikit-learn reports it negated (higher is better), hence 'neg_' in the name.
# NOTE(review): the eval_set / early-stopping arguments used in the fit cell are
# not passed here, so each fold presumably trains for the full round budget.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=10,
)

In [None]:
# Per-fold negated RMSE values.
scores

array([-533.37749846, -529.57757024, -545.78089278, -558.67492903,
       -527.68610018])

In [None]:
 # Flip the sign back to get the mean RMSE across folds.
 np.mean(-scores)

In [None]:
# Difference between the held-out RMSE and the mean cross-validated RMSE.
model.best_score_['valid_0']['rmse'] -  np.mean(-scores)

**SUBMISSION**

In [None]:
# Predict prices for the rows to submit and pair each one with its id
# (the id lives in the frame's index at this point).
y_pred = model.predict(diamonds_predict_with_bins[FEATS])
submission_df = pd.DataFrame(dict(id=diamonds_predict_with_bins.index, price=y_pred))

In [None]:
# Show the submission frame (id, price).
submission_df

In [None]:
# Summary statistics as a sanity check on the predicted prices.
submission_df.describe()

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='diamonds_rf.csv', index=False)