In [1]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import pickle

from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv("data/diamonds_train.csv", index_col=0)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.70,4.23,6134
1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.10,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997
...,...,...,...,...,...,...,...,...,...,...
40340,1.55,Premium,H,VS2,61.3,61.0,7.46,7.39,4.55,11708
40341,0.36,Ideal,D,SI1,60.6,56.0,4.58,4.63,2.79,619
40342,0.57,Very Good,I,VS2,62.2,55.0,5.33,5.34,3.32,1267
40343,1.01,Very Good,F,IF,59.6,62.0,6.47,6.56,3.88,9965


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40345 entries, 0 to 40344
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40345 non-null  float64
 1   cut      40345 non-null  object 
 2   color    40345 non-null  object 
 3   clarity  40345 non-null  object 
 4   depth    40345 non-null  float64
 5   table    40345 non-null  float64
 6   x        40345 non-null  float64
 7   y        40345 non-null  float64
 8   z        40345 non-null  float64
 9   price    40345 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 3.4+ MB


In [5]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


## 1. Data transformation

### Cut property

In [6]:
df["cut"].unique()

array(['Ideal', 'Very Good', 'Premium', 'Good', 'Fair'], dtype=object)

In [7]:
# The cut order from high to low is: Ideal, Premium, very good, good, fair
# LabelEncoder assigns values by alphabetical order so instead a lambda is used to encode this column


df["cut"] = df["cut"].replace({"Ideal": 5, "Premium": 4, "Very Good": 3, "Good": 2, "Fair": 1})

### Color property

In [8]:
df["color"].unique()

array(['H', 'D', 'F', 'G', 'I', 'E', 'J'], dtype=object)

In [9]:
# A higher price is linked to the alphabetical order of the property 'color'. However, there is a big price difference beteween D and E, and F and G, so we'll increase the value difference between some colors before training the model

df["color"] = df["color"].replace({"D": 7, "E": 6, "F": 5, "G": 4, "H": 3, "I":2, "J":1})

### clarity 

In [10]:
df["clarity"].unique()

array(['VS2', 'VVS2', 'VS1', 'IF', 'SI1', 'SI2', 'VVS1', 'I1'],
      dtype=object)

In [11]:
# Order considering a higer to lower price: IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1
df["clarity"] = df["clarity"].replace({"I1": 1, "VVS1": 2, "SI2": 3, "SI1": 4, "IF": 5, "VS1":6, "VVS2":7, "VS2":8})

## 2. Defining X and y

In [12]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,5,3,8,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,3,7,7,64.0,56.0,4.14,4.17,2.66,532
2,0.42,4,5,6,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,5,3,5,61.1,57.0,4.16,4.12,2.53,600
4,1.1,2,4,4,63.4,57.0,6.52,6.55,4.14,4997


In [14]:
# As cut will be used when training the model, we can ignore the table and depth as this column is based on those two

In [13]:
X = df.drop(["price"], 1)
y = df["price"]

In [14]:
X.shape

(40345, 9)

In [15]:
y.shape

(40345,)

## 3. Dividing X_train, X_test, y_train, y_test

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 4)

## 4. Model selection

### Ensembling

In [121]:
# Voting
from sklearn.ensemble import VotingClassifier

poly_clf = PolynomialRegression()
rnd_clf = RandomForestRegressor(n_estimators=100, random_state=4)
svr_clf = SVR()
cat_clf = CatBoostRegressor(random_state=42)

estimators = [("poly", poly_clf), ("forest", rnd_clf), ("svr", svr_clf), ("cat", cat_clf)]

voting_clf = VotingClassifier(estimators)
voting_clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

for clf in (poly_clf, rnd_clf, svr_clf, cat_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, np.sqrt(mean_squared_error(y_test, predictions)))


ValueError: The estimator Pipeline should be a classifier.

### Random Forest

In [22]:
pipe = Pipeline(steps=[
    ("classifier", RandomForestRegressor())
])

random_forest_params = {
    "classifier": [RandomForestRegressor()],
    "classifier__random_state": [4],
    "classifier__n_estimators": [100,500],
    "classifier__min_samples_split": [6,8,10],
    "classifier__min_samples_leaf": [1,2,3]
}

search_space = [random_forest_params]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 10,
                  scoring = "r2",
                  n_jobs=-1)

clf.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestRegressor())]),
             n_jobs=-1,
             param_grid=[{'classifier': [RandomForestRegressor(min_samples_split=8,
                                                               n_estimators=500,
                                                               random_state=4)],
                          'classifier__min_samples_leaf': [1, 2, 3],
                          'classifier__min_samples_split': [6, 8, 10],
                          'classifier__n_estimators': [100, 500],
                          'classifier__random_state': [4]}],
             scoring='r2')

In [23]:
clf = GridSearchCV(estimator=pipe, param_grid=search_space)

In [24]:
%%time
# Fit grid search
best_model = clf.fit(X_train, y_train)
# View best model
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print("clf.best_params_", clf.best_params_)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'model_random_forest_1.sav'
pickle.dump(best_model, open(filename, 'wb'))

best estimator: RandomForestRegressor(min_samples_split=8, n_estimators=500, random_state=4)
clf.best_params_ {'classifier': RandomForestRegressor(min_samples_split=8, n_estimators=500, random_state=4), 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 8, 'classifier__n_estimators': 500, 'classifier__random_state': 4}
clf.best_score 0.9803516637924055
Wall time: 54min 5s


### Polynominal 

In [107]:
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree), LinearRegression(**kwargs))

param_grid = {
    "polynomialfeatures__degree": np.arange(10),
    #"linearregression__fit_intercept": [True, False], 
    #"linearregression__normalize": [True, False]
}

poly_grid = GridSearchCV(PolynomialRegression(), 
                param_grid, 
                cv=10, 
                scoring="neg_mean_squared_error",
                verbose=1,
                n_jobs=-1)

clf_poly.fit(X_train, y_train)
clf_poly = GridSearchCV(PolynomialRegression(), param_grid)

In [108]:
%%time
# Fit grid search
best_model_poly = clf_poly.fit(X_train, y_train)
# View best model
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print("clf.best_params_", clf_poly.best_params_)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf_poly.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'model_poly_1.sav'
pickle.dump(best_model_poly, open(filename, 'wb'))

Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\bas

### SVR

In [109]:
pipe = Pipeline(steps=[
    ("classifier", SVR)
])

svr_params = {
    "classifier": [SVR()],
    "classifier__kernel": ("linear", "rbf", "poly")
    #"classifier__C": [1, 10, 100, 1000],
    
}

clf_svr = GridSearchCV(estimator = pipe,
                  param_grid = svr_params,
                  cv = 10,
                  verbose=1,
                  n_jobs=-1)

clf_svr.fit(X_train, y_train)
clf_svr = GridSearchCV(estimator=pipe, param_grid=search_space)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


In [110]:
%%time
# Fit grid search
best_model_svr = clf_svr.fit(X_train, y_train)
# View best model
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print("clf.best_params_", clf_svr.best_params_)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf_svr.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'model_svr_1.sav'
pickle.dump(best_model_svr, open(filename, 'wb'))

best estimator: LinearRegression()
clf.best_params_ {'classifier': LinearRegression()}
clf.best_score 0.8784415219930309
Wall time: 56 ms


### Catboost

In [111]:
pipe = Pipeline(steps=[
    ("classifier", CatBoostRegressor())
])

catboost_params = {
    "classifier": [CatBoostRegressor()],
    "classifier__iterations": [2, 4],
    "classifier__depth": [2,3],
    "classifier__learning_rate": [1,2]

}
                
clf_cat = GridSearchCV(estimator = pipe,
                  param_grid = catboost_params,
                  cv = 10,
                  verbose=1,
                  n_jobs=-1)

clf_cat.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
0:	learn: 1705.4355993	total: 143ms	remaining: 429ms
1:	learn: 1493.3195033	total: 146ms	remaining: 146ms
2:	learn: 1377.5186870	total: 149ms	remaining: 49.5ms
3:	learn: 1291.8814477	total: 151ms	remaining: 0us


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        <catboost.core.CatBoostRegressor object at 0x00000236C9DE5148>)]),
             n_jobs=-1,
             param_grid={'classifier': [<catboost.core.CatBoostRegressor object at 0x00000236C9DE5348>],
                         'classifier__depth': [2, 3],
                         'classifier__iterations': [2, 4],
                         'classifier__learning_rate': [1, 2]},
             verbose=1)

In [112]:
%%time
# Fit grid search
best_model_cat = clf_cat.fit(X_train, y_train)
# View best model
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print("clf.best_params_", clf_cat.best_params_)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf_cat.best_score_)
#SAVE MODEL
# save the model to disk
filename = 'model_cat_1.sav'
pickle.dump(best_model_cat, open(filename, 'wb'))

Fitting 10 folds for each of 8 candidates, totalling 80 fits
0:	learn: 1705.4355993	total: 16.9ms	remaining: 50.7ms
1:	learn: 1493.3195033	total: 19.9ms	remaining: 19.9ms
2:	learn: 1377.5186870	total: 23.1ms	remaining: 7.71ms
3:	learn: 1291.8814477	total: 26.2ms	remaining: 0us
best estimator: LinearRegression()
clf.best_params_ {'classifier': <catboost.core.CatBoostRegressor object at 0x00000236C9DE5348>, 'classifier__depth': 3, 'classifier__iterations': 4, 'classifier__learning_rate': 1}
clf.best_score 0.9080833704257048
Wall time: 1.27 s


In [None]:
# The model with the best score is RandomForest, so we find its hyperparameters

In [25]:
with open("model_random_forest_1.sav","rb") as f:
    loaded_model = pickle.load(f)

## 5. Predictions with X_test

In [26]:
predictions = loaded_model.predict(X_test)
print(predictions)

[ 806.50372157 8600.00738311 1200.35565807 ... 3209.63049135 1098.53669906
  733.32505205]


## 6. MAE

In [27]:
np.sqrt(mean_squared_error(y_test, predictions))

554.7035419623857

## 7. Train the model with all available data

In [34]:
X_pred = pd.read_csv("data/diamonds_test.csv", index_col = 0)
X_pred.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.3,Ideal,H,SI2,60.0,56.0,4.41,4.43,2.65
1,0.34,Ideal,D,IF,62.1,57.0,4.52,4.46,2.79
2,1.57,Very Good,I,VS2,60.3,58.0,7.58,7.55,4.56
3,0.31,Ideal,H,VS2,61.8,57.0,4.32,4.36,2.68
4,1.51,Good,I,VVS1,64.0,60.0,7.26,7.21,4.63


In [41]:
# Same transformations than before to X_pred are carried out

X_pred["cut"] = X_pred["cut"].replace({"Ideal": 5, "Premium": 4, "Very Good": 3, "Good": 2, "Fair": 1})

X_pred["cut"] = X_pred["cut"]*2

X_pred["color"] = X_pred["color"].replace({"D": 13, "E": 9, "F": 8, "G": 4, "H": 3, "I":2, "J":1})

X_pred["clarity"] = X_pred["clarity"].replace({"I1": 1, "VVS1": 2, "SI2": 3, "SI1": 4, "IF": 5, "VS1":6, "VVS2":7, "VS2":8})

X_pred = X_pred.drop(["depth", "table"], 1)

## 8. Prediction with all data

In [44]:
predictions_submit = best_model.predict(X_pred)
predictions_submit

array([ 7786.07089318,  5735.87015654, 16954.72889085, ...,
       19464.847141  , 14022.0573367 ,  9946.39476437])

## **Submission to Kaggle**


In [45]:
sample = pd.read_csv("data/sample_submission.csv")

In [46]:
sample.head()

Unnamed: 0,id,price
0,0,12132
1,1,11786
2,2,14684
3,3,15425
4,4,6724


In [47]:
submission = pd.DataFrame({"id": range(len(predictions_submit)), "price": predictions_submit})
submission

Unnamed: 0,id,price
0,0,7786.070893
1,1,5735.870157
2,2,16954.728891
3,3,8971.866832
4,4,11586.665944
...,...,...
13444,13444,12057.971201
13445,13445,6602.209372
13446,13446,19464.847141
13447,13447,14022.057337


## 5. Pásale el CHEQUEATOR para comprobar que efectivamente está listo para subir a Kaggle.


In [48]:
def chequeator(df_to_submit):
    """
    Esta función se asegura de que tu submission tenga la forma requerida por Kaggle.
    
    Si es así, se guardará el dataframe en un `csv` y estará listo para subir a Kaggle.
    
    Si no, LEE EL MENSAJE Y HAZLE CASO.
    
    Si aún no:
    - apaga tu ordenador, 
    - date una vuelta, 
    - enciendelo otra vez, 
    - abre este notebook y 
    - leelo todo de nuevo. 
    Todos nos merecemos una segunda oportunidad. También tú.
    """
    if df_to_submit.shape == sample.shape:
        if df_to_submit.columns.all() == sample.columns.all():
            if df_to_submit.id.all() == sample.id.all():
                print("You're ready to submit!")
                submission.to_csv("submission.csv", index = False) #muy importante el index = False
                urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")     
                img = Image.open("gfg.png")
                img.show()   
            else:
                print("Check the ids and try again")
        else:
            print("Check the names of the columns and try again")
    else:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")

In [49]:
chequeator(submission)

You're ready to submit!
