In [21]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [3]:
diamonds_dummy = pd.read_csv('./data/diamonds_dummy.csv')

In [4]:
diamonds_dummy.head()

Unnamed: 0,price,carat,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,...,color_G,color_H,color_I,color_J,clarity_SI,clarity_VSI,clarity_VVSI,continent_America,continent_Asia,continent_Europe
0,4268,1.21,58.0,6.83,6.79,4.25,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,505,0.32,57.0,4.35,4.38,2.75,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2686,0.71,55.0,5.62,5.53,3.65,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,738,0.41,56.0,4.68,4.72,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4882,1.02,59.0,6.55,6.51,3.95,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [5]:
diamonds_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   price              40455 non-null  int64  
 1   carat              40455 non-null  float64
 2   table              40455 non-null  float64
 3   x                  40455 non-null  float64
 4   y                  40455 non-null  float64
 5   z                  40455 non-null  float64
 6   cut_Good           40455 non-null  float64
 7   cut_Ideal          40455 non-null  float64
 8   cut_Premium        40455 non-null  float64
 9   cut_Very Good      40455 non-null  float64
 10  color_E            40455 non-null  float64
 11  color_F            40455 non-null  float64
 12  color_G            40455 non-null  float64
 13  color_H            40455 non-null  float64
 14  color_I            40455 non-null  float64
 15  color_J            40455 non-null  float64
 16  clarity_SI         404

## ⚠️⚠️ DECISION

In [None]:
# Vamos a escalar los datos utilizando Robust Scaler
# ¿Por qué? Es la técnica más robusta frente a valores atípicos y no se ve tan afectada por valores extremos 
# Como price tiene valores outliers, creo que Robust Scaler es the best option

In [6]:
# Antes de escalar, hacemos el separado de features y target
X = diamonds_dummy.drop('price', axis=1)
y = diamonds_dummy['price']

In [7]:
print(X.shape)
print(y.shape)

(40455, 21)
(40455,)


In [8]:
robust_scaler = RobustScaler()
robust_diamonds = robust_scaler.fit_transform(X)
robust_diamonds

array([[ 0.796875  ,  0.33333333,  0.62295082, ...,  0.        ,
         1.        ,  0.        ],
       [-0.59375   ,  0.        , -0.73224044, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.015625  , -0.66666667, -0.03825137, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.484375  , -0.33333333,  0.3715847 , ...,  0.        ,
         0.        ,  0.        ],
       [-0.578125  , -0.9       , -0.67759563, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.84375   ,  0.33333333,  0.62295082, ...,  0.        ,
         0.        ,  1.        ]])

## ⚠️⚠️ DECISION

In [9]:
# Train, test y split por el 80% de los datos

In [12]:
# Train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

X_train: (32364, 21), X_test: (8091, 21), y_train: (32364,), y_test: (8091,)


In [35]:
type(X_train)

pandas.core.frame.DataFrame

## ⚠️⚠️ DECISION

In [13]:
# Primera prueba con LinearRegresion

In [14]:
model = LinearRegression()

In [15]:
model.fit(X_train, y_train)

In [16]:
predict_LR_robust = model.predict(X_test)

In [17]:
predict_LR_robust

array([3435.19033324, 3021.87063654, 1319.1964384 , ..., 2965.52086457,
       4133.05064027, 9389.74553527])

In [18]:
predict_LR_robust.shape

(8091,)

In [19]:
check_LR_robust = pd.DataFrame({'Ground truth':y_test, 'Predictions':predict_LR_robust, 'Diff':y_test-predict_LR_robust})
check_LR_robust

Unnamed: 0,Ground truth,Predictions,Diff
17775,2970,3435.190333,-465.190333
13506,3004,3021.870637,-17.870637
4325,838,1319.196438,-481.196438
37870,6468,6108.881824,359.118176
21321,633,952.942107,-319.942107
...,...,...,...
3781,4764,5145.784548,-381.784548
26959,756,26.223343,729.776657
15529,2690,2965.520865,-275.520865
36333,3992,4133.050640,-141.050640


In [23]:
# RMSE calculation

rmse_LR_robuts = mean_squared_error(y_test, predict_LR_robust, squared=False)
rmse_LR_robuts



1171.0044076499253

# Predict sobre diamonds_test

In [24]:
diamonds_test = pd.read_csv('./data/diamonds_test.csv')
diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [25]:
diamonds_test = diamonds_test.drop('depth', axis=1)

In [26]:
diamonds_test = diamonds_test.drop('id', axis=1)

In [27]:
city_map = {
    'Dubai': 'Asia',
    'Kimberly': 'Africa',
    'Las Vegas': 'America',
    'Tel Aviv': 'Asia',
    'Amsterdam': 'Europe',
    'Zurich': 'Europe',
    'Antwerp': 'Europe',
    'Madrid': 'Europe',
    'Paris': 'Europe',
    'Surat': 'Asia',
    'Luxembourg': 'Europe',
    'London': 'Europe',
    'New York City': 'America'
}

In [28]:
diamonds_test['continent'] = diamonds_test['city'].map(city_map)

In [29]:
diamonds_test = diamonds_test.drop('city', axis=1)

In [30]:
diamonds_test['clarity'] = diamonds_test['clarity'].replace(['IF'], ['VVSI']).replace(['VVS1'], ['VVSI']).replace(['VVS2'], ['VVSI'])
diamonds_test['clarity'] = diamonds_test['clarity'].replace(['VS1'], ['VSI']).replace(['VS2'], ['VSI'])
diamonds_test['clarity'] = diamonds_test['clarity'].replace(['SI1'], ['SI']).replace(['SI2'], ['SI'])


In [31]:
diamonds_test_dummy = pd.get_dummies(diamonds_test, drop_first=True, dtype=float)

In [32]:
diamonds_test_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   carat              13485 non-null  float64
 1   table              13485 non-null  float64
 2   x                  13485 non-null  float64
 3   y                  13485 non-null  float64
 4   z                  13485 non-null  float64
 5   cut_Good           13485 non-null  float64
 6   cut_Ideal          13485 non-null  float64
 7   cut_Premium        13485 non-null  float64
 8   cut_Very Good      13485 non-null  float64
 9   color_E            13485 non-null  float64
 10  color_F            13485 non-null  float64
 11  color_G            13485 non-null  float64
 12  color_H            13485 non-null  float64
 13  color_I            13485 non-null  float64
 14  color_J            13485 non-null  float64
 15  clarity_SI         13485 non-null  float64
 16  clarity_VSI        134

In [44]:
print(diamonds_test_dummy.shape)
print(X_train.shape)

(13485, 21)
(32364, 21)


In [43]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32364 entries, 32121 to 15795
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   carat              32364 non-null  float64
 1   table              32364 non-null  float64
 2   x                  32364 non-null  float64
 3   y                  32364 non-null  float64
 4   z                  32364 non-null  float64
 5   cut_Good           32364 non-null  float64
 6   cut_Ideal          32364 non-null  float64
 7   cut_Premium        32364 non-null  float64
 8   cut_Very Good      32364 non-null  float64
 9   color_E            32364 non-null  float64
 10  color_F            32364 non-null  float64
 11  color_G            32364 non-null  float64
 12  color_H            32364 non-null  float64
 13  color_I            32364 non-null  float64
 14  color_J            32364 non-null  float64
 15  clarity_SI         32364 non-null  float64
 16  clarity_VSI        3236

In [53]:
col_names = X_train.columns.values

In [54]:
col_names_lista = list(col_names)

In [50]:
robust_scaler = RobustScaler()
robust_diamonds_test = robust_scaler.fit_transform(diamonds_test_dummy)
robust_diamonds_test

array([[ 0.140625  ,  1.        ,  0.06666667, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78125   ,  0.        ,  0.61666667, ...,  0.        ,
         1.        , -1.        ],
       [ 1.359375  ,  1.33333333,  0.93333333, ...,  0.        ,
         0.        , -1.        ],
       ...,
       [ 0.        , -0.66666667,  0.02777778, ...,  0.        ,
         1.        , -1.        ],
       [ 0.        ,  0.        ,  0.08333333, ...,  0.        ,
         1.        , -1.        ],
       [-0.46875   , -0.66666667, -0.55555556, ...,  1.        ,
         0.        , -1.        ]])

In [51]:
predict_LR_robust = model.predict(robust_diamonds_test)



In [55]:
scaled_df_test = pd.DataFrame(robust_diamonds_test, columns=col_names_lista)
scaled_df_test.head()

Unnamed: 0,carat,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_E,...,color_G,color_H,color_I,color_J,clarity_SI,clarity_VSI,clarity_VVSI,continent_America,continent_Asia,continent_Europe
0,0.140625,1.0,0.066667,0.094444,0.125,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.78125,0.0,0.616667,0.65,0.580357,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,-1.0
2,1.359375,1.333333,0.933333,0.888889,0.928571,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0
3,0.3125,-1.0,0.216667,0.227778,0.330357,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0
4,-0.3125,0.333333,-0.361111,-0.35,-0.303571,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [56]:
predict_LR_robust = model.predict(scaled_df_test)

In [58]:
predict_LR_robust.shape

(13485,)

In [59]:
predictions_df = pd.DataFrame(predict_LR_robust, columns=['price'])

In [67]:
predictions_df['id'] = range(0, len(predict_LR_robust))

In [68]:
predictions_df = predictions_df[['id', 'price']]

In [69]:
predictions_df

Unnamed: 0,id,price
0,0,2704.338228
1,1,8361.677958
2,2,14430.831795
3,3,4420.900356
4,4,-661.373426
...,...,...
13480,13480,-541.298304
13481,13481,1549.858786
13482,13482,2480.548798
13483,13483,1234.612914


In [70]:
predictions_df.to_csv('./submissions/submission_LR_RS.csv', index=False)