In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the raw data; the first CSV column is used as the row index.
csv_path = 'diamonds.csv'
diamond_df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Dimensions (rows, columns) of the loaded data.
diamond_df.shape

(53940, 10)

In [4]:
# Column dtypes: object columns are still strings and must be encoded before modelling.
diamond_df.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [5]:
# Frequency of each distinct 'cut' label.
diamond_df['cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [6]:
# Encode the ordinal categorical variable 'cut' as integers, Fair (0) up to Ideal (4).
cut_mapping = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}

# Values that are not keys of the mapping (e.g. the integer codes left by an
# earlier run of this cell) pass through unchanged, so re-running the cell does
# not silently turn the whole column into NaN as a plain .map(dict) would.
cut_encoded = diamond_df['cut'].map(lambda label: cut_mapping.get(label, label))

# Fail loudly on anything that is neither a known label nor a known code
# (unknown category, missing value) instead of carrying it into the model.
cut_is_code = cut_encoded.isin(list(cut_mapping.values()))
if not cut_is_code.all():
    raise ValueError(
        f"Unexpected 'cut' values: {cut_encoded[~cut_is_code].unique().tolist()}"
    )
diamond_df['cut'] = cut_encoded

In [7]:
# Frequency of each distinct 'color' label.
diamond_df['color'].value_counts()

G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

In [8]:
# Encode the ordinal categorical variable 'color' as integers, J (0) up to D (6).
color_mapping = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6}

# Values that are not keys of the mapping (e.g. the integer codes left by an
# earlier run of this cell) pass through unchanged, so re-running the cell does
# not silently turn the whole column into NaN as a plain .map(dict) would.
color_encoded = diamond_df['color'].map(lambda label: color_mapping.get(label, label))

# Fail loudly on anything that is neither a known label nor a known code
# (unknown category, missing value) instead of carrying it into the model.
color_is_code = color_encoded.isin(list(color_mapping.values()))
if not color_is_code.all():
    raise ValueError(
        f"Unexpected 'color' values: {color_encoded[~color_is_code].unique().tolist()}"
    )
diamond_df['color'] = color_encoded

In [9]:
# Frequency of each distinct 'clarity' label.
diamond_df['clarity'].value_counts()

SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

In [10]:
# Encode the ordinal categorical variable 'clarity' as integers, I1 (0) up to IF (7).
clarity_mapping = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}

# Values that are not keys of the mapping (e.g. the integer codes left by an
# earlier run of this cell) pass through unchanged, so re-running the cell does
# not silently turn the whole column into NaN as a plain .map(dict) would.
clarity_encoded = diamond_df['clarity'].map(lambda label: clarity_mapping.get(label, label))

# Fail loudly on anything that is neither a known label nor a known code
# (unknown category, missing value) instead of carrying it into the model.
clarity_is_code = clarity_encoded.isin(list(clarity_mapping.values()))
if not clarity_is_code.all():
    raise ValueError(
        f"Unexpected 'clarity' values: {clarity_encoded[~clarity_is_code].unique().tolist()}"
    )
diamond_df['clarity'] = clarity_encoded

In [11]:
# Summary statistics after encoding; the minima of 0 for x/y/z and the large
# maxima for y and z are what the cleaning cells below deal with.
diamond_df.describe()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,2.904097,3.405803,3.05102,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.1166,1.701105,1.647136,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,0.0,0.0,0.0,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,2.0,2.0,2.0,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,3.0,3.0,3.0,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,4.0,5.0,4.0,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,4.0,6.0,7.0,79.0,95.0,18823.0,10.74,58.9,31.8


In [12]:
# Remove rows where any of x, y or z is exactly 0 (describe() shows a minimum
# of 0 for all three columns).
# One boolean mask replaces three index-label drops: it is a single pass and,
# unlike drop(index), cannot remove extra rows if index labels are duplicated.
has_nonzero_dimensions = (diamond_df[['x', 'y', 'z']] != 0).all(axis=1)
diamond_df = diamond_df[has_nonzero_dimensions]

In [13]:
# Trim the upper tail of each listed column: keep only rows strictly below that
# column's 99th percentile. Columns are handled in order, so every threshold is
# computed on the rows that survived the previous filters.
# NOTE: diamond_df is reassigned, so re-running this cell trims the data again.
for trimmed_col in ('depth', 'table', 'x', 'y', 'z'):
    upper_bound = diamond_df[trimmed_col].quantile(0.99)
    diamond_df = diamond_df[diamond_df[trimmed_col] < upper_bound]

In [14]:
# Work on an independent (deep) copy so modelling steps cannot alter diamond_df.
model_df = diamond_df.copy(deep=True)

In [15]:
# Target is 'price'; every remaining column is used as a feature.
y = model_df['price']
X = model_df.drop(columns=['price'])

In [16]:
# Hold out 10% of the rows as the final test set (seeded for reproducibility).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [17]:
# Split the remaining rows 80/20 into training and validation sets (seeded).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Base estimator for the grid search. random_state is fixed (same seed as the
# train/test splits) so that cross-validation scores, the selected
# hyperparameters and the refit model are reproducible across kernel restarts;
# without it every run grows different trees.
rf1 = RandomForestRegressor(random_state=0)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations.
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}


In [28]:
# Exhaustive search over `parameters` with 4-fold cross-validation, running on
# all available CPU cores (n_jobs=-1) with progress output disabled.
rf_grid = GridSearchCV(
    estimator=rf1,
    param_grid=parameters,
    cv=4,
    n_jobs=-1,
    verbose=0,
)

In [29]:
# Check that every feature column handed to the grid search is numeric.
X_train.dtypes

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

In [30]:
# Run the cross-validated search on the training split only.
rf_grid.fit(X=X_train, y=y_train)



In [36]:
# Best mean cross-validated score and the hyperparameters that achieved it,
# printed one per line.
print(rf_grid.best_score_, rf_grid.best_params_, sep='\n')

0.9823581298205339
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [37]:
# Estimator carrying the best hyperparameters found by the grid search.
rf_cv = rf_grid.best_estimator_

In [38]:
# Display the selected estimator.
rf_cv

In [39]:
# NOTE(review): eval_set is not referenced by any other visible cell, and the
# fit signature printed by help(rf_cv.fit) only accepts X, y and sample_weight
# -- presumably a leftover from a different model API; confirm and remove.
eval_set = [(X_val, y_val)]

In [41]:
# NOTE(review): interactive lookup of the fit() signature; looks like leftover
# exploration -- consider removing it (and its output) from the final notebook.
help(rf_cv.fit)

Help on method fit in module sklearn.ensemble._forest:

fit(X, y, sample_weight=None) method of sklearn.ensemble._forest.RandomForestRegressor instance
    Build a forest of trees from the training set (X, y).
    
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Internally, its dtype will be converted
        to ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csc_matrix``.
    
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        The target values (class labels in classification, real numbers in
        regression).
    
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights. If None, then samples are equally weighted. Splits
        that would create child nodes with net zero or negative weight are
        ignored while searching for a split in each node. In the case of
        classification,

In [43]:
# rf_grid uses GridSearchCV's default refit=True, so best_estimator_ (rf_cv)
# has already been refit on all of X_train / y_train with the best
# hyperparameters. Calling fit() again would only repeat that training (and
# fit() returns the estimator itself), so reuse the fitted object directly.
fit_model = rf_cv

In [44]:
# Validation-set metrics; predict once and reuse the result for all three scores.
val_predictions = fit_model.predict(X_val)
print("MAE:", mean_absolute_error(y_val, val_predictions))
print("MSE:", mean_squared_error(y_val, val_predictions))
print("R2:", r2_score(y_val, val_predictions))

MAE: 233.43331910778994
MSE: 214945.40723804868
R2: 0.982681674042247


In [45]:
# Held-out test-set metrics; predict once and reuse the result for all three scores.
test_predictions = fit_model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, test_predictions))
print("MSE:", mean_squared_error(y_test, test_predictions))
print("R2:", r2_score(y_test, test_predictions))

MAE: 225.11607832712357
MSE: 197134.45047830054
R2: 0.983453290972985


In [49]:
# Persist the fitted random forest (fit_model) to disk.
# pickle is imported in the notebook's top import cell.
model_file_path = 'model.pkl'

# Serialise with pickle; the with-block closes the file even if dump() raises.
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open(model_file_path, 'wb') as model_file:
    pickle.dump(fit_model, model_file)

print("Model saved successfully as a pickle file.")

Model saved successfully as a pickle file.
