In [1]:
# Importing the dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from configuration.configure import MongoDBConfig
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer        # Used to handle missing data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [2]:
import pymongo
import pymongo.mongo_client

# Establish a connection to MongoDB
client = pymongo.MongoClient("mongodb+srv://saurabh0903:S%40urabh0903@airfare.dgwz42b.mongodb.net/")  # Replace with your MongoDB connection string
db = client["gemstone_data"]  # Replace with your database name
collection = db["raw_data"]  # Replace with your collection name

# Query the collection and fetch all data
mongo_data = collection.find({})

# Convert MongoDB data to Pandas DataFrame
data = pd.DataFrame(list(mongo_data))
pd.set_option('display.max_columns', None)

In [3]:
data = data.drop(columns="_id", axis=1)
data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
data.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')

In [5]:
# Sperating the features and columns

x = data.drop(columns=['id','price'], axis=1)
y = data['price']

In [6]:
print(x)

        carat        cut color clarity  depth  table     x     y     z
0        1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
1        2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
2        0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
3        0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
4        1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
...       ...        ...   ...     ...    ...    ...   ...   ...   ...
193568   0.31      Ideal     D    VVS2   61.1   56.0  4.35  4.39  2.67
193569   0.70    Premium     G    VVS2   60.3   58.0  5.75  5.77  3.47
193570   0.73  Very Good     F     SI1   63.1   57.0  5.72  5.75  3.62
193571   0.34  Very Good     D     SI1   62.9   55.0  4.45  4.49  2.81
193572   0.71       Good     E     SI2   60.8   64.0  5.73  5.71  3.48

[193573 rows x 9 columns]


In [7]:
print(y)

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64


In [8]:
# Seperating the categorical and numerical 
cat_col = x.select_dtypes(include='object').columns
num_col = x.select_dtypes(exclude='object').columns

print(f"Categorical Columns : {cat_col}  \nNumerical Columns: {num_col}")

Categorical Columns : Index(['cut', 'color', 'clarity'], dtype='object')  
Numerical Columns: Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


#### Constructing the pipelines


In [9]:
# Constructing the numerical pipelines

num_pipeline = Pipeline(
                        steps=[
                                ("imputer", SimpleImputer()),
                                ('scaler', StandardScaler())
                            ]
                        )

In [10]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

cat_features = [cut_categories,color_categories,clarity_categories]

In [11]:
# Constructing the categorical pipelines

cat_pipeline = Pipeline(
                            steps=[
                                    ("imputer", SimpleImputer(strategy="most_frequent")),
                                    ("ordernal", OrdinalEncoder(categories=cat_features))
                                ]
                        )

In [12]:
# Fitting both the pipeline in one.

preprocessor = ColumnTransformer(
                                    [
                                        ("num_pipelin", num_pipeline, num_col),
                                        ("cat_pipeline", cat_pipeline, cat_col)
                                    ]
                                )

In [13]:
# Splitting the data into train test split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [14]:
x_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
11504,0.41,Ideal,E,VVS2,60.6,56.0,4.85,4.8,2.93
95284,1.23,Very Good,H,VS1,59.9,59.0,6.91,7.01,4.19
184777,1.7,Premium,H,VS2,62.0,58.0,7.61,7.66,4.74
5419,0.33,Ideal,F,VVS1,61.2,56.0,4.47,4.44,2.73
45466,0.33,Very Good,I,SI1,62.1,58.0,4.41,4.45,2.75


In [15]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(135501, 9) (58072, 9) (135501,) (58072,)


In [16]:
preprocessor.fit_transform(x_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  4.        ,
         1.        ,  5.        ],
       [ 0.94502267, -1.77782269,  0.92190185, ...,  2.        ,
         4.        ,  4.        ],
       [ 1.9584839 ,  0.16568195,  0.40063568, ...,  3.        ,
         4.        ,  3.        ],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ...,  3.        ,
         3.        ,  3.        ],
       [-1.03877378, -0.66724861, -0.64189666, ...,  2.        ,
         3.        ,  6.        ],
       [-1.03877378, -0.01941373,  0.92190185, ...,  2.        ,
         3.        ,  1.        ]])

In [17]:
preprocessor.transform(x_test)

array([[-0.62907669,  0.25822979, -0.12063049, ...,  4.        ,
         0.        ,  2.        ],
       [ 2.60537405, -2.14801405, -0.12063049, ...,  2.        ,
         3.        ,  1.        ],
       [-1.1250258 , -1.22253565,  0.92190185, ...,  3.        ,
         1.        ,  6.        ],
       ...,
       [-0.82314374, -0.01941373, -0.64189666, ...,  4.        ,
         3.        ,  6.        ],
       [ 0.90189666, -0.66724861,  1.44316802, ...,  3.        ,
         5.        ,  2.        ],
       [ 0.47063656,  0.90606467, -0.64189666, ...,  2.        ,
         1.        ,  3.        ]])

In [18]:
preprocessor.get_feature_names_out()

array(['num_pipelin__carat', 'num_pipelin__depth', 'num_pipelin__table',
       'num_pipelin__x', 'num_pipelin__y', 'num_pipelin__z',
       'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [19]:
x_train.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [20]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.fit_transform(x_test), columns=preprocessor.get_feature_names_out())

### Model Training

In [21]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor()
}

In [23]:
list(models)

['LinearRegression',
 'Lasso',
 'Ridge',
 'ElasticNet',
 'RandomForestRegressor',
 'XGBRegressor']

In [22]:
trained_model_list = []
model_list = []
r2_list = []

In [24]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    print(model)

LinearRegression()
Lasso()
Ridge()
ElasticNet()
RandomForestRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)


In [25]:
models.keys()

dict_keys(['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet', 'RandomForestRegressor', 'XGBRegressor'])

In [27]:
list(models.values())[1]


In [28]:
# Defining the Evalauation metrics

def evaluate_model(true, pred):
    r2=r2_score(true,pred)
    mae=mean_absolute_error(true,pred)
    mse=mean_squared_error(true,pred)

    return mae, mse,r2

In [29]:
for i in range(len(list(models))):
    model = list(models.values())[i]

    model.fit(x_train, y_train)

    # Making the prediction 
    y_pred = model.predict(x_test)

    # Validating the model
    MAE, MSE, R2 = evaluate_model(y_test, y_pred)

    print("model training performance",model)
    print("MSE:", MSE)
    print("MAE:",MAE)
    print("R2 SCORE:",R2)

    r2_list.append(R2)

    print("="*40)
    print("\n")

model training performance LinearRegression()
MSE: 1030893.6384405316
MAE: 679.769038171523
R2 SCORE: 0.936202787146243


model training performance Lasso()
MSE: 1030853.1061213679
MAE: 681.1898158587021
R2 SCORE: 0.9362052955029704


model training performance Ridge()
MSE: 1030898.2244225062
MAE: 679.8037450966742
R2 SCORE: 0.9362025033411469


model training performance ElasticNet()
MSE: 2268663.327280299
MAE: 1056.3975146157109
R2 SCORE: 0.8596029776622169


model training performance RandomForestRegressor()
MSE: 372898.1866954922
MAE: 310.573328746526
R2 SCORE: 0.9769230654819249


model training performance XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None