In [1]:
import pandas as pd

# MODEL TRAINING

In [2]:
df = pd.read_csv("/config/workspace/notebook/data/gemstone.csv")

In [3]:
df.shape

(193573, 11)

In [4]:
df = df.drop(labels=['id'],axis = 1)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
#inependent & dependent variable
x = df.drop(labels=['price'],axis = 1)
y = df[['price']]

In [6]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [7]:
x

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [8]:
# Data segrigation
categorical_col = x.select_dtypes(include='object').columns
numerical_col  = x.select_dtypes(exclude='object').columns

In [9]:
categorical_col

Index(['cut', 'color', 'clarity'], dtype='object')

In [10]:
numerical_col

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [11]:
df['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [12]:
#define the custom ranking for each variable
cut_categories = ['Fair','Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1', 'VS2','VS1','VVS2','VVS1','IF']

In [13]:
from sklearn.impute import SimpleImputer ## handling missing values
from sklearn.preprocessing import StandardScaler ## Handling Feature scaling
from sklearn.preprocessing import OrdinalEncoder # ordinal encording

## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
## AUTOMATION
# Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
    ('inputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)
# Categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_col),
    ('cat_pipeline',cat_pipeline,categorical_col)
])

In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.33,random_state = 30)

In [16]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.transform(x_test), columns=preprocessor.get_feature_names_out())

In [17]:
x_train.head(4)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.190903,0.538078,-0.121487,-1.573815,-1.57007,-1.527658,0.873771,-0.934593,1.352685
1,0.904529,-0.479492,-0.642431,1.03915,1.04146,0.97729,0.873771,0.297424,2.019062
2,-1.039686,-2.144607,0.399457,-1.195386,-1.189222,-1.339424,-2.143965,-0.318584,0.686308
3,-0.845265,-0.664505,1.441345,-0.880028,-0.880916,-0.91952,-0.132141,0.297424,0.019931


In [18]:
## Model Training
from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error

In [19]:
regrassion = LinearRegression()
regrassion.fit(x_train,y_train)

In [20]:
regrassion.coef_

array([[ 6432.37246544,  -133.37085491,   -71.5594995 , -1712.75940631,
         -509.03110192,   -56.40910864,    72.29448314,  -459.25648103,
          650.78049561]])

In [21]:
regrassion.intercept_

array([3971.54247338])

In [22]:
import numpy as np 
def evaluate_model(true,predict):
    mae = mean_absolute_error(true,predict)
    rmse = np.sqrt(mean_squared_error(true,predict))
    r2_square = r2_score(true,predict)
    return mae,rmse,r2_score


In [23]:
## Train multiple models
## Model Ecaluation
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #Make Predictions
    y_pred=model.predict(x_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2_score:",r2_square)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1012.0684598433741
MAE: 673.1192939264374
R2_score: <function r2_score at 0x7efdfff92310>


Lasso
Model Training Performance
RMSE: 1012.073864691222
MAE: 674.1819437208048
R2_score: <function r2_score at 0x7efdfff92310>


Ridge
Model Training Performance
RMSE: 1012.0707696505232
MAE: 673.1514706239293
R2_score: <function r2_score at 0x7efdfff92310>


Elasticnet
Model Training Performance
RMSE: 1531.3858261707658
MAE: 1059.854688482479
R2_score: <function r2_score at 0x7efdfff92310>


