In [3]:
import pandas as pd

##Model_Training

In [6]:
df = pd.read_csv('data/gemstone-1.csv')

In [7]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [11]:
#independent and dependent feature
x = df.drop(labels='price',axis = 1)
Y = df[['price']]

In [12]:
x.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


In [21]:
#define which columns should be ordinal encoded and which should be scaled
categorical_cols = x.select_dtypes(include = 'object').columns
numerical_cols = x.select_dtypes(exclude=  'object').columns

In [16]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [23]:
cut_category = ['Fair',  'Good','Very Good',  'Premium','Ideal']

color_category=['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_category = [  'I1','SI2', 'SI1','VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [27]:
from sklearn.impute import SimpleImputer #handeling missing values
from sklearn.preprocessing import StandardScaler # Feature Scaling
from sklearn.preprocessing import OrdinalEncoder #ordinal encoding
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [36]:
#Numerical Pipeline

num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler())
    ]



)

# Categorical pipeline

Cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy = "most_frequent")),
        ('Ordinalencoder', OrdinalEncoder(categories=[cut_category,color_category,clarity_category])),
        ('scaler', StandardScaler())]
)

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('Cat_pipeline',Cat_pipeline,categorical_cols)
])

In [44]:
#Train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size=0.3,random_state=42)

In [45]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train),columns = preprocessor.get_feature_names_out())

In [46]:
x_test=pd.DataFrame(preprocessor.transform(x_test),columns = preprocessor.get_feature_names_out())

In [47]:
x_test.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,Cat_pipeline__cut,Cat_pipeline__color,Cat_pipeline__clarity
0,1.277216,2.411436,-2.126742,0.398674,2.007308,2.043364,1.795557,-0.134983,1.53873,-0.653182
1,-1.280025,2.605482,-5.343458,-0.641391,2.45677,2.504776,1.868235,-3.148256,2.156395,-1.981957
2,-1.457516,0.880626,0.079006,-0.121358,0.982535,0.921499,0.952487,0.869441,-0.314265,0.675593
3,-1.028845,-0.995155,-0.288618,-0.641391,-1.129936,-1.159378,-1.155187,0.869441,-0.93193,-0.653182
4,-1.472736,0.492533,0.906162,0.398674,0.640944,0.595797,0.705381,-0.134983,0.3034,0.011206


In [49]:
x_train.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,Cat_pipeline__cut,Cat_pipeline__color,Cat_pipeline__clarity
0,0.233297,-1.059837,-0.564337,0.398674,-1.210839,-1.240804,-1.271472,-0.134983,-1.549595,0.011206
1,0.707955,0.535655,0.354725,-1.161423,0.68589,0.731506,0.748988,0.869441,0.3034,0.675593
2,0.250268,-1.038276,-0.104806,-0.641391,-1.255785,-1.249851,-1.256937,0.869441,-0.93193,0.675593
3,1.276231,-0.607062,0.72235,-0.121358,-0.590581,-0.562257,-0.530153,0.869441,1.53873,0.675593
4,0.653755,-0.973594,-0.472431,-0.121358,-1.093979,-1.077953,-1.11158,-2.143832,-1.549595,0.011206


In [50]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [51]:
regression=LinearRegression()
regression.fit(x_train,y_train)

In [52]:
regression.coef_

array([[-5.74170125e+00,  6.46959422e+03, -8.64760819e+01,
        -4.55276729e+01, -1.86642611e+03,  2.18324009e+01,
        -4.55678410e+02,  8.26700193e+01, -4.64373664e+02,
         6.52217734e+02]])

In [54]:
regression.intercept_

array([3987.7517522])

In [55]:
import numpy as np
def evaluate_model(true,predicted):
    mae = mean_absolute_error (true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [68]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "ElasticNet":ElasticNet()
    }
trained_model_list =[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    ##make prediction

    y_pred = model.predict(x_test)

    mae,rmse,r2_square = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])

    model_list.append(list(models.keys())[i])
    # trained_model_list.append(list(models.values())[i])

    print("Model Training Performance")
    print("RMSE",rmse)
    print("MAE",mae)
    print("R2 score", r2_square*100, 'percent')
    r2_list.append(r2_square)

    print('-'*34)
    print('\n')

LinearRegression
Model Training Performance
RMSE 1054.875701103809
MAE 688.5690220267634
R2 score 93.49974530933548 percent
----------------------------------


Lasso
Model Training Performance
RMSE 1054.8219817286563
MAE 689.411604492358
R2 score 93.50040734132415 percent
----------------------------------


Ridge
Model Training Performance
RMSE 1054.8432752159686
MAE 688.6980411497003
R2 score 93.50014492663594 percent
----------------------------------


ElasticNet
Model Training Performance
RMSE 1570.633316238913
MAE 1083.1324066153338
R2 score 85.58955450265272 percent
----------------------------------


