In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(r"C:\Users\acer\Documents\project1_machine learning end to end\notebooks\data\gemstone.csv")

In [3]:
data.drop(labels=["id"],axis=1,inplace=True)

In [4]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [5]:
X = data.drop(["price"],axis=1)
y = data[["price"]]

In [6]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [7]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [8]:
numerical_cols = X.select_dtypes(exclude="object").columns
categorical_cols = X.select_dtypes(include="object").columns

In [9]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [10]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [11]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']



In [12]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
numerical_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer()),
        ("Scaler",StandardScaler())
    ]
)
categorical_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("Ordinal encoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
)

In [14]:
preprocessor = ColumnTransformer(
    [
    ("numerical_pipeline",numerical_pipeline,numerical_cols),
    ("categorical_pipeline",categorical_pipeline,categorical_cols)
    ]

)

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=30)

In [17]:
preprocessor.fit_transform(X_train)

array([[-0.54220555,  2.11279474, -1.68414474, ...,  1.        ,
         1.        ,  4.        ],
       [-0.15318745,  0.44710281, -1.16300271, ...,  4.        ,
         2.        ,  1.        ],
       [ 1.57578188, -2.14397354,  0.92156539, ...,  2.        ,
         2.        ,  2.        ],
       ...,
       [ 0.45195182,  1.5575641 , -0.64186069, ...,  1.        ,
         3.        ,  2.        ],
       [ 0.66807298, -1.77381977,  1.44270741, ...,  4.        ,
         3.        ,  4.        ],
       [ 0.25744277,  0.81725657, -0.12071866, ...,  4.        ,
         3.        ,  2.        ]])

In [18]:
preprocessor.transform(X_test)

array([[-0.56381767, -0.94097381, -0.64186069, ...,  3.        ,
         1.        ,  3.        ],
       [-0.17479957,  1.00233345, -0.12071866, ...,  2.        ,
         4.        ,  2.        ],
       [-1.06089635,  0.26202593, -0.12071866, ...,  4.        ,
         4.        ,  7.        ],
       ...,
       [-0.99606   , -1.12605069, -0.64186069, ...,  4.        ,
         3.        ,  7.        ],
       [-0.54220555, -0.38574316, -0.64186069, ...,  4.        ,
         2.        ,  4.        ],
       [-1.06089635,  0.07694904, -0.64186069, ...,  4.        ,
         2.        ,  2.        ]])

In [19]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())

In [20]:
X_train

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.542206,2.112795,-1.684145,-0.455961,-0.544885,-0.354514,1.0,1.0,4.0
1,-0.153187,0.447103,-1.163003,0.021706,0.053798,0.080569,4.0,2.0,1.0
2,1.575782,-2.143974,0.921565,1.652982,1.595862,1.356812,2.0,2.0,2.0
3,1.683842,0.447103,-0.641861,1.562856,1.550507,1.588856,4.0,5.0,7.0
4,-0.844775,1.094872,0.400423,-0.987703,-0.962150,-0.876613,3.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...
154853,-1.039284,-0.015589,-0.641861,-1.267093,-1.243350,-1.239182,4.0,1.0,2.0
154854,0.992255,0.169487,-0.641861,1.049139,1.115101,1.081259,4.0,3.0,1.0
154855,0.451952,1.557564,-0.641861,0.517397,0.588985,0.704187,1.0,3.0,2.0
154856,0.668073,-1.773820,1.442707,0.868888,0.951823,0.689685,4.0,3.0,4.0


In [21]:
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [22]:
X_test

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.563818,-0.940974,-0.641861,-0.428923,-0.463247,-0.499542,3.0,1.0,3.0
1,-0.174800,1.002333,-0.120719,-0.041382,-0.027841,0.037061,2.0,4.0,2.0
2,-1.060896,0.262026,-0.120719,-1.303143,-1.297775,-1.268188,4.0,4.0,7.0
3,0.970643,-0.200666,1.963849,1.049139,0.997178,0.979740,3.0,3.0,3.0
4,-0.931224,-1.311128,0.400423,-1.005728,-0.989363,-1.065149,3.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...
38710,-0.823163,-0.385743,-0.120719,-0.861527,-0.844227,-0.876613,4.0,2.0,3.0
38711,-1.039284,0.169487,-0.641861,-1.258080,-1.243350,-1.239182,4.0,1.0,5.0
38712,-0.996060,-1.126051,-0.641861,-1.086841,-1.071001,-1.152166,4.0,3.0,7.0
38713,-0.542206,-0.385743,-0.641861,-0.401886,-0.426963,-0.441530,4.0,2.0,4.0


In [23]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn. metrics import mean_squared_error,mean_absolute_error,r2_score

In [33]:
def evaluate(test,pred):
    mae = mean_absolute_error(test,pred)
    mse = mean_squared_error(test,pred)
    rmse = np.sqrt(mse)
    r2_value = r2_score(test,pred)
    return mae, mse, rmse,r2_value

In [38]:
models = {
    "Linear Regression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet()

}
r2_list = []

for key,value in models.items():
    model = value
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    
    mae, mse, rmse, r2_value = evaluate(y_test,y_pred)

    print(key)

    print("mse: {}".format(mse))
    print("rmse: {}".format(rmse))
    print("mae: {}".format(mae))
    print("r2score: {}".format(r2_value*100))
    r2_list.append(r2_value*100)

    
    print("="*35)
    print("\n")


    
    

Linear Regression
mse: 1031799.3901275055
rmse: 1015.7752655619773
mae: 675.5483623703055
r2score: 93.67464279030936


Ridge
mse: 1031800.5545918676
rmse: 1015.7758387517729
mae: 675.5753135470308
r2score: 93.67463565166098


Lasso
mse: 1032115.120178232
rmse: 1015.9306670133706
mae: 676.8315267472232
r2score: 93.67270723445247


ElasticNet
mse: 2310456.6923467587
rmse: 1520.0186486838768
mae: 1053.9094724261593
r2score: 85.83594443217535




In [26]:
list(models)[0]

'Linear Regression'

In [30]:
for i,k in models.items():
    print(i,k)

Linear Regression LinearRegression()
Ridge Ridge()
Lasso Lasso()
ElasticNet ElasticNet()


In [39]:
r2_list

[93.67464279030936, 93.67463565166098, 93.67270723445247, 85.83594443217535]