In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="./data/train.csv")
data.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
# Remove the `id` column so it is not picked up as a feature.
# `errors="ignore"` keeps this cell safe to re-run: because the cell rebinds
# `data`, a second execution would otherwise raise KeyError once `id` is gone.
data = data.drop(columns="id", errors="ignore")

In [4]:
# Preview again to confirm the `id` column is no longer present.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# (rows, columns) of the frame after `id` was dropped.
data.shape

(193573, 10)

In [6]:
# Feature matrix: every column except the target.
# The target is excluded by name rather than by position, so the split stays
# correct even if the column order of the CSV changes (a positional slice would
# then silently leak `price` into the features).
X = data.drop(columns="price")
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [7]:
# Target vector: the `price` column, selected by name so the choice does not
# depend on `price` happening to be the last column of the frame.
y = data["price"]
y

0         13619
1         13387
2          2772
3           666
4         14453
          ...  
193568     1130
193569     2874
193570     3036
193571      681
193572     2258
Name: price, Length: 193573, dtype: int64

In [8]:
# Split the feature names by dtype: object columns are treated as categorical,
# everything else as numerical.
# NOTE: the categorical names come back in frame order (cut, color, clarity in
# the preview above); the OrdinalEncoder category lists defined later must be
# supplied in that same order.
categorical_X_features = X.select_dtypes(include=["object"]).columns
numerical_X_features = X.select_dtypes(exclude=["object"]).columns

In [9]:
from sklearn.impute import SimpleImputer # handling missing values
from sklearn.preprocessing import StandardScaler # To scale features
from sklearn.preprocessing import OrdinalEncoder # To encode categorical features

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
# Explicit level orderings for the ordinal encoder. The position of a level in
# its list is the integer code it receives (first entry -> 0.0).
# NOTE(review): cut and clarity look like they run from lowest to highest
# grade, while color D..J presumably runs the other way — confirm the intended
# direction. For the linear models used here it only flips a coefficient sign.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
]
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

In [11]:
# Numerical branch: fill missing values with SimpleImputer's default strategy
# (mean, per the scikit-learn docs), then standardise each column.
numerical_pipeline = Pipeline(
    [
        ("Imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)

In [12]:
# Categorical branch: fill missing values with the most frequent level, then
# map each level to its position in the matching category list.
# The lists are given in the order cut, color, clarity and must line up with
# the order of the columns this pipeline receives.
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(
                categories=[
                    cut_categories,
                    color_categories,
                    clarity_categories,
                ]
            ),
        ),
    ]
)

In [13]:
# Route each column group through its own pipeline. The transformer names
# become the prefixes reported by get_feature_names_out()
# (e.g. "numerical_pipeline__carat").
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, numerical_X_features),
        ("categorical_pipeline", categorical_pipeline, categorical_X_features),
    ]
)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [20]:
# Fit the preprocessor on the training split only and transform it.
# NOTE(review): this rebinds X_train to the transformed array, so the cell is
# not re-runnable — a second execution would pass the already-transformed
# array (which has no named columns) back into the ColumnTransformer. Re-run
# the train/test split cell first, or bind the result to a new name.
X_train = preprocessor.fit_transform(X_train)
X_train

array([[-1.06044536,  0.90546103, -0.63974199, ...,  4.        ,
         1.        ,  6.        ],
       [-0.15298696,  0.07463378,  0.92396069, ...,  3.        ,
         2.        ,  2.        ],
       [ 0.45198531,  1.92091656, -0.63974199, ...,  1.        ,
         5.        ,  2.        ],
       ...,
       [-0.58511   , -1.31007831,  3.53013183, ...,  1.        ,
         1.        ,  2.        ],
       [-0.47707924,  0.44389034, -1.68221045, ...,  4.        ,
         5.        ,  4.        ],
       [ 0.71125914, -0.20230864, -0.63974199, ...,  4.        ,
         2.        ,  6.        ]])

In [21]:
# Apply the already-fitted preprocessor to the test split (transform only, no
# refit, so the test rows do not influence the fitted parameters).
# NOTE(review): like the training cell, this rebinds its own input and is not
# safe to run twice.
X_test = preprocessor.transform(X_test)
X_test

array([[-0.15298696, -0.01768036,  1.96642915, ...,  1.        ,
         1.        ,  4.        ],
       [-0.82277768, -1.12545003, -0.11850776, ...,  2.        ,
         1.        ,  4.        ],
       [-0.86598999, -0.38693692, -1.16097622, ...,  4.        ,
         3.        ,  5.        ],
       ...,
       [ 1.74835445,  0.3515762 , -0.11850776, ...,  3.        ,
         5.        ,  2.        ],
       [ 0.47359146,  0.44389034,  0.92396069, ...,  3.        ,
         1.        ,  3.        ],
       [-0.19619926, -1.49470659,  3.00889761, ...,  2.        ,
         3.        ,  1.        ]])

In [18]:
# Output column names in transformer order, prefixed with "<transformer>__".
preprocessor.get_feature_names_out()

array(['numerical_pipeline__carat', 'numerical_pipeline__depth',
       'numerical_pipeline__table', 'numerical_pipeline__x',
       'numerical_pipeline__y', 'numerical_pipeline__z',
       'categorical_pipeline__cut', 'categorical_pipeline__color',
       'categorical_pipeline__clarity'], dtype=object)

In [22]:
# Wrap the transformed arrays back into labelled DataFrames.
# No index is passed, so both frames get a fresh 0..n-1 index; the targets are
# presumably still indexed by their original row labels, so pair X and y by
# position rather than by label.
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

In [23]:
# Inspect the transformed training features.
X_train

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-1.060445,0.905461,-0.639742,-1.312068,-1.306707,-1.239040,4.0,1.0,6.0
1,-0.152987,0.074634,0.923961,0.048945,0.063027,0.051505,3.0,2.0,2.0
2,0.451985,1.920917,-0.639742,0.544678,0.589150,0.762029,1.0,5.0,2.0
3,-0.606716,1.367032,-1.160976,-0.591001,-0.562878,-0.456013,1.0,5.0,2.0
4,-0.174593,-2.048591,2.487663,0.012892,-0.063968,-0.209504,3.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...
154853,0.927321,-1.125450,0.402726,1.103504,1.088060,0.965036,4.0,4.0,1.0
154854,-0.628322,-2.048591,0.923961,-0.518895,-0.571949,-0.659020,3.0,1.0,2.0
154855,-0.585110,-1.310078,3.530132,-0.482842,-0.454025,-0.572017,1.0,1.0,2.0
154856,-0.477079,0.443890,-1.682210,-0.338628,-0.281674,-0.267506,4.0,5.0,4.0


In [24]:
# Inspect the transformed test features.
X_test

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.152987,-0.017680,1.966429,0.039932,-0.054897,-0.006497,1.0,1.0,4.0
1,-0.822778,-1.125450,-0.118508,-0.843375,-0.816868,-0.905529,2.0,1.0,4.0
2,-0.865990,-0.386937,-1.160976,-0.897455,-0.871295,-0.905529,4.0,3.0,5.0
3,0.603228,-0.109994,-0.118508,0.770011,0.752429,0.747529,4.0,3.0,2.0
4,-0.693141,0.166948,-1.160976,-0.690148,-0.653589,-0.644520,4.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...
38710,-1.060445,-0.017680,-0.639742,-1.257988,-1.297636,-1.268041,4.0,1.0,3.0
38711,-0.606716,1.920917,0.402726,-0.591001,-0.562878,-0.412511,1.0,2.0,3.0
38712,1.748354,0.351576,-0.118508,1.554171,1.623254,1.617559,3.0,5.0,2.0
38713,0.473591,0.443890,0.923961,0.562705,0.589150,0.617024,3.0,1.0,3.0


In [25]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [26]:
import numpy as np

def evaluate_model(actual, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, in the same order as ``actual``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` where ``rmse`` is the square root of ``mse``
        and ``r2`` is the raw score returned by ``r2_score`` (not a percentage).
    """
    squared_error = mean_squared_error(actual, predicted)
    return (
        mean_absolute_error(actual, predicted),
        squared_error,
        np.sqrt(squared_error),
        r2_score(actual, predicted),
    )

In [30]:
# Candidate linear regressors, keyed by the label used in the printed report.
models = {
    "LinearRegression": LinearRegression(),
    "RidgeRegression": Ridge(),
    "LassoRegression": Lasso(),
    # "ElasticNetRegression": ElasticNet(),  # disabled; must be an instance, not the class
}

# Parallel result lists: entry k of each list belongs to the k-th model.
trained_model_list = []  # fitted estimators
model_list = []          # model labels
r2_score_list = []       # R2 on the test split, as a fraction (not percent)

for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Keep the fitted estimator so it can be reused after the loop.
    trained_model_list.append(model)

    # All metrics below are computed on the held-out test split.
    y_pred = model.predict(X_test)
    mae, mse, rmse, r2 = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model test performance")
    print("RMSE: ", rmse)
    print("MAE :", mae)
    print("R2_Score :", r2*100)

    r2_score_list.append(r2)

    print("="*35)



LinearRegression
Model training performance
RMSE:  1015.2644852831206
MAE : 675.4047722396339
R2_Score : 93.66573147331785
RidgeRegression
Model training performance
RMSE:  1015.2682224415356
MAE : 675.4357996338812
R2_Score : 93.6656848407234
LassoRegression
Model training performance
RMSE:  1015.3391594019012
MAE : 676.6807706264747
R2_Score : 93.66479965048285


In [31]:
# Show the model labels and their test-split R2 scores, one list per line.
print(model_list, r2_score_list, sep="\n")

['LinearRegression', 'RidgeRegression', 'LassoRegression']
[0.9366573147331785, 0.9366568484072341, 0.9366479965048286]
