In [23]:
import pandas as pd

In [24]:
df = pd.read_csv("C:/Users/raja soni/Desktop/FSDS 2022/Projects/my_regression_gemstone/notebook/data/gemstone.csv")

In [25]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [26]:
df = df.drop(labels = ['id'], axis =1)

In [7]:
X = df.drop(labels = ['price'], axis = 1)
y = df[['price']]

In [34]:
categorical_cols = X.select_dtypes(include = 'object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [35]:
df['clarity'].unique()

array(['VS2', 'SI2', 'VS1', 'SI1', 'IF', 'VVS2', 'VVS1', 'I1'],
      dtype=object)

In [36]:
#Ordinal categorical features
cut_categories = ['Fair','Good', 'Very Good', 'Premium', 'Ideal' ]
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [37]:
from sklearn.impute import SimpleImputer # to handle missing values
from sklearn.preprocessing import StandardScaler # for feature scaling (higher values to make it in range)
from sklearn.preprocessing import OrdinalEncoder # For rank type categorical data
from sklearn.pipeline import Pipeline # to connect all the three above mentioned layer(missing value, feature scaling, feature Eng)

#to combine all the connect layers we use compose

from sklearn.compose import ColumnTransformer


In [38]:
# pipelining of numerical col and categorical cols

numerical_pip = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),


    ]

)

#Pipeling for categrical cols

categorical_pip = Pipeline(
    steps= [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())


    ]
)

# merge both the pipelining

preprocessor = ColumnTransformer(
    [
        ('numerical_pip', numerical_pip, numerical_cols),
        ('categorical_pip', categorical_pip, categorical_cols)
    ]
)

In [39]:
# train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [40]:
# fit- tranform the train and transform the test 

X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [41]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [42]:
model = LinearRegression()

In [43]:
model.fit(X_train, y_train)

In [45]:
X_train

Unnamed: 0,numerical_pip__carat,numerical_pip__depth,numerical_pip__table,numerical_pip__x,numerical_pip__y,numerical_pip__z,categorical_pip__cut,categorical_pip__color,categorical_pip__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.874100,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.874100,-0.320880,2.017037
4,-0.995648,0.258230,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.526720,-0.648127
...,...,...,...,...,...,...,...,...,...
135496,-0.629077,-1.500179,1.964434,-0.546492,-0.518125,-0.644575,-1.137644,-0.936747,-0.648127
135497,2.411307,0.443325,2.485700,1.919078,1.872797,1.930288,-1.137644,-0.320880,-0.648127
135498,0.923460,0.906065,0.400636,0.992240,0.921862,1.047891,-0.131772,0.294987,0.018164
135499,-1.038774,-0.667249,-0.641897,-1.212375,-1.197364,-1.252127,-1.137644,0.294987,2.017037


In [46]:
model.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [47]:
#Model training
#Automate the process

import numpy as np
def Evaluate_model(true, predicted):
    MSE = mean_squared_error(true, predicted)
    MAE = mean_absolute_error(true, predicted)
    R2 = r2_score(true, predicted)
    RMSE = np.sqrt(MSE)

    return MSE, MAE, R2, RMSE




In [51]:
#Train multiple models

Models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

model_list = []
R2_list = []

for i in range(len(Models)):
    model = list(Models.values())[i]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    MSE, MAE, R2, RMSE = Evaluate_model(y_test, y_pred)

    model_list.append(list(Models.keys())[i])
    R2_list.append(R2)
    print("Model:" , list(Models.keys())[i])
    print("MSE:", MSE)
    print("MAE:", MAE)
    print("RMSE:", RMSE)
    print("R2:", R2)
    print("*"*34)
    print("\n")

    

Model: LinearRegression
MSE: 1029473.3531156846
MAE: 675.0758270067483
RMSE: 1014.6296630375463
R2: 0.9362906819996049
**********************************


Model: Lasso
MSE: 1029533.150650549
MAE: 676.2421173665509
RMSE: 1014.6591302750638
R2: 0.9362869814082755
**********************************


Model: Ridge
MSE: 1029482.810126896
MAE: 675.1077629781366
RMSE: 1014.6343233534415
R2: 0.9362900967491631
**********************************


Model: ElasticNet
MSE: 2351174.8713978743
MAE: 1060.9432977143008
RMSE: 1533.3541245902313
R2: 0.8544967219374031
**********************************


