Here we will use Linear Regression, Ridge Regression, Lasso Regression and ElasticNet Regression
for model training.

In [1]:
import pandas as pd 

In [12]:
# Load the gemstone data set from the relative path Data/gemstone.csv
# and preview the first rows.
df = pd.read_csv("Data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


The id column is just a serial number and has no relationship with the price,
so we can drop it.

In [13]:
# Drop the "id" column: it is only a row serial number and carries no
# information about the price.
# errors="ignore" keeps this cell safe to re-run after "id" has already been
# removed from df (a plain drop would raise KeyError on the second run).
df = df.drop(columns="id", errors="ignore")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


Split the columns into independent and dependent variables  
The target we are predicting is the price of the diamond. The price depends on carat, cut, clarity and the other columns, so price is the dependent column: it changes according to the other values.  
The other columns (clarity, carat, cut, etc.) are the independent columns.

In [29]:
# Features (X) are every column except the last one; the target (Y) is the
# last column. iloc is indexed as [rows, columns].
X, Y = df.iloc[:, :-1], df.iloc[:, -1]


X holds the independent (feature) columns.  
Y holds the dependent (target) column.  

Now divide the independent columns further into numerical columns and categorical columns.

In [41]:
# Names of the feature columns stored with object dtype (treated as categorical).
categorical_columns = X.select_dtypes(include=["object"]).columns
# Names of every remaining feature column (treated as numerical).
numerical_columns = X.select_dtypes(exclude=["object"]).columns

In [42]:
# Category order handed to OrdinalEncoder for each ordinal feature: the
# position of a label inside its list becomes its integer code (first -> 0).
# NOTE(review): cut and clarity look ordered from lowest to highest grade,
# whereas color runs D -> J, which is presumably highest to lowest grade —
# confirm this direction is intended.
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [48]:
# SimpleImputer handles missing values by filling them with the mean, median, or most frequent value.
from sklearn.impute import SimpleImputer  
# StandardScaler handles feature scaling.
from sklearn.preprocessing import StandardScaler 
# OrdinalEncoder maps categorical labels to ordered integer codes.
from sklearn.preprocessing import OrdinalEncoder 
# Pipeline chains multiple processing steps into a single estimator.
from sklearn.pipeline import Pipeline
# ColumnTransformer function is used in combining the pipelines.
from sklearn.compose import ColumnTransformer

Create the pipelines (a pipeline is a sequence of multiple processing steps).  
First, a numerical pipeline whose steps are imputing the missing values and scaling.  
Second, a categorical pipeline whose steps are imputing the missing values, encoding the categories by rank, and scaling.  

The StandardScaler() function in scikit-learn is used to standardize features by removing the mean and scaling to unit variance. This means that each feature will have a mean of 0 and a standard deviation of 1. Standardizing features is a common preprocessing step for machine learning algorithms, as it can help to improve the performance of the algorithm.

ColumnTransformer is used to combine the numerical pipeline and categorical pipeline.  

In [49]:
# Numerical pipeline: fill gaps first, then standardize.
numerical_pipeline = Pipeline(
    [
        # Replace missing numeric values with the column median.
        ("imputer", SimpleImputer(strategy="median")),
        # Rescale each feature to zero mean and unit variance.
        ("scaler", StandardScaler()),
    ]
)

# Categorical pipeline: fill gaps, encode labels as ordered integers, then standardize.
categorical_pipeline = Pipeline(
    [
        # Replace missing categorical values with the most frequent label.
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Map each label to its position in the matching category list.
        (
            "OrdinalEncoding",
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
        # Rescale the encoded values to zero mean and unit variance.
        ("scaler", StandardScaler()),
    ]
)


# Route each group of columns through its own pipeline and combine the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_pipeline", numerical_pipeline, numerical_columns),
        ("categorical_pipeline", categorical_pipeline, categorical_columns),
    ]
)

Split the data into Training data and Test data with train_test_split function.  

In [89]:
# Train_Test_Split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30
)

Fit and transform the X_train data set; for the X_test data set we only transform, using the preprocessor fitted on X_train.

In [90]:
# Fit the preprocessor on the training features only, then apply the already
# fitted preprocessor to the test features.
# The transformed arrays are wrapped back into DataFrames labelled with the
# preprocessor's output feature names.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [91]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,0.874076,1.528722,1.352731
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,-2.144558,-0.935071,-0.646786
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,-0.132136,0.296826,0.686225
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,-0.132136,0.296826,0.01972
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,0.874076,2.14467,1.352731


Training the model with Linear Regression, Lasso, Ridge, and ElasticNet Regression.  

In [92]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [94]:
# Fit a plain linear regression on the preprocessed training features
# and the training target.
regression = LinearRegression()
regression.fit(X_train,Y_train)

In [95]:
# Fitted coefficients of the linear model, one per column of X_train.
regression.coef_

array([ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
        -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
         650.76431652])

In [96]:
# Fitted intercept (constant term) of the linear model.
regression.intercept_

3970.7662895476774

In [97]:
import numpy as np
def evaluate_model(true, predicted):
    """
    Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        The actual target values (e.g. Y_test).
    predicted : array-like
        The predicted values for the same samples.

    Returns
    -------
    tuple
        (MAE, MSE, RMSE, r2_square): mean absolute error, mean squared error,
        root mean squared error, and the R2 score.
    """
    MAE = mean_absolute_error(true, predicted)
    MSE = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above; reuse it instead of
    # calling mean_squared_error a second time.
    RMSE = np.sqrt(MSE)
    r2_square = r2_score(true, predicted)
    return MAE, MSE, RMSE, r2_square

Training the model with different regression methods.

In [109]:
# Train multiple models

# Candidate regressors, keyed by the name printed in the report.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}

trained_model_list=[]  # fitted estimators, in training order
model_list=[]          # model names, in the same order
r2_list=[]             # R2 score in percent, in the same order


for model_name, model in models.items():
    # Fit on the training split. The target produced by train_test_split is
    # Y_train (capital Y); a lowercase y_train is never defined.
    model.fit(X_train, Y_train)
    trained_model_list.append(model)

    # Predict on the held-out test features.
    y_pred = model.predict(X_test)

    # The metrics below are therefore computed on the test split.
    MAE, MSE, RMSE, r2_square = evaluate_model(Y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print("RMSE:",RMSE)
    print("MAE:",MAE)
    print("MSE:",MSE)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square * 100)

    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1013.9047094344002
MAE: 674.025511579685
MAE: 1028002.7598132554
R2 score 93.68908248567512


Lasso
Model Training Performance
RMSE: 1013.8784226767013
MAE: 675.071692336216
MAE: 1027949.4559693959
R2 score 93.68940971841704


Ridge
Model Training Performance
RMSE: 1013.9059272771631
MAE: 674.0555800798204
MAE: 1028005.229367764
R2 score 93.6890673250594


Elasticnet
Model Training Performance
RMSE: 1533.4162456064048
MAE: 1060.7368759154729
MAE: 2351365.382289642
R2 score 85.56494831165182


