In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the gemstone dataset and discard its 'id' column up front.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs where that exact file exists — consider a project-relative path.
df = pd.read_csv(
    r'C:\Users\Usama\Desktop\houseprice\notebooks\data\gemstone.csv'
).drop(columns=['id'])
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [8]:
# Feature matrix: every column except the target. The target is excluded by
# name (the same 'price' label used to build y) rather than by position, so the
# split stays correct even if the column order of the CSV changes.
X = df.drop(columns=['price'])

In [12]:
# Target: the 'price' column, kept as a single-column DataFrame (2-D) rather
# than a Series because it is selected with a list of labels.
y = df.loc[:, ['price']]

In [14]:
# Partition the feature columns by dtype: int/float columns go through the
# numerical pipeline, object-dtype columns through the categorical one.
numerical_columns = X.select_dtypes(include=[int, float]).columns
categorical_columns = X.select_dtypes(include='O').columns

In [25]:
# Explicit category orderings handed to the OrdinalEncoder: the position of a
# label in its list becomes the integer code assigned to it.
# NOTE(review): presumably each list is a deliberate quality ranking — confirm
# the intended direction, especially for color.
cut_categories = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_categories = list('DEFGHIJ')
clarity_categories = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]


In [85]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [111]:
# Numerical features: fill missing values with the column median, then
# standardize with StandardScaler.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [105]:
# Categorical features: fill missing values with the most frequent label,
# map labels to integer codes using the explicit orderings, then standardize
# the resulting codes.
# The category lists are matched to columns by position, so their order here
# must follow the order of the categorical columns (cut, color, clarity).
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'encoder',
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
        ('scaler', StandardScaler()),
    ]
)
cat_pipeline

In [106]:
# Route each group of columns through its own pipeline. The transformed output
# is laid out in the order the transformers are listed here: numerical columns
# first, categorical columns second.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_columns),
        ('cat_pipeline', cat_pipeline, categorical_columns),
    ]
)

preprocessor

In [112]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [115]:
# ColumnTransformer lays out its output in the order its transformers were
# registered (numerical columns first, then categorical), NOT in the original
# column order of X. Labelling the result with X.columns would therefore attach
# the wrong names to most features, so the labels are rebuilt in output order.
transformed_columns = list(numerical_columns) + list(categorical_columns)

# Fit the preprocessing on the training split only, then reuse the fitted
# statistics for the test split. The original row index is carried over so the
# features stay aligned with y_train / y_test.
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so it must be run exactly once after the train/test split cell.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=transformed_columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=transformed_columns,
    index=X_test.index,
)

In [117]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [164]:
def evaluate_model(true,predicted):
    """Score regression predictions against the ground-truth values.

    Parameters
    ----------
    true
        Observed target values.
    predicted
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error (square root of the mean squared error), and the R^2 score,
        in that order.
    """
    r2_square = r2_score(true, predicted)
    mae = mean_absolute_error(true, predicted)
    rmse = mean_squared_error(true, predicted) ** 0.5

    return mae, rmse, r2_square

In [169]:
# Candidate linear regressors, each constructed with its default settings.
models={
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
}

# Parallel lists: model_list[k] is the name whose R2 score is r2_list[k].
model_list=[]
r2_list=[]

# Iterate over (name, estimator) pairs directly instead of indexing into
# freshly rebuilt key/value lists on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE(review): the label says "Training", but the metrics printed below
    # are computed on X_test / y_test. The text is left unchanged to match the
    # recorded output; consider renaming it to "Model Test Performance".
    print('Model Training Performance')
    print('RMSE', rmse)
    print('MAE', mae)
    print('R2 Score', r2_square)

    r2_list.append(r2_square)

    print('='*35, end='\n\n')

LinearRegression
Model Training Performance
RMSE 1014.6296630375466
MAE 675.0758270067466
R2 Score 0.9362906819996047

Ridge
Model Training Performance
RMSE 1014.634323353442
MAE 675.107762978146
R2 Score 0.9362900967491631

Lasso
Model Training Performance
RMSE 1014.6591302750638
MAE 676.2421173665509
R2 Score 0.9362869814082755

ElasticNet
Model Training Performance
RMSE 1533.3541245902309
MAE 1060.9432977143006
R2 Score 0.8544967219374031

