In [91]:
import pandas as pd
import numpy as np

In [92]:
# Load the gemstone dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="gemstone.csv")

In [93]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [94]:
# Remove the "id" column before the feature matrix is built.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: executing it a second time is a no-op rather than
# a KeyError for the already-missing column.
df = df.drop(columns="id", errors="ignore")

In [95]:
# Separate the predictors (X) from the regression target (y, the "price" column).
X = df.drop(columns=["price"])
y = df["price"]

In [96]:
# Column names grouped by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
# (Equivalent alternative for the numerical names:
#  X.select_dtypes(exclude="object").columns)
categorical = X.columns[X.dtypes == "object"].tolist()
numerical = X.columns[X.dtypes != "object"].tolist()

# Explicit category orderings for the ordinal encoder; the position of a value
# in its list is the integer it is encoded to.
cut = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color = ["D", "E", "F", "G", "H", "I", "J"]
clarity = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [97]:
# Sklearn libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [98]:
# Preprocessing: one pipeline per feature type, combined by a ColumnTransformer.

# Numerical features: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value,
# map each category to its position in the supplied ordering, then standardize.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # NOTE(review): `categories` is matched to columns by position, so this
    # assumes `categorical` is ordered [cut, color, clarity] -- confirm against
    # the column order of the input frame.
    ("ordinalencoder", OrdinalEncoder(categories=[cut, color, clarity])),
    ("scaler", StandardScaler()),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num_pipeline", num_pipeline, numerical),
    ("cat_pipeline", cat_pipeline, categorical),
])


In [99]:
# train test split
from sklearn.model_selection import train_test_split


In [100]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=30,
)

In [37]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transform to the test split. Both results are wrapped back into
# DataFrames labelled with the transformer's output feature names.
# NOTE(review): this cell overwrites X_train / X_test with their transformed
# versions, so re-running it without re-running the split cell would feed
# transformed data back into the preprocessor.
X_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
)
X_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
)

In [52]:
# Model trianing

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score


In [47]:
# Baseline model: ordinary least squares fitted on the preprocessed training data.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [48]:
# Display the coefficients learned by the fitted linear model
# (one value per column of the transformed X_train).
regression.coef_

array([ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
        -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
         650.76431652])

In [49]:
# Display the intercept (bias term) learned by the fitted linear model.
regression.intercept_

3970.7662895476774

In [84]:
def evaluate(actual, pred):
    """Compute regression error metrics for a set of predictions.

    Args:
        actual: Ground-truth target values.
        pred: Predicted values, aligned element-wise with ``actual``.

    Returns:
        A tuple ``(mae, mse, rmse, r_square)``: mean absolute error, mean
        squared error, root mean squared error and the R^2 score.
    """
    # The MSE is computed once and reused for the RMSE.
    squared_error = mean_squared_error(actual, pred)
    return (
        mean_absolute_error(actual, pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(actual, pred),
    )



In [88]:
# Score the baseline linear regression on the held-out test split.
evaluate(actual=y_test, pred=regression.predict(X_test))

(674.0255115796832, 1028002.7598132559, 1013.9047094344004, 0.9368908248567511)

In [89]:
# Compare several linear models on the same train/test split.

models = {
    "Liner_Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Elasticnet": ElasticNet(),
}

# Parallel result lists: model_list[k] is the name of the model whose
# R^2 score is rsquare[k].
model_list = []
rsquare = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Predict on the held-out split and compute the error metrics.
    y_pred = model.predict(X_test)
    mae, mse, rmse, r_square = evaluate(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model Performace")
    print("RMSE", rmse)
    print("MAE", mae)
    print("r2 score", r_square * 100)

    # Store the scalar score (r_square), not the accumulator list (rsquare).
    rsquare.append(r_square)

    print("=" * 35)
    print("\n")






Liner_Regression
Model Performace
RMSE 1013.9047094344004
MAE 674.0255115796832
r2 score 93.68908248567512


Lasso
Model Performace
RMSE 1013.8784226767013
MAE 675.0716923362162
r2 score 93.68940971841704


Ridge
Model Performace
RMSE 1013.9059272771639
MAE 674.0555800798253
r2 score 93.68906732505938


Elasticnet
Model Performace
RMSE 1533.4162456064048
MAE 1060.7368759154729
r2 score 85.56494831165182


