In [114]:
import pandas as pd

In [115]:
df = pd.read_csv('data/diamonds.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [116]:
df = df.drop(labels= ['id'] , axis=1)

In [117]:
X = df.drop(labels= ['price'] , axis=1)
Y = df[['price']]

In [118]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74


In [119]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [120]:
Y

Unnamed: 0,price
0,326
1,326
2,327
3,334
4,335
...,...
53935,2757
53936,2757
53937,2757
53938,2757


In [121]:
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

In [122]:
cut_categories = ['Fair' , 'Good' , 'Very Good' , 'Premium' , 'Ideal']
color_categories = ['D' , 'E', 'F' , 'G' , 'H' , 'I' , 'J']
clarity_categories = ['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']

In [123]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [124]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder', OrdinalEncoder(categories=[ cut_categories , color_categories , clarity_categories])),  # Add your categories here
        ('scaler', StandardScaler())
    ]
)

In [125]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
])

In [126]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X , Y, test_size= 0.33 , random_state=30)

In [127]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns= preprocessor.get_feature_names_out())

In [128]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.964651,-0.037683,-0.649373,-1.098078,-1.025265,-1.053258,0.980577,0.238945,1.561993
1,0.428494,-1.156238,2.041845,0.688282,0.638,0.509496,0.083708,1.41669,-0.143514
2,0.259628,-0.177503,-1.994982,0.473919,0.490732,0.453181,0.980577,-0.349928,-1.280519
3,-1.006867,0.521594,-1.546446,-1.20526,-1.137882,-1.109574,0.980577,-0.349928,2.698998
4,0.449602,0.801233,0.2477,0.563237,0.57736,0.664364,-0.81316,1.41669,-0.712016


In [129]:
from sklearn.linear_model import LinearRegression , Lasso , Ridge , ElasticNet
from sklearn.metrics import r2_score , mean_absolute_error , mean_squared_error

In [130]:
regression = LinearRegression()
regression.fit(X_train , y_train)

In [131]:
regression.coef_

array([[ 5131.62695985,  -175.60681786,   -94.56953073, -1239.51354986,
           68.83277469,    11.94171554,   181.01249886,  -492.33982979,
          490.42470543]])

In [132]:
regression.intercept_

array([3925.02177703])

In [133]:
import numpy as np
def evaluate_model(true , predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true , predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true , predicted)
    return mae , rmse , r2_square

In [134]:
models = {
    'LinearRegression' : LinearRegression(),
    'Lasso'  : Lasso(),
    'Elasticnet' : ElasticNet(),
    'Ridge' : Ridge()
    
}

trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train , y_train)


    y_pred = model.predict(X_test)

    mae , rmse , r2_sqaure = evaluate_model(y_pred , y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Training performance")
    print('RMSE:' , rmse)
    print("MAE:" , mae)
    print("R2 Score :" ,r2_sqaure*100)

    r2_list.append(r2_sqaure)

    print('=' *35)
    print('\n')

LinearRegression
Model Training performance
RMSE: 0.0
MAE: 0.0
R2 Score : 100.0


Lasso
Model Training performance
RMSE: 0.0
MAE: 0.0
R2 Score : 100.0


Elasticnet
Model Training performance
RMSE: 0.0
MAE: 0.0
R2 Score : 100.0


Ridge
Model Training performance
RMSE: 0.0
MAE: 0.0
R2 Score : 100.0


