In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df=pd.read_csv(r'C:\Users\Usama\Desktop\houseprice\notebooks\data\gemstone.csv').drop(columns=['id'])
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
df.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
138745,0.73,Very Good,G,VS2,58.9,60.0,5.9,5.93,3.47,2846
26148,1.5,Good,I,SI2,63.8,58.0,7.22,7.18,4.59,6291
160635,0.5,Ideal,D,VS2,62.1,56.0,5.11,5.08,3.17,1760
155105,0.54,Ideal,D,VS2,61.9,56.0,5.2,5.22,3.23,1755
45334,1.05,Ideal,G,VS1,61.5,57.0,6.53,6.6,4.02,6889


In [8]:
X=df.iloc[:,:-1]

In [12]:
y=df[['price']]

In [14]:
numerical_columns=X.select_dtypes([int,float]).columns
categorical_columns=X.select_dtypes('O').columns

In [25]:
cut_categories=['Fair','Good','Very Good','Premium','Ideal']
color_categories=['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories=['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


In [52]:
categorical_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [85]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [111]:
num_pipeline=Pipeline(
    steps=[
                    ('imputer',SimpleImputer(strategy='median')),
                    ('scaler',StandardScaler())
        ]
    )

In [105]:
cat_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                ('encoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
                ('scaler',StandardScaler())])
cat_pipeline

In [106]:
preprocessor=ColumnTransformer(
    [
        ('num_pipeline',num_pipeline,numerical_columns),
        ('cat_pipeline',cat_pipeline,categorical_columns)
    ]
)


preprocessor

In [109]:
pd.DataFrame(preprocessor.fit_transform(X),columns=X.columns)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.576255,0.350768,0.402496,1.401353,1.460456,1.474414,-0.132929,-0.318009,0.016591
1,2.678513,0.165874,0.402496,2.113437,2.177120,2.200187,-1.138809,2.146431,-1.315169
2,-0.196003,-0.573702,-0.118652,-0.022816,0.008986,-0.049710,0.872951,0.298101,0.682471
3,-1.017293,-0.203914,-0.639801,-1.203614,-1.188477,-1.196432,0.872951,0.298101,0.682471
4,1.965288,0.720555,0.923644,1.743874,1.714463,1.793754,-0.132929,0.298101,0.016591
...,...,...,...,...,...,...,...,...,...
193568,-1.038906,-0.666149,-0.639801,-1.230655,-1.206620,-1.254494,0.872951,-1.550229,1.348351
193569,-0.196003,-1.405724,0.402496,0.031267,0.045273,-0.093257,-0.132929,0.298101,1.348351
193570,-0.131164,1.182790,-0.118652,0.004226,0.027129,0.124475,-1.138809,-0.318009,-0.649289
193571,-0.974067,0.997896,-1.160949,-1.140517,-1.115903,-1.051277,-1.138809,-1.550229,-0.649289


In [112]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [115]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=X.columns)
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=X.columns)

In [117]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [119]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [120]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [121]:
regression.intercept_

array([3976.8787389])

In [128]:
def evaluate_model(true,predicted):
    mse=mean_squared_error(true,predicted)
    mae=mean_absolute_error(true,predicted)
    rmse=mse**0.5
    r2_square=r2_score(true,predicted)

    result=[mse,mae,rmse,r2_square]

    return result

In [129]:
evaluate_model(y_test,regression.predict(X_test))

[1029473.3531156853, 675.0758270067466, 1014.6296630375466, 0.9362906819996047]

In [130]:
lasso_regression=Lasso()
lasso_regression.fit(X_train,y_train)
evaluate_model(y_test,lasso_regression.predict(X_test))

[1029533.150650549, 676.2421173665509, 1014.6591302750638, 0.9362869814082755]

In [131]:
Ridge_regression=Ridge()
Ridge_regression.fit(X_train,y_train)
evaluate_model(y_test,Ridge_regression.predict(X_test))

[1029482.8101268971, 675.107762978146, 1014.634323353442, 0.9362900967491631]

In [132]:
ElasticNet_regression=ElasticNet()
ElasticNet_regression.fit(X_train,y_train)
evaluate_model(y_test,ElasticNet_regression.predict(X_test))

[2351174.8713978734,
 1060.9432977143006,
 1533.3541245902309,
 0.8544967219374031]

In [137]:
pd.DataFrame([evaluate_model(y_test,regression.predict(X_test)),evaluate_model(y_test,regression.predict(X_test))])

Unnamed: 0,0,1,2,3
0,1029473.0,675.075827,1014.629663,0.936291
1,1029473.0,675.075827,1014.629663,0.936291
