In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [4]:
df = pd.read_csv("diamond.csv")
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,-1.469676,Ideal,E,SI2,61.5,55.0,5.786897,3.95,1.60543,1.23256
1,-1.560648,Premium,E,SI1,59.8,61.0,5.786897,3.89,1.576915,1.196948
2,-1.469676,Good,E,VS1,56.9,65.0,5.78996,4.05,1.623341,1.196948
3,-1.237874,Premium,I,VS2,62.4,58.0,5.811141,4.2,1.654411,1.289233
4,-1.171183,Good,J,SI2,63.3,58.0,5.814131,4.34,1.677097,1.321756


In [5]:
X = df.drop("price", axis = 1)
y = df["price"]

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [31]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [26]:
train =[]
test = []
cv = []
mae = []

step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components = 23)
model = LinearRegression()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

for i in range(0, 100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i)
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    train_accuracy = pipe.score(X_train, y_train)
    test_accuracy = pipe.score(X_test, y_test)
    score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
    mae_value = mean_absolute_error(y_test, pred)
    train.append(train_accuracy)
    test.append(test_accuracy)
    cv.append(score)
    mae.append(mae_value)

random = pd.DataFrame({
    "train": train,
    "test": test,
    "cv": cv,
    "mae": mae
})

In [30]:
random.sort_values(by = "mae")

Unnamed: 0,train,test,cv,mae
63,0.982527,0.983702,0.982493,0.100715
29,0.982552,0.983581,0.982524,0.101262
70,0.982556,0.983541,0.982506,0.101265
43,0.982562,0.983538,0.982526,0.101573
8,0.982663,0.983146,0.982623,0.101731
...,...,...,...,...
52,0.982933,0.982076,0.982905,0.104451
96,0.982712,0.982939,0.982667,0.104537
58,0.982893,0.982231,0.982846,0.105006
33,0.982857,0.982379,0.982806,0.105069


***Train_Test_Split***

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 63)

# ***Hyperparameter Tuning***

***PCA***

In [33]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA()
model = LinearRegression()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"pca__n_components": list(range(1, 50))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'pca__n_components': 23}

***PolynomialFeatures***

In [35]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = PolynomialFeatures()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__degree": list(range(1, 10))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__degree': 1}

***Ridge***

In [34]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = Ridge()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__alpha": [0.001, 0.1, 1, 2, 3, 4, 5, 10, 15, 20]}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__alpha': 0.1}

***Lasso***

In [36]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = Lasso()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__alpha": [0.001, 0.1, 1, 2, 3, 4, 5, 10, 15, 20]}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__alpha': 0.001}

***ElasticNet***

In [37]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = ElasticNet()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__alpha": [0.001, 0.1, 1, 2, 3, 4, 5, 10, 15, 20], "model__l1_ratio":[0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 1]}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__alpha': 0.001, 'model__l1_ratio': 0.3}

***SVR***

In [39]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = SVR()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__C": [0.001, 0.1, 1, 10], "model__kernel":["linear", "sigmoid", "poly", "rbf"]}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

***KNeighborsRegressor***

In [40]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = KNeighborsRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__n_neighbors": list(range(1, 10))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__n_neighbors': 5}

***DecisionTreeRegressor***

In [42]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = DecisionTreeRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__max_depth": list(range(9, 20))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__max_depth': 12}

***RandomForestRegressor***

In [44]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = RandomForestRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__n_estimators": list(range(9, 15))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__n_estimators': 14}

***AdaBoostRegressor***

In [45]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = AdaBoostRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__n_estimators": list(range(10, 20))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__n_estimators': 19}

***GradientBoostingRegressor***

In [46]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = GradientBoostingRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__n_estimators": list(range(19, 30))}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_

{'model__n_estimators': 29}

***XGBRegressor***

In [None]:
step = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(sparse = False, drop = "first"), [1, 2, 3]),
    ("scaler", StandardScaler(), [0, 4, 5, 6, 7, 8])
], remainder = "passthrough")

dr = PCA(n_components= 23)
model = XGBRegressor()

pipe = Pipeline([
    ("step", step),
    ("pca", dr),
    ("model", model)
])

param_grid = {"model__n_estimators": list(range(1, 10)), "model__learning_rate": [0, 0.2, 0.3, 0.5, 0.7, 1]}

grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = "neg_mean_squared_error")
grid.fit(X_train, y_train)
grid.best_params_