In [None]:
import pandas as pd

# Load the diamonds dataset from a CSV file in the current working directory.
diamonds = pd.read_csv("diamonds.csv")

In [None]:
diamonds.head()

*Removing column "Unnamed: 0"*

In [None]:
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so re-running this cell does not raise KeyError.
diamonds = diamonds.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
diamonds

In [None]:
diamonds.info()

In [None]:
diamonds["cut"].value_counts()

In [None]:
diamonds["color"].value_counts()

In [None]:
diamonds["clarity"].value_counts()

In [None]:
diamonds.describe()

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

# Grid of histograms over the frame's columns, 25 bins each, on a 20x15-inch figure.
diamonds.hist(bins=25, figsize=(20, 15))

plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state keeps it reproducible.
train_set, test_set = train_test_split(diamonds, test_size=0.2, random_state=42)

In [None]:
test_set.head()

In [None]:
diamonds["price"].hist()

In [None]:
import numpy as np

# Bucket price into five categories (labels 1-5); the last bin is open-ended.
# The categories are used further down to stratify the train/test split.
diamonds["price_cat"] = pd.cut(
    diamonds["price"],
    labels=list(range(1, 6)),
    bins=[0, 3750, 7500, 11250, 15000, np.inf],
)

In [None]:
diamonds["price_cat"].hist()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single 80/20 shuffle split that preserves the price_cat proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields positional row indices, so select with .iloc; .loc would treat
# the numbers as index labels and is only correct for a default RangeIndex.
for train_index, test_index in split.split(diamonds, diamonds["price_cat"]):
    strat_train_set = diamonds.iloc[train_index]
    strat_test_set = diamonds.iloc[test_index]

In [None]:
strat_test_set["price_cat"].value_counts() / len(strat_test_set)

In [None]:
diamonds["price_cat"].value_counts() / len(diamonds)

In [None]:
def price_cat_proportions(data):
    """Return each price category's share of the rows in ``data``.

    The counts come from ``data["price_cat"].value_counts()`` and are divided
    by ``len(data)``.
    """
    category_counts = data["price_cat"].value_counts()
    total_rows = len(data)
    return category_counts / total_rows

# Redo the purely random split so its test set also carries "price_cat".
train_set, test_set = train_test_split(diamonds, test_size=0.2, random_state=42)

# Category shares for the full data and for each kind of test set.
proportion_sources = {
    "Overall": diamonds,
    "Stratified": strat_test_set,
    "Random": test_set,
}
compare_props = pd.DataFrame(
    {label: price_cat_proportions(frame) for label, frame in proportion_sources.items()}
).sort_index()

# Relative deviation, in percent, of each test set's shares from the overall shares.
for error_column, source_column in (("Rand. %error", "Random"), ("Strat. %error", "Stratified")):
    compare_props[error_column] = 100 * compare_props[source_column] / compare_props["Overall"] - 100

In [None]:
compare_props

In [None]:
# price_cat was only needed for stratification. Reassign rather than drop in
# place, and ignore a missing column, so the cell can be re-run without KeyError.
strat_train_set = strat_train_set.drop(columns="price_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="price_cat", errors="ignore")

In [None]:
# From here on `diamonds` is a copy of the stratified training set, so the
# columns added during exploration do not end up in strat_train_set.
diamonds = strat_train_set.copy()

In [None]:
%matplotlib notebook

# Build a fresh 8x8-inch figure holding a single 3D axes.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the points: position is (x, y, z), marker size scales with carat,
# and colour encodes price through the "jet" colormap.
scatter = ax.scatter(diamonds['x'], diamonds['y'], diamonds['z'], 
                     s=diamonds["carat"] * 100, label="carat",
                     c=diamonds["price"], cmap="jet", 
                     alpha=0.2)

# Set labels and title
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.set_title('3D Scatter Plot of Diamonds Data')

# Fixed axis ranges.
# NOTE(review): presumably chosen to cover the largest x/y/z values — confirm
# against diamonds.describe().
ax.set_xlim(0, 11)
ax.set_ylim(0, 60)
ax.set_zlim(0, 32)

# Colour bar for the price colouring of the scatter points.
cbar = fig.colorbar(scatter, ax=ax, label='Price')

plt.legend()

# Show plot
plt.show()

In [None]:
corr_matrix = diamonds.corr(numeric_only=True)

In [None]:
corr_matrix["price"].sort_values(ascending=False)

In [None]:
%matplotlib inline

from pandas.plotting import scatter_matrix

# Pairwise scatter plots restricted to price and a handful of numeric attributes.
attributes = ["price", "carat", "x", "y", "z", "table"]

scatter_matrix(diamonds[attributes], figsize=(10, 10))

plt.show()

In [None]:
diamonds.plot(kind="scatter", x="carat", y="price", alpha=0.1)

In [None]:
# Derived size features: face area (x * y) and box volume (x * y * z).
face_area = diamonds["x"] * diamonds["y"]
diamonds["area"] = face_area
diamonds["volume"] = face_area * diamonds["z"]

In [None]:
# numeric_only is a boolean flag; pass True rather than the string "True",
# which only worked because a non-empty string is truthy.
corr_matrix = diamonds.corr(numeric_only=True)

In [None]:
corr_matrix["price"].sort_values(ascending=False)

In [None]:
# Separate the target from the predictors, starting again from the untouched
# stratified training set.
diamonds_labels = strat_train_set["price"].copy()
diamonds = strat_train_set.drop(columns="price")

In [None]:
# First few rows that contain at least one missing value (empty if there are none).
sample_incomplete_rows = diamonds.loc[diamonds.isna().any(axis="columns")].head()
sample_incomplete_rows

In [None]:
diamonds_cat = diamonds[["cut"]]
diamonds_cat.head(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the "cut" column as one number per category and show the first ten rows.
# NOTE(review): no explicit category order is passed, so the numeric codes
# presumably do not follow cut quality — check ordinal_encoder.categories_.
ordinal_encoder = OrdinalEncoder()
diamonds_cat_encoded = ordinal_encoder.fit_transform(diamonds_cat)
diamonds_cat_encoded[:10]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the "cut" column; the next cell calls .toarray() on the result
# to view it as a dense array.
cat_encoder = OneHotEncoder()
diamonds_cat_1hot = cat_encoder.fit_transform(diamonds_cat)
diamonds_cat_1hot

In [None]:
diamonds_cat_1hot.toarray()

In [None]:
diamonds

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the x, y and z columns in `diamonds`; the transformer
# defined below reads them when it is handed a bare NumPy array.
col_names = ("x", "y", "z")
x_ix, y_ix, z_ix = (diamonds.columns.get_loc(name) for name in col_names)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ``area`` (x * y) and ``volume`` (x * y * z) columns to a 2-D array.

    Parameters
    ----------
    x_ix, y_ix, z_ix : int or None, default=None
        Column positions of the x, y and z attributes in the array given to
        ``transform``. When left as ``None``, the module-level variable of the
        same name is read at transform time, which is how the class behaved
        before these parameters existed.
    """

    def __init__(self, x_ix=None, y_ix=None, z_ix=None):
        # Stored unmodified under the parameter names so that BaseEstimator's
        # get_params/set_params (and therefore cloning) keep working.
        self.x_ix = x_ix
        self.y_ix = y_ix
        self.z_ix = z_ix

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` with the area and volume columns appended on the right."""
        xi, yi, zi = self._column_indices()
        area = X[:, xi] * X[:, yi]
        volume = X[:, xi] * X[:, yi] * X[:, zi]

        return np.c_[X, area, volume]

    def _column_indices(self):
        """Resolve the x/y/z positions, falling back to the module-level indices."""
        xi = x_ix if self.x_ix is None else self.x_ix
        yi = y_ix if self.y_ix is None else self.y_ix
        zi = z_ix if self.z_ix is None else self.z_ix
        return xi, yi, zi
    
# Apply the transformer to the raw values of `diamonds`; the result is a plain
# array with two extra columns on the right.
attr_adder = CombinedAttributesAdder()
diamonds_extra_attribs = attr_adder.transform(diamonds.values)

In [None]:
# Wrap the augmented array back into a labelled frame: the original columns
# followed by the two derived ones, keeping the original row index.
diamonds_extra_attribs = pd.DataFrame(
    data=diamonds_extra_attribs,
    index=diamonds.index,
    columns=[*diamonds.columns, "area", "volume"],
)

diamonds_extra_attribs.head()

In [None]:
# Numeric-only view of the predictors: everything except the three categorical columns.
diamonds_num = diamonds.drop(columns=["cut", "color", "clarity"])

In [None]:
diamonds_num

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# x/y/z positions within the numeric-only frame. CombinedAttributesAdder() is
# created without arguments below and reads these module-level indices when it
# transforms.
col_names = "x", "y", "z"
x_ix, y_ix, z_ix = [diamonds_num.columns.get_loc(c) for c in col_names]

# Numeric preprocessing: fill missing values with the column median, append the
# derived area/volume columns, then standardise every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

diamonds_num_tr = num_pipeline.fit_transform(diamonds_num)

In [None]:
diamonds_num_tr

In [None]:
diamonds

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = ["carat", "depth", "table", "x", "y", "z"]
cat_attribs = ["cut", "color", "clarity"]

# The "num" branch receives exactly the `num_attribs` columns, in this order,
# so take the x/y/z positions from that list rather than from the separately
# built `diamonds_num` frame.
col_names = "x", "y", "z"
x_ix, y_ix, z_ix = [num_attribs.index(c) for c in col_names]

# Numeric columns go through num_pipeline; categorical columns are one-hot
# encoded. The two outputs are concatenated in that order.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

diamonds_prepared = full_pipeline.fit_transform(diamonds)

In [None]:
diamonds_prepared

In [None]:
diamonds_prepared.shape

In [None]:
diamonds_prepared

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary linear regression on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(diamonds_prepared, diamonds_labels)

In [None]:
# Sanity check on the first five training rows: push them through the already
# fitted preprocessing (transform, not fit_transform) and predict.
some_data = diamonds.iloc[:5]
some_labels = diamonds_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

In [None]:
some_data_prepared

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE: the predictions are made on the same data the model was
# fitted on, so this is not an estimate of generalisation error.
diamonds_predictions = lin_reg.predict(diamonds_prepared)
lin_mse = mean_squared_error(diamonds_labels, diamonds_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with a fixed seed, fitted on the prepared training data.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(diamonds_prepared, diamonds_labels)

In [None]:
# Training-set RMSE for the tree, measured on the data it was fitted on.
diamonds_predictions = tree_reg.predict(diamonds_prepared)
tree_mse = mean_squared_error(diamonds_labels, diamonds_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree. The scorer returns negated MSE, so the
# sign is flipped before taking the square root.
scores = cross_val_score(tree_reg, diamonds_prepared, diamonds_labels, scoring="neg_mean_squared_error", cv=10)

tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()``.
    """
    report_lines = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report_lines:
        print(label, value)
    
display_scores(tree_rmse_scores)

In [None]:
# Same 10-fold cross-validation for the linear model, so it can be compared
# with the tree on equal terms.
lin_scores = cross_val_score(lin_reg, diamonds_prepared, diamonds_labels, scoring="neg_mean_squared_error", cv=10)

# The scorer returns negated MSE; flip the sign before the square root.
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, fitted on the prepared training data.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(diamonds_prepared, diamonds_labels)

In [None]:
# Training-set RMSE for the forest, measured on the data it was fitted on.
diamonds_predictions = forest_reg.predict(diamonds_prepared)
forest_mse = mean_squared_error(diamonds_labels, diamonds_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validated RMSE for the forest (the scorer returns negated MSE).
forest_scores = cross_val_score(forest_reg, diamonds_prepared, diamonds_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 4 x 4 combinations that leave `bootstrap` at its default, plus
# 2 x 2 combinations with bootstrapping switched off.
# NOTE(review): max_features should not exceed the number of columns in
# diamonds_prepared (see diamonds_prepared.shape) — confirm 25 and 30 are valid.
param_grid = [
    {'n_estimators': [125, 150, 175, 200], 'max_features': [15, 20, 25, 30]},
    {'bootstrap': [False], 'n_estimators': [30, 128], 'max_features': [8, 14]}
]

forest_reg = RandomForestRegressor(random_state=42)

# 5-fold CV per candidate, scored by negated MSE; training scores are kept too.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(diamonds_prepared, diamonds_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
# Print the cross-validated RMSE of every candidate next to its parameters.
cvres = grid_search.cv_results_
candidate_scores = zip(cvres["mean_test_score"], cvres["params"])
for neg_mse, candidate in candidate_scores:
    print(np.sqrt(-neg_mse), candidate)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions to sample from; scipy's randint excludes the `high` bound.
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

# 10 sampled candidates, each evaluated with 5-fold CV on negated MSE.
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                               n_iter=10, cv=5, scoring="neg_mean_squared_error", random_state=42)

rnd_search.fit(diamonds_prepared, diamonds_labels)

In [None]:
# Print the cross-validated RMSE of every sampled candidate next to its parameters.
cvres = rnd_search.cv_results_
sampled_scores = zip(cvres["mean_test_score"], cvres["params"])
for neg_mse, candidate in sampled_scores:
    print(np.sqrt(-neg_mse), candidate)

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Column names in the order full_pipeline emits them: the numeric attributes,
# the two derived attributes, then the one-hot columns.
extra_attribs = ["area", "volume"]
cat_encoder = full_pipeline.named_transformers_["cat"]
# The encoder was fitted on every column in `cat_attribs`, and categories_
# holds one array per column. Name each one-hot column "<attribute>_<category>"
# so the list covers all encoded columns, not just the first attribute's.
cat_one_hot_attribs = [
    f"{attrib}_{category}"
    for attrib, categories in zip(cat_attribs, cat_encoder.categories_)
    for category in categories
]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# Final evaluation of the best grid-search estimator on the held-out
# stratified test set.
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("price", axis=1)
y_test = strat_test_set["price"].copy()

# transform only: the preprocessing is not refitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

In [None]:
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2

# t-based confidence interval for the mean squared error, converted to RMSE
# units by the square root on the last line.
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

# Exercises

### 1. Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [None]:
from sklearn.svm import SVR

# Two sub-grids: 8 values of C for the linear kernel, and 7 x 6 combinations of
# C and gamma for the RBF kernel.
param_grid = [
    {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.]},
    {'kernel': ['rbf'], 'C': [1., 3., 10., 30., 100., 300., 1000.], 'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
]

# This rebinds `grid_search`, which previously held the random-forest search.
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(diamonds_prepared, diamonds_labels)

In [None]:
# best_score_ is the negated MSE of the best candidate; convert it to an RMSE.
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

In [None]:
grid_search.best_params_

### 2. Try replacing `GridSearchCV` with `RandomizedSearchCV`. 

In [None]:
# NOTE(review): looks like a leftover smoke-test cell; nothing below uses it.
print("test")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

# Search space: kernel is drawn from the list, C from a reciprocal
# distribution on [20, 1000], and gamma from an exponential distribution
# with scale 1.0.
param_distribs = {
    'kernel': ['linear', 'rbf'],
    'C': reciprocal(20, 1000),
    'gamma': expon(scale=1.0),
}

# 50 sampled candidates, each evaluated with 5-fold CV on negated MSE.
svm_reg = SVR()
rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs, n_iter=50, cv=5, 
                                scoring='neg_mean_squared_error', verbose=2, random_state=42)

rnd_search.fit(diamonds_prepared, diamonds_labels)

In [None]:
# best_score_ is the negated MSE of the best sampled candidate; convert it to an RMSE.
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

In [None]:
rnd_search.best_params_

### 3. Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [None]:
def indices_of_top_k(arr, k):
    """Return the positions of the ``k`` largest values in ``arr``, sorted ascending.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of positions to return; ``0`` yields an empty array.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``arr``, in increasing order.
    """
    if k == 0:
        # A slice of [-0:] is the whole array, so without this guard k=0 would
        # select every position instead of none.
        return np.array([], dtype=np.intp)
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``k`` columns with the highest feature importances.

    ``feature_importances`` holds one score per input column and ``k`` is the
    number of columns to retain.
    """

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Compute and store the indices of the top-``k`` columns; return ``self``."""
        top_indices = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = top_indices
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected during ``fit``."""
        selected_columns = self.feature_indices_
        return X[:, selected_columns]

In [None]:
k = 5

In [None]:
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

In [None]:
np.array(attributes)[top_k_feature_indices]

In [None]:
sorted(zip(feature_importances, attributes), reverse=True)[:k]

In [None]:
# Full preprocessing followed by selection of the k most important columns,
# ranked by the feature importances computed earlier.
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

In [None]:
diamonds_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(diamonds)

In [None]:
diamonds_prepared_top_k_features[0:3]

In [None]:
diamonds_prepared[0:3, top_k_feature_indices]

### 4. Try creating a single pipeline that does the full data preparation plus the final prediction

In [None]:
# End-to-end pipeline: preprocessing, top-k feature selection, then an SVR
# configured with the best parameters found by the randomized search.
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

In [None]:
prepare_select_and_predict_pipeline.fit(diamonds, diamonds_labels)

In [None]:
# Spot check: the pipeline takes raw (unprepared) rows, so the first four
# training rows are passed in directly.
some_data = diamonds.iloc[:4]
some_labels = diamonds_labels.iloc[:4]

print("Predictions:\t", prepare_select_and_predict_pipeline.predict(some_data))
print("Labels:\t\t", list(some_labels))

### 5. Automatically explore some preparation options using `GridSearchCV`.

In [None]:
# Rebuild the preprocessing with handle_unknown='ignore', so a category the
# encoder did not see while fitting on a CV fold is encoded as all zeros
# instead of raising at transform time.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(**rnd_search.best_params_))
])

# Parameters are addressed as <step>__<sub-step>__<param>: try every imputation
# strategy combined with every possible number of selected features.
param_grid = [{
    'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'feature_selection__k': list(range(1, len(feature_importances) + 1))
}]

grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5, 
                                scoring='neg_mean_squared_error', verbose=2)
grid_search_prep.fit(diamonds, diamonds_labels)

In [None]:
grid_search_prep.best_params_