In [365]:
#импорт нужных библиотек
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [366]:
#загрузка набора данных
dataset = pd.read_csv("Diamonds_train.csv")
test_data = pd.read_csv("Diamonds_test.csv")

In [367]:
#Предобработка данных
#1. Изучение загруженных данных
dataset.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [368]:
#1.1 Удаление первого ненужного столбца
dataset = dataset.loc[:, 'carat':]
test_data = test_data.loc[:, 'carat':]
dataset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [369]:
#1.2 Количество строк и столбцов в датасете
dataset.shape

(43152, 10)

In [370]:
#2. Анализ пропущенных значений
#2.1 Поиск столбцов с пропущенными значениями
for col in dataset.columns:
    print(col, dataset[col].isnull().values.any())

carat False
cut False
color False
clarity False
depth True
table False
price False
x False
y False
z False


In [371]:
#2.2 Заполнение проппущенных значений
#Значения пропущены только в одном столбце, где представлены численные признаки. Заполню их значением медианы столбца
med = dataset['depth'].median()
dataset['depth'] = dataset['depth'].fillna(med)
dataset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,61.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [372]:
#3. Удаление выбросов
numeric_col = ['carat', 'depth', 'table', 'x', 'y', 'z']
main_characteristics = []
to_del = []
for col in numeric_col:
    a = dataset[col].unique()
    q1 = dataset[col].quantile(0.25) - 1.5*(dataset[col].quantile(0.75)-dataset[col].quantile(0.25))
    q2 = dataset[col].quantile(0.75) + 1.5*(dataset[col].quantile(0.75)-dataset[col].quantile(0.25))
    upper_1 = (np.where(dataset[col] > q2)[0]).tolist()
    lower_1 = (np.where(dataset[col] < q1)[0]).tolist()
    to_del += upper_1 + lower_1
to_del = set(to_del)
to_del = list(to_del)
to_del.sort()
dataset.drop(to_del, inplace=True)

In [373]:
#Новая размерность датасета
dataset.shape

(40225, 10)

In [374]:
#4. One-hot encoding
dataset = pd.get_dummies(dataset, columns=['cut', 'color', 'clarity'])
dataset.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1,0,...,0,1,0,0,0,1,0,0,0,0
5,0.24,62.8,57.0,336,3.94,3.96,2.48,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [375]:
test_data = pd.get_dummies(test_data, columns=['cut', 'color', 'clarity'])

In [376]:
#5. Normalization
price = dataset.loc[:, 'price']
price_1 = test_data.loc[:, 'price']

dataset_without_price = dataset
dataset_without_price_1 = test_data

dataset_without_price.pop('price')
dataset_without_price_1.pop('price')

scale = preprocessing.MinMaxScaler()

for col in dataset_without_price.columns:
    dataset_without_price[col] = dataset_without_price[col].astype('float')
    dataset_without_price_1[col] = dataset_without_price_1[col].astype('float')
    
    dataset_without_price[[col]] = scale.fit_transform(dataset_without_price[[col]])
    dataset_without_price_1[[col]] = scale.transform(dataset_without_price_1[[col]])
dataset_without_price_1.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.15534,0.589286,0.369748,0.285421,0.304527,0.503546,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.15534,0.553571,0.453782,0.281314,0.300412,0.498818,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.15534,0.196429,0.621849,0.297741,0.31893,0.486998,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.145631,1.053571,0.621849,0.238193,0.259259,0.501182,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.165049,0.232143,0.453782,0.342916,0.320988,0.50591,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [377]:
#Раздление на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(dataset_without_price, price, 
                                                    test_size=0.25,
                                                    random_state = 7)

In [378]:
#Обучение модели через линейную регрессию
reg = LinearRegression()
parameters = {'fit_intercept': [True],
             'normalize': [True],
             'n_jobs': [0, 1, 2, 3, 4, 5, 6]}
regressor = GridSearchCV(reg, parameters)
regressor.fit(X_train, y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

GridSearchCV(estimator=LinearRegression(),
             param_grid={'fit_intercept': [True],
                         'n_jobs': [0, 1, 2, 3, 4, 5, 6], 'normalize': [True]})

In [379]:
#Проверка полученных результатов
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [380]:
print(f'MSE = {mse}, MAE = {mae}, R2 = {r2}')
#Поскольку R2 близко к 1, модель вполне неплохая

MSE = 1254585.7002744512, MAE = 582.0625, R2 = 0.9284021923024592


In [381]:
#Проверка полученных результатов
y_pred = regressor.predict(dataset_without_price_1)
mse = mean_squared_error(price_1, y_pred)
mae = median_absolute_error(price_1, y_pred)
r2 = r2_score(price_1, y_pred)

print(f'MSE = {mse}, MAE = {mae}, R2 = {r2}')
#Поскольку R2 близко к 1, модель вполне неплохая

MSE = 794210.8883583032, MAE = 509.625, R2 = -1.3307883095643187
