In [1]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import r2_score, mean_squared_error

import pandas as pd 
import numpy as np 
import joblib

In [2]:
red_wine_df = pd.read_csv('data/Red.csv')

red_wine_df.head(2)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017


In [3]:
red_wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8666 entries, 0 to 8665
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             8666 non-null   object 
 1   Country          8666 non-null   object 
 2   Region           8666 non-null   object 
 3   Winery           8666 non-null   object 
 4   Rating           8666 non-null   float64
 5   NumberOfRatings  8666 non-null   int64  
 6   Price            8666 non-null   float64
 7   Year             8666 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 541.8+ KB


In [4]:
X = red_wine_df[['Country', 'Region', 'Price']]
y = red_wine_df['Rating']

In [5]:
ct = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']), 
    (OneHotEncoder(), ['Country'])
)

print(ct)

ColumnTransformer(transformers=[('ordinalencoder', OrdinalEncoder(),
                                 ['Region']),
                                ('standardscaler', StandardScaler(), ['Price']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Country'])])


In [6]:
pipeline_wine = Pipeline(
    [('ct', ct), ('rfr', RandomForestRegressor(random_state=42))]
    )

In [7]:
pipeline_wine.fit(X,y)

In [8]:
pd.DataFrame(
    pipeline_wine['ct'].transform(X).toarray(), 
    columns = ['Price'] + ['Region'] + pipeline_wine['ct'].transformers_[2][1].get_feature_names_out().tolist()
    )

Unnamed: 0,Price,Region,Country_Argentina,Country_Australia,Country_Austria,Country_Brazil,Country_Bulgaria,Country_Canada,Country_Chile,Country_China,...,Country_Portugal,Country_Romania,Country_Slovakia,Country_Slovenia,Country_South Africa,Country_Spain,Country_Switzerland,Country_Turkey,Country_United States,Country_Uruguay
0,408.0,0.657648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,286.0,-0.278402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,542.0,-0.373184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,41.0,-0.358231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,104.0,-0.117684,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8661,290.0,-0.266981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8662,309.0,-0.224358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8663,250.0,-0.178910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8664,509.0,-0.387784,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
joblib.dump(pipeline_wine, 'pipeline_wine.pkl')

['pipeline_wine.pkl']

In [10]:
def rmse(y_hat, y):
    return mean_squared_error(y_hat, y, squared = False)

In [11]:
wine_test = pd.read_csv('data/Red_test.csv')

x_test = wine_test[['Country', 'Region', 'Price']]
y_test = wine_test['Rating']

In [12]:
y_pred = pipeline_wine.predict(x_test)

**Задание 6.1**

In [27]:
print('Метрика RMSE для тестового набора данных составляет: {}'.format(
    round(rmse(y_pred, y_test), 4))
      )

Метрика RMSE для тестового набора данных составляет: 0.0765


**Задание 6.2**

In [28]:
pipeline_wine.set_params(rfr__n_estimators = 200)

In [30]:
pipeline_wine.fit(X,y)

In [31]:
y_pred_n_estim_200 = pipeline_wine.predict(x_test)

print('Метрика RMSE для тестового набора данных составляет: {}'.format(
    round(rmse(y_pred_n_estim_200, y_test), 4))
      )


Метрика RMSE для тестового набора данных составляет: 0.0761


**Задание 6.3**

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor

estimators = [
    ('rcv', RidgeCV()),
    ('dtr', DecisionTreeRegressor(random_state=42))
    ]

stack_reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(
        n_estimators=10,
        random_state=42)
    )

# Требуется повторная трансформация исходных данных, т.к. образовавшаяся матрица сильно разрежена, и возникает ошибка обработки 
# на этапе передачи StackingRegressor-ом разреженной матрицы в final_estimator. sparse_output=False делает матрицу более "плотной"
ct = make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']), 
    (OneHotEncoder(sparse_output=False), ['Country'])  # ← sparse_output=False
)

pipeline_stack_reg = Pipeline(
    [('ct', ct), ('stack_reg', stack_reg)]
)

pipeline_stack_reg.fit(X, y)

In [None]:
pipeline_stack_reg.fit(X, y)

In [38]:
y_pred_stack_reg = pipeline_stack_reg.predict(x_test)

print('Метрика RMSE для тестового набора данных составляет: {}'.format(
    round(rmse(y_pred_stack_reg, y_test), 2))
      )


Метрика RMSE для тестового набора данных составляет: 0.18
