<h1><b>Задача по предсказанию рейтинга шоколада</b></h1>

<h3><b>Никнейм на Kaggle: </b></h3>

# Описание задачи

In [1]:
# Given the characteristics of chocolate bars, predict their rating.

# Data description
# Company - manufacturer
# Specific Bean Origin - geographic region of origin
# REF - value indicating when the record was entered into the database (the higher the value, the "fresher" the record)
# Review - publication date of the review
# Cocoa Percent - cocoa percentage
# Company Location - manufacturer's country
# Rating - expert rating
# Bean Type - variety of cocoa bean used, if any
# Broad Bean Origin - geographic region of origin of the cocoa beans

# Data files
# choco_train.csv - training data
# choco_test_new.csv - test data
# choco_sample_submission.csv - sample submission

# Импорт библиотек, константы

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and other library
# warnings — consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRanker, LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
import optuna

In [6]:
import association_metrics as am
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_percentage_error as MAPE, mean_absolute_error as MAE, r2_score, mean_squared_error as MSE

from sklearn.pipeline import Pipeline

In [7]:
# Shared seed constant; none of the cells visible here pass it anywhere yet —
# TODO confirm the split/model code actually uses it.
RANDOM_STATE = 42

# Загрузка данных

In [13]:
# Both CSV files sit in the same GitHub folder, so the common prefix is spelled once.
_DATASETS_ROOT = "https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets"
TRAIN = _DATASETS_ROOT + "/chocolate_train.csv"  # training set
TEST = _DATASETS_ROOT + "/chocolate_test_new.csv"  # test set

In [14]:
# Download and parse the training CSV from the TRAIN URL (needs network access).
train_df = pd.read_csv(TRAIN)

# Обзор данных

In [15]:
# Peek at the first rows to see the raw column values and their formats.
train_df.head()

Unnamed: 0,Company,Specific Bean Origin,REF,Review,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Willie's Cacao,Rio Caribe,457,2009,72%,U.K.,3.25,Trinitario,Venezuela
1,Beschle (Felchlin),"Ocumare, Premier Cru, Quizas No. 2",508,2010,72%,Switzerland,3.5,,Venezuela
2,Dark Forest,Tanzania,1554,2015,70%,U.S.A.,3.0,,Tanzania
3,Brasstown aka It's Chocolate,Cooproagro,1125,2013,72%,U.S.A.,3.0,Trinitario,Dominican Republic
4,Pralus,"Java, Indonesie",32,2006,75%,France,3.5,Criollo,Indonesia


In [16]:
# (rows, columns) of the training frame.
train_df.shape

(1255, 9)

In [10]:
# Column dtypes and non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Company               1255 non-null   object 
 1   Specific Bean Origin  1255 non-null   object 
 2   REF                   1255 non-null   int64  
 3   Review                1255 non-null   int64  
 4   Cocoa Percent         1255 non-null   object 
 5   Company Location      1255 non-null   object 
 6   Rating                1255 non-null   float64
 7   Bean Type             1254 non-null   object 
 8   Broad Bean Origin     1254 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 88.4+ KB


In [21]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numeric (this includes the Rating target).
cat_cols = train_df.select_dtypes(include=['object']).columns
numeric_cols = train_df.select_dtypes(exclude=['object']).columns

In [26]:
# Summary statistics; on a mixed-dtype frame describe() reports the numeric columns only.
train_df.describe()

Unnamed: 0,REF,Review,Rating
count,1255.0,1255.0,1255.0
mean,1045.152191,2012.38247,3.176494
std,551.284249,2.922499,0.478948
min,5.0,2006.0,1.0
25%,593.0,2010.0,2.75
50%,1077.0,2013.0,3.25
75%,1514.0,2015.0,3.5
max,1952.0,2017.0,5.0


In [27]:
# Count / unique / most frequent value for each object-dtype column.
train_df[cat_cols].describe()

Unnamed: 0,Company,Specific Bean Origin,Cocoa Percent,Company Location,Bean Type,Broad Bean Origin
count,1255,1255,1255,1255,1254.0,1254
unique,376,762,40,58,38.0,84
top,Soma,Madagascar,70%,U.S.A.,,Venezuela
freq,35,39,461,521,628.0,144


In [34]:
# The most frequent 'Bean Type' renders as blank in the summary table;
# pull the raw value out to see what it really is.
train_df['Bean Type'].describe().loc['top']

'\xa0'

In [55]:
# Per-column count of cells that are exactly the non-breaking-space placeholder.
train_df.eq('\xa0').sum()

Company                   0
Specific Bean Origin      0
REF                       0
Review                    0
Cocoa Percent             0
Company Location          0
Rating                    0
Bean Type               628
Broad Bean Origin        55
dtype: int64

In [44]:
# Number of distinct 'Specific Bean Origin' values — a cardinality check for this column.
train_df['Specific Bean Origin'].nunique()

762

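Промежуточный вывод: <br>

- В обучающей выборке 1255 строк и 9 столбцов; целевая переменная `Rating` лежит в диапазоне от 1.0 до 5.0.
- `Cocoa Percent` хранится строкой вида `72%`, перед обучением его нужно привести к числу.
- В `Bean Type` (628 значений) и `Broad Bean Origin` (55 значений) пропуски записаны как неразрывный пробел `'\xa0'`.
- Категориальные признаки `Company` (376 уникальных значений) и `Specific Bean Origin` (762) имеют высокую кардинальность.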

# Обработка пропусков и выбросов

## Обработка пропусков

In [60]:
# Cells holding only a non-breaking space ('\xa0') stand in for missing values;
# turn them into real NaN so isna() and the describe() counts see them.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
train_df = train_df.replace('\xa0', np.nan)

In [70]:
# Re-run the categorical summary now that the '\xa0' placeholders are NaN.
train_df[cat_cols].describe()

Unnamed: 0,Company,Specific Bean Origin,Cocoa Percent,Company Location,Bean Type,Broad Bean Origin
count,1255,1255,1255,1255,626,1199
unique,376,762,40,58,37,83
top,Soma,Madagascar,70%,U.S.A.,Trinitario,Venezuela
freq,35,39,461,521,285,144


In [71]:
# Missing values per column.
train_df.isna().sum()

Company                   0
Specific Bean Origin      0
REF                       0
Review                    0
Cocoa Percent             0
Company Location          0
Rating                    0
Bean Type               629
Broad Bean Origin        56
dtype: int64

## Обработка выбросов

# Построение новых признаков

In [22]:
# Pairwise correlation between the numeric columns (object columns are skipped via numeric_only=True).
train_df.corr(numeric_only=True)

Unnamed: 0,REF,Review,Rating
REF,1.0,0.984938,0.112792
Review,0.984938,1.0,0.111826
Rating,0.112792,0.111826,1.0


# Отбор признаков

# Построение модели

# Вывод