# Машинное обучение на датасете "Обзоры и рейтинги видеоигр"
### Цель: предсказать рейтинг игры
Датасет: https://www.kaggle.com/datasets/jahnavipaliwal/video-game-reviews-and-ratings

### Загрузка датасета

In [16]:
import pandas as pd

# Load the dataset from the CSV file (path is relative to the working directory)
file_path = 'video_game_reviews.csv'
data = pd.read_csv(file_path)

# Show the first few rows as a quick look at the loaded data
data.head()

Unnamed: 0,Game Title,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Developer,Publisher,Release Year,Genre,Multiplayer,Game Length (Hours),Graphics Quality,Soundtrack Quality,Story Quality,User Review Text,Game Mode,Min Number of Players
0,Grand Theft Auto V,36.4,All Ages,41.41,PC,No,Game Freak,Innersloth,2015,Adventure,No,55.3,Medium,Average,Poor,"Solid game, but too many bugs.",Offline,1
1,The Sims 4,38.3,Adults,57.56,PC,No,Nintendo,Electronic Arts,2015,Shooter,Yes,34.6,Low,Poor,Poor,"Solid game, but too many bugs.",Offline,3
2,Minecraft,26.8,Teens,44.93,PC,Yes,Bungie,Capcom,2012,Adventure,Yes,13.9,Low,Good,Average,"Great game, but the graphics could be better.",Offline,5
3,Bioshock Infinite,38.4,All Ages,48.29,Mobile,Yes,Game Freak,Nintendo,2015,Sports,No,41.9,Medium,Good,Excellent,"Solid game, but the graphics could be better.",Online,4
4,Half-Life: Alyx,30.1,Adults,55.49,PlayStation,Yes,Game Freak,Epic Games,2022,RPG,Yes,13.2,High,Poor,Good,"Great game, but too many bugs.",Offline,1


In [17]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47774 entries, 0 to 47773
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Game Title               47774 non-null  object 
 1   User Rating              47774 non-null  float64
 2   Age Group Targeted       47774 non-null  object 
 3   Price                    47774 non-null  float64
 4   Platform                 47774 non-null  object 
 5   Requires Special Device  47774 non-null  object 
 6   Developer                47774 non-null  object 
 7   Publisher                47774 non-null  object 
 8   Release Year             47774 non-null  int64  
 9   Genre                    47774 non-null  object 
 10  Multiplayer              47774 non-null  object 
 11  Game Length (Hours)      47774 non-null  float64
 12  Graphics Quality         47774 non-null  object 
 13  Soundtrack Quality       47774 non-null  object 
 14  Story Quality         

### Очистка данных

In [18]:
# Drop the text columns that are not needed for the analysis
columns_to_drop = ['Game Title', 'User Review Text', 'Developer', 'Publisher']
data_cleaned = data.drop(columns=columns_to_drop)

# Count the missing values in every remaining column
missing_values = data_cleaned.isna().sum()

# Collect the distinct values of each categorical (object-dtype) column
categorical_columns = data_cleaned.select_dtypes(include='object').columns
unique_values = {}
for column_name in categorical_columns:
    unique_values[column_name] = data_cleaned[column_name].unique()

# Display both summaries as the cell output
missing_values, unique_values


(User Rating                0
 Age Group Targeted         0
 Price                      0
 Platform                   0
 Requires Special Device    0
 Release Year               0
 Genre                      0
 Multiplayer                0
 Game Length (Hours)        0
 Graphics Quality           0
 Soundtrack Quality         0
 Story Quality              0
 Game Mode                  0
 Min Number of Players      0
 dtype: int64,
 {'Age Group Targeted': array(['All Ages', 'Adults', 'Teens', 'Kids'], dtype=object),
  'Platform': array(['PC', 'Mobile', 'PlayStation', 'Xbox', 'Nintendo Switch'],
        dtype=object),
  'Requires Special Device': array(['No', 'Yes'], dtype=object),
  'Genre': array(['Adventure', 'Shooter', 'Sports', 'RPG', 'Simulation', 'Strategy',
         'Fighting', 'Action', 'Party', 'Puzzle'], dtype=object),
  'Multiplayer': array(['No', 'Yes'], dtype=object),
  'Graphics Quality': array(['Medium', 'Low', 'High', 'Ultra'], dtype=object),
  'Soundtrack Quality': arra

### Преобразование категориальных данных в числовой формат с помощью Label Encoding

In [19]:
from sklearn.preprocessing import LabelEncoder

# Create one LabelEncoder per categorical column, keyed by column name, so the
# fitted string-to-integer mappings stay available after this cell.
label_encoders = {column: LabelEncoder() for column in categorical_columns}

# Fit each encoder on its column and overwrite the column with the integer codes.
# NOTE(review): LabelEncoder numbers classes in sorted (alphabetical) order, so
# the codes of the quality columns do not follow Low < Medium < High - confirm
# this is acceptable for the models trained on these features.
for column, encoder in label_encoders.items():
    data_cleaned[column] = encoder.fit_transform(data_cleaned[column])

# Inspect the encoded frame
data_cleaned.head()

Unnamed: 0,User Rating,Age Group Targeted,Price,Platform,Requires Special Device,Release Year,Genre,Multiplayer,Game Length (Hours),Graphics Quality,Soundtrack Quality,Story Quality,Game Mode,Min Number of Players
0,36.4,1,41.41,2,0,2015,1,0,55.3,2,0,3,0,1
1,38.3,0,57.56,2,0,2015,6,1,34.6,1,3,3,0,3
2,26.8,3,44.93,2,1,2012,1,1,13.9,1,2,0,0,5
3,38.4,1,48.29,0,1,2015,8,0,41.9,2,2,1,1,4
4,30.1,0,55.49,3,1,2022,5,1,13.2,0,3,2,0,1


### Разделение данных на тренировочную и тестовую выборки

In [20]:
from sklearn.model_selection import train_test_split

# Column the models are trained to predict
target_column = 'User Rating'

# The target is the rating column; the features are every other column
y = data_cleaned[target_column]
X = data_cleaned.drop(target_column, axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((38219, 13), (9555, 13), (38219,), (9555,))

### Обучение и оценка моделей
Используемые модели: 
- Линейная регрессия (Linear Regression)
- Случайный лес (Random Forest Regressor)
- Градиентный бустинг (Gradient Boosting Regressor)
- Ближайшие соседи (K-Nearest Neighbors Regressor)
- Гребневая регрессия (Ridge Regression)
    

In [21]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regressors to compare, keyed by the name used in the results table
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Ridge Regression": Ridge(),
}

# Metric functions evaluated on the test-set predictions of every model
metric_functions = {
    "MAE": mean_absolute_error,
    "MSE": mean_squared_error,
    "R2": r2_score,
}

# Fit each model on the training split and score it on the test split.
# NOTE(review): the features are passed in unscaled; K-Nearest Neighbors and
# Ridge are sensitive to feature scale - confirm this is intended.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        label: score(y_test, y_pred)
        for label, score in metric_functions.items()
    }

# Display the per-model metrics
results


{'Linear Regression': {'MAE': np.float64(0.9998381328025036),
  'MSE': np.float64(1.3381505203815343),
  'R2': 0.976890532857135},
 'Random Forest': {'MAE': np.float64(1.0359911041339611),
  'MSE': np.float64(1.4750624139194137),
  'R2': 0.9745261045981385},
 'Gradient Boosting': {'MAE': np.float64(1.0113913838804112),
  'MSE': np.float64(1.3833750178053434),
  'R2': 0.9761095190463941},
 'K-Nearest Neighbors': {'MAE': np.float64(1.0991334379905808),
  'MSE': np.float64(1.7219151020408163),
  'R2': 0.9702630310511933},
 'Ridge Regression': {'MAE': np.float64(0.9998381251146989),
  'MSE': np.float64(1.3381504835427038),
  'R2': 0.9768905334933308}}

### Сохранение модели и энкодера
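Linear Regression и Ridge Regression показали практически одинаковые лучшие результаты (разница в метриках меньше 1e-7, с минимальным перевесом Ridge), поэтому для финальной модели используем более простую Linear Regression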

In [23]:
import pickle

# Objects to persist, keyed by output file name: the fitted linear regression
# model and the dictionary of per-column label encoders.
artifacts = {
    'linear_regression_model.pkl': models["Linear Regression"],
    'label_encoders.pkl': label_encoders,
}

# Pickle each object into its own file, in the order listed above
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Модель и энкодер успешно сохранены!")


Модель и энкодер успешно сохранены!
