### 0. Загрузка данных и подготовка датасета

In [11]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the raw exam results and normalise column names to snake_case.
df = pd.read_csv('jamb_exam_results.csv')
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

# Drop the identifier column and replace missing values with zeros.
df = df.drop(columns=['student_id']).fillna(0)

# 60% train / 40% hold-out, then the hold-out is halved into validation and test.
train_data, temp_data = train_test_split(df, test_size=0.4, random_state=1)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=1)


def _feature_records(frame):
    """Return the rows of `frame`, without the target column, as a list of dicts."""
    return frame.drop(columns=['jamb_score']).to_dict(orient='records')


# The vectorizer is fitted on the training split only and reused for the others.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(_feature_records(train_data))
X_val = dv.transform(_feature_records(val_data))
X_test = dv.transform(_feature_records(test_data))

# Target variable for each split.
y_train = train_data['jamb_score']
y_val = val_data['jamb_score']
y_test = test_data['jamb_score']

### 1. Вопрос 1: DecisionTreeRegressor

In [3]:
# A depth-1 tree makes at most one split, so its root node shows the splitting feature.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# Index of the feature used at the root node; this cell treats -2 as "no split was made".
feature_index = dt.tree_.feature[0]
if feature_index == -2:
    print("Дерево не выполняет разбиение.")
else:
    # Map the column index back to the name assigned by the DictVectorizer.
    feature = dv.feature_names_[feature_index]
    print("Признак для разбиения данных:", feature)

Признак для разбиения данных: study_hours_per_week


### 2. Вопрос 2: RandomForestRegressor

In [4]:
# Baseline forest with 10 trees, scored by RMSE on the validation split.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)
val_pred = rf.predict(X_val)
rmse_rf = np.sqrt(
    mean_squared_error(y_val, val_pred)
)
print("Вопрос 2: RMSE на валидации:", rmse_rf)

Вопрос 2: RMSE на валидации: 43.157758977963624


### 3. Вопрос 3: Поиск лучшего n_estimators

In [5]:
# Validation RMSE for forests of 10, 20, ..., 200 trees (same seed for every run).
results = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    val_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    results.append((n, rmse))

# Print every result so the selection below can be checked by eye.
for n, rmse in results:
    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

# Pick the smallest n whose RMSE, at the 3 decimals printed above, equals the
# minimum. Comparing rounded values (instead of a 1e-3 tolerance on raw floats)
# guarantees the chosen n matches the printed table: a raw tolerance could
# accept an earlier n whose displayed RMSE is still one unit in the third
# decimal worse than the best.
min_rmse = min(rmse for _, rmse in results)
best_rounded_rmse = round(float(min_rmse), 3)
best_n = next(n for n, rmse in results if round(float(rmse), 3) == best_rounded_rmse)
print("\nВопрос 3: Лучшее значение n_estimators:", best_n)

n_estimators: 10, RMSE: 43.158
n_estimators: 20, RMSE: 41.790
n_estimators: 30, RMSE: 41.556
n_estimators: 40, RMSE: 41.076
n_estimators: 50, RMSE: 40.957
n_estimators: 60, RMSE: 40.774
n_estimators: 70, RMSE: 40.588
n_estimators: 80, RMSE: 40.503
n_estimators: 90, RMSE: 40.435
n_estimators: 100, RMSE: 40.365
n_estimators: 110, RMSE: 40.348
n_estimators: 120, RMSE: 40.302
n_estimators: 130, RMSE: 40.286
n_estimators: 140, RMSE: 40.263
n_estimators: 150, RMSE: 40.254
n_estimators: 160, RMSE: 40.200
n_estimators: 170, RMSE: 40.187
n_estimators: 180, RMSE: 40.136
n_estimators: 190, RMSE: 40.152
n_estimators: 200, RMSE: 40.138

Вопрос 3: Лучшее значение n_estimators: 180


### 4. Вопрос 4: Поиск лучшего max_depth

In [6]:
# For each candidate depth, average the validation RMSE over n_estimators = 10..200
# and keep the depth with the lowest mean.
depths = [10, 15, 20, 25]
mean_rmse_by_depth = {}

for depth in depths:
    rmse_per_n = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(max_depth=depth, n_estimators=n, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        val_pred = rf.predict(X_val)
        rmse_per_n.append(np.sqrt(mean_squared_error(y_val, val_pred)))
    mean_rmse_by_depth[depth] = np.mean(rmse_per_n)

# min() returns the first depth reaching the lowest mean, so ties go to the shallower tree.
best_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
best_rmse = mean_rmse_by_depth[best_depth]

print("Вопрос 4: Лучшее значение max_depth:", best_depth)

Вопрос 4: Лучшее значение max_depth: 10


### 5. Вопрос 5: Важность признаков

In [7]:
# Fit a 10-tree forest capped at depth 20 and report its highest-importance feature.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

feature_importances = rf.feature_importances_
# argmax gives the column index; the DictVectorizer maps it back to a feature name.
top_index = np.argmax(feature_importances)
most_important = dv.feature_names_[top_index]
print("Вопрос 5: Самый важный признак:", most_important)

Вопрос 5: Самый важный признак: study_hours_per_week


### 6. Вопрос 6: XGBoost

In [12]:
# Wrap the train and validation matrices in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Both splits are evaluated after every boosting round.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Options shared by both runs: up to 100 rounds, with early stopping after 10 rounds.
train_options = dict(num_boost_round=100, evals=watchlist, early_stopping_rounds=10)

# Model with eta=0.3
model_1 = xgb.train(xgb_params, dtrain, **train_options)

# Model with eta=0.1 (the shared params dict is updated in place)
xgb_params['eta'] = 0.1
model_2 = xgb.train(xgb_params, dtrain, **train_options)

# Lower best_score wins; a tie goes to eta=0.1.
if model_1.best_score < model_2.best_score:
    best_eta = 0.3
else:
    best_eta = 0.1
print("Вопрос 6: Лучший eta:", best_eta)

[0]	train-rmse:42.84835	eval-rmse:44.52338
[1]	train-rmse:39.96423	eval-rmse:42.83406
[2]	train-rmse:37.91231	eval-rmse:41.62607
[3]	train-rmse:36.51126	eval-rmse:41.25491
[4]	train-rmse:35.52212	eval-rmse:40.84075
[5]	train-rmse:34.77126	eval-rmse:40.71677
[6]	train-rmse:34.03898	eval-rmse:40.72669
[7]	train-rmse:33.62820	eval-rmse:40.68822
[8]	train-rmse:32.94729	eval-rmse:40.81273
[9]	train-rmse:32.27703	eval-rmse:40.84939
[10]	train-rmse:31.73818	eval-rmse:40.83759
[11]	train-rmse:31.31360	eval-rmse:40.80575
[12]	train-rmse:30.72949	eval-rmse:40.84238
[13]	train-rmse:30.11486	eval-rmse:40.96020
[14]	train-rmse:29.43538	eval-rmse:40.98775
[15]	train-rmse:29.23018	eval-rmse:41.04798
[16]	train-rmse:28.64113	eval-rmse:41.08375
[17]	train-rmse:28.42128	eval-rmse:41.15979
[0]	train-rmse:45.64414	eval-rmse:46.63724
[1]	train-rmse:44.26862	eval-rmse:45.58724
[2]	train-rmse:43.08569	eval-rmse:44.76209
[3]	train-rmse:42.05227	eval-rmse:44.02498
[4]	train-rmse:41.10533	eval-rmse:43.40640
[5]