In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files

In [2]:
# Start a Colab file upload; the result is kept because the next cell
# reads its keys as the names of the uploaded files.
uploaded = files.upload()

Saving jamb_exam_results.csv to jamb_exam_results.csv


In [5]:
# The first key of the upload result is used as the path of the CSV to load
# (the upload output above reports the file being saved under that name).
file_name = list(uploaded)[0]
# Comma is read_csv's default separator, so it does not need to be spelled out.
df = pd.read_csv(file_name)

In [16]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,0,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [17]:
# Dimensions of the frame as (rows, columns).
df.shape

(5000, 16)

In [7]:
# Normalise the column labels: lower-case them and turn spaces into underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

Подготовка датасета

In [22]:
# Drop the student_id column; errors='ignore' makes this a no-op when the
# column is not present.
df = df.drop(columns=['student_id'], errors='ignore')

In [20]:
# Replace every missing value with zero.
# Note: this applies to text columns as well, so a missing category becomes the
# number 0 (see parent_education_level in the preview output above).
df = df.fillna(value=0)

In [29]:
# Split the data into train / validation / test parts (60% / 20% / 20%).
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
# Stage 2: 25% of the remaining 80% (i.e. 20% of the total) becomes validation.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [18]:
# Report the shape of each split, one line per split.
for label, part in (
    ("Тренировочные", df_train),
    ("Валидационные", df_val),
    ("Тестовые", df_test),
):
    print(f"{label}: {part.shape}")

Тренировочные: (3000, 16)
Валидационные: (1000, 16)
Тестовые: (1000, 16)


In [30]:
from sklearn.feature_extraction import DictVectorizer

# Target vectors. They are looked up in the parent frame by each split's row
# labels instead of being read from the split frames, so this cell keeps working
# after the target column has been removed from the splits (i.e. it can be re-run).
# Assumes df has unique row labels, which holds for the default index that
# read_csv creates when no index column is requested.
y_train = df.loc[df_train.index, 'jamb_score'].values
y_val = df.loc[df_val.index, 'jamb_score'].values
y_test = df.loc[df_test.index, 'jamb_score'].values

# Remove the target from the feature frames. Unlike `del frame[col]`, drop() with
# errors='ignore' does nothing when the column is already gone, so a second run
# of this cell does not fail.
df_train = df_train.drop(columns='jamb_score', errors='ignore')
df_val = df_val.drop(columns='jamb_score', errors='ignore')
df_test = df_test.drop(columns='jamb_score', errors='ignore')

# Turn each frame into a list of per-row dicts and vectorise them.
# The vectoriser is fitted on the training rows only; validation and test rows
# are transformed with that same fitted mapping.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))
X_test = dv.transform(df_test.to_dict(orient='records'))

In [32]:
# Question 1: decision tree with max_depth=1
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

# With max_depth=1 the tree can make at most one split (at its root).
dt = DecisionTreeRegressor(
    max_depth=1,
    random_state=1,
)
dt.fit(X_train, y_train)

In [34]:
# Column names of the vectorised matrix, in column order.
names = dv.get_feature_names_out()

# Entry 0 of tree_.feature is the column index tested at the root node;
# map it back to the feature name.
splitting_feature = names[dt.tree_.feature[0]]

print(splitting_feature)

study_hours_per_week


In [36]:
# Question 2: random forest with n_estimators=10
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# RMSE on the validation split, printed with two decimals.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"{rmse:.2f}")

42.14


In [37]:
# Question 3: search for the optimal n_estimators
# Candidate forest sizes: 10, 20, ..., 200.
n_estimators_range = range(10, 201, 10)
# Validation RMSE per candidate; filled by the training loop in the following cell.
rmse_scores = []

In [39]:
# Train one forest per candidate size and record its validation RMSE.
# The list is reset here on every run: when it was only created in another cell,
# re-running this loop appended a second batch of scores, and the later
# zip(n_estimators_range, rmse_scores) would then read the oldest batch.
rmse_scores = []
for n in n_estimators_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

In [42]:
# Any score within 0.001 of the best RMSE counts as "no longer improving".
min_rmse = min(rmse_scores)
threshold = min_rmse + 0.001

# Walk the candidates in increasing order and stop at the first forest size
# whose RMSE is already within the threshold.
for n, rmse_val in zip(n_estimators_range, rmse_scores):
    if rmse_val > threshold:
        continue
    stabilization_point = n
    break

# Snap the result to the nearest of the offered answer options
# (on a tie, min() keeps the option listed first).
options = [10, 25, 80, 200]
closest_stabilization = min(options, key=lambda candidate: abs(candidate - stabilization_point))
print(closest_stabilization)

80


In [43]:
# Question 4: search for the optimal max_depth
# Tree depths to compare.
max_depth_values = [10, 15, 20, 25]
# Mean validation RMSE per depth; filled by the loop in the following cell.
depth_rmse_means = []

In [44]:
# Start from an empty list on every run: when the accumulator was only created in
# another cell, re-running this cell appended a second set of means, and np.argmin
# could then select a stale entry or an index past the end of max_depth_values.
depth_rmse_means = []

for depth in max_depth_values:
    # Validation RMSE of this depth for every forest size in n_estimators_range.
    depth_rmses = []
    for n in n_estimators_range:
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        depth_rmses.append(rmse)

    # Summarise the depth by its RMSE averaged over all forest sizes.
    mean_rmse = np.mean(depth_rmses)
    depth_rmse_means.append(mean_rmse)
    print(f"max_depth={depth}: средний RMSE={mean_rmse:.3f}")

# The depth with the lowest mean RMSE wins; argmin returns the first one on ties.
best_depth_index = np.argmin(depth_rmse_means)
best_depth = max_depth_values[best_depth_index]
print(f"Лучшее значение max_depth: {best_depth}")

max_depth=10: средний RMSE=40.392
max_depth=15: средний RMSE=40.735
max_depth=20: средний RMSE=40.740
max_depth=25: средний RMSE=40.788
Лучшее значение max_depth: 10


In [45]:
# Question 5: feature importance
# Forest whose per-feature importances are inspected in the next cell.
rf_final = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf_final.fit(X_train, y_train)

In [47]:
# Pair every vectorised feature name with the importance the forest assigned to it.
importances = rf_final.feature_importances_
importance_dict = {feat_name: weight for feat_name, weight in zip(names, importances)}

# Order the (name, importance) pairs from most to least important.
sorted_features = sorted(
    importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the ten highest-ranked features with four decimals.
for feat_name, weight in sorted_features[:10]:
    print(f"{feat_name}: {weight:.4f}")

study_hours_per_week: 0.2484
attendance_rate: 0.1497
distance_to_school: 0.1365
teacher_quality: 0.0827
age: 0.0693
assignments_completed: 0.0315
socioeconomic_status=High: 0.0257
parent_involvement=High: 0.0229
it_knowledge=High: 0.0177
parent_education_level=Secondary: 0.0170


Ответы:
1) Какой признак используется для разбиения данных? -- study_hours_per_week
2) Какое значение RMSE у этой модели на валидационных данных? -- 42.14
3) После какого значения n_estimators RMSE перестает улучшаться? -- 80
4) Какое значение max_depth оказалось лучшим по среднему RMSE? -- 10
5) Какой признак оказался самым важным? -- study_hours_per_week