<a href="https://colab.research.google.com/github/Toan02Ky-UIT/CodeProject/blob/main/model_chinh_sua.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /gdrive; the CSV inputs read later in this notebook
# live under /gdrive/MyDrive. Only works inside Google Colab.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
# Raw input table; later cells read submission-level columns from it
# (username, problem_id, status, judgement, created_at, ...).
df = pd.read_csv('/gdrive/MyDrive/Project/Bai2/annonimized.csv')
# Score table; merged onto the per-student features later via its
# `hash` (student id) and `CK` (regression target) columns.
diem_ck = pd.read_csv('/gdrive/MyDrive/Project/Bai3/ck-public.csv')


In [None]:
# Four CSV headers are raw SQL expressions (e.g. "concat('it001',`problem_id`)");
# replace each with the plain column name used by the rest of the notebook.
df = df.rename(
    {
        "concat('it001',`assignment_id`)": "assignment_id",
        "concat('it001',`problem_id`)": "problem_id",
        "concat('it001', username)": "username",
        "concat('it001',`language_id`)": "language_id",
    },
    axis="columns",
)


In [None]:
# Work on a string view of the timestamp so the regex below can be applied
# (missing values become the text 'nan', which simply fails to match).
created_at_text = df['created_at'].astype(str)
df['created_at'] = created_at_text

# Keep the first "dd-dd" run found in each timestamp; unmatched rows get NaN.
# NOTE(review): day_of_year() reads this as month-day, so the timestamps
# presumably carry no 4-digit year prefix ("2023-10-05" would yield "23-10").
# Confirm against the data.
df['day_month'] = created_at_text.str.extract(r'(\d{2}-\d{2})', expand=False)

def day_of_year(day_month):
    """Convert a ``'MM-DD'`` string into a 1-based day number of a non-leap year.

    Parameters
    ----------
    day_month : str or missing value
        Month and day as two dash-separated integers, e.g. ``'03-15'``.

    Returns
    -------
    int or None
        ``1`` for ``'01-01'`` up to ``365`` for ``'12-31'``. ``None`` when the
        input is missing, is not a string, cannot be parsed as two integers,
        or names a month/day that does not exist.

    Notes
    -----
    The cumulative table assumes a non-leap year. ``'02-29'`` is still
    accepted (it may legitimately occur in the data) and maps to 60, the
    same value as ``'03-01'``.
    """
    # Missing values (None / NaN) and any other non-text input have no day number.
    if not isinstance(day_month, str):
        return None

    try:
        month, day = map(int, day_month.split('-'))
    except ValueError:
        # Not exactly two dash-separated integers.
        return None

    # Longest valid day for each month (February allows the 29th, see Notes).
    month_lengths = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    # Reject month 0 explicitly: `table[month - 1]` would otherwise silently
    # wrap around to December via negative indexing.
    if not 1 <= month <= 12:
        return None
    if not 1 <= day <= month_lengths[month - 1]:
        return None

    # Days elapsed before the first day of each month in a non-leap year.
    days_before_month = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334)
    return days_before_month[month - 1] + day


# Day number within the year for every submission (missing where day_month is unusable).
df['day_in_year'] = df['day_month'].map(day_of_year)


In [None]:
import json

def safe_parse_judgement(j):
    """Best-effort conversion of a raw ``judgement`` cell into a dict.

    Parameters
    ----------
    j : dict, str, bytes, or anything else
        A dict is returned unchanged. Text is decoded as JSON. Every other
        value (None, NaN, numbers, lists, ...) is treated as "no judgement".

    Returns
    -------
    dict
        The parsed JSON object, or ``{}`` when the value is missing, is not
        valid JSON, or decodes to something other than a JSON object.
    """
    if isinstance(j, dict):
        return j
    # json.loads only accepts text; anything else (NaN, None, numbers, lists)
    # cannot hold a judgement.
    if not isinstance(j, (str, bytes, bytearray)):
        return {}
    try:
        parsed_judgement = json.loads(j)
    except (ValueError, TypeError, RecursionError):
        # Malformed, undecodable or absurdly nested JSON. json.JSONDecodeError
        # and UnicodeDecodeError are ValueError subclasses. Unlike a bare
        # `except:`, KeyboardInterrupt / SystemExit still propagate.
        return {}
    # Valid JSON that is not an object (list, number, string, null) is unusable here.
    return parsed_judgement if isinstance(parsed_judgement, dict) else {}

# Parse every raw judgement cell into a dict ({} when missing or unparsable).
df['judgement_parsed'] = df['judgement'].apply(safe_parse_judgement)


def _verdicts_of(parsed):
    """Return the ``verdicts`` mapping of a parsed judgement, or ``{}``.

    ``{}`` is returned when the key is absent or its value is not a dict
    (e.g. JSON ``null``, a list or a number), so callers can iterate or call
    ``.get`` on the result without further type checks.
    """
    verdicts = parsed.get('verdicts')
    return verdicts if isinstance(verdicts, dict) else {}


# Features derived from the judgement
# True when any verdict name contains "fatal error" (case-insensitive).
df['has_fatal_error'] = df['judgement_parsed'].apply(
    lambda parsed: any(
        isinstance(name, str) and 'fatal error' in name.lower()
        for name in _verdicts_of(parsed)
    )
)

# Value stored under the "WRONG" verdict (0 when absent).
df['verdict_WRONG_count'] = df['judgement_parsed'].apply(
    lambda parsed: _verdicts_of(parsed).get('WRONG', 0)
)

# Value stored under the "Time Limit Exceeded" verdict (0 when absent).
df['time_limit_exceeded_count'] = df['judgement_parsed'].apply(
    lambda parsed: _verdicts_of(parsed).get('Time Limit Exceeded', 0)
)

In [None]:
# Score of each submission: pre_score weighted by coefficient, divided by 10000.
# NOTE(review): the 10000 divisor presumably rescales both inputs so that a
# perfect submission scores 10 (later cells compare against 10). Confirm.
df['final_score'] = df['pre_score'].mul(df['coefficient']).div(10000)

# Integer-encode the language identifier
le = LabelEncoder()
le.fit(df['language_id'])
df['language_id_encoded'] = le.transform(df['language_id'])


In [None]:
def _dominant_value(values):
    """Return the most frequent value in `values`, or -1 when there is no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else -1


# One row of aggregate features per student (`username`).
feature_df = (
    df.assign(
        # problem_id kept only on rows that reached the maximum score (10), NaN
        # elsewhere, so that `nunique` below counts distinct *problems* solved
        # with full marks. Counting rows instead would count a problem once per
        # perfect resubmission and let max_score_problems exceed
        # distinct_problems.
        _max_score_problem_id=df['problem_id'].where(df['final_score'] == 10)
    )
    .groupby('username')
    .agg(
        total_submissions=('problem_id', 'count'),
        final_submissions=('is_final', 'sum'),
        distinct_problems=('problem_id', 'nunique'),
        max_score_problems=('_max_score_problem_id', 'nunique'),
        mean_final_score=('final_score', 'mean'),
        mean_coefficient=('coefficient', 'mean'),
        # Share of the student's submissions with each status.
        compilation_error_rate=('status', lambda x: (x == 'Compilation Error').mean()),
        syntax_error_rate=('status', lambda x: (x == 'Syntax Error').mean()),
        pending_rate=('status', lambda x: (x == 'Pending').mean()),
        # True if any submission had a fatal-error verdict.
        has_fatal_error=('has_fatal_error', 'max'),
        verdict_WRONG_count=('verdict_WRONG_count', 'sum'),
        time_limit_exceeded_count=('time_limit_exceeded_count', 'sum'),
        # Despite the name this is the *number* of distinct languages used.
        used_multiple_languages=('language_id', 'nunique'),
        dominant_language=('language_id_encoded', _dominant_value),
        mean_day=('day_in_year', 'mean'),
        std_day=('day_in_year', 'std'),
    )
    .reset_index()
)

# Add active days: span between the earliest and latest submission day.
first_last_time = df.groupby('username').agg(
    first_day=('day_in_year', 'min'),
    last_day=('day_in_year', 'max')
)
# last_day is a max and first_day a min, so the difference is never negative;
# the former "+365 if negative" wrap-around branch could not trigger and was
# removed.
# NOTE(review): if the course spans New Year, min/max of day_in_year overstate
# the span (e.g. days 350 and 5 give 345). Handling that needs the year, which
# is not available here.
first_last_time['active_days'] = first_last_time['last_day'] - first_last_time['first_day']

feature_df = feature_df.merge(first_last_time[['active_days']], on='username', how='left')

In [None]:
# Day-of-year after which a submission counts as "late".
LATE_DAY_THRESHOLD = 270

# Per-student Series are attached with `username.map(...)` instead of `merge`:
# the result is the same on the first run (the Series are indexed by unique
# usernames), but re-running this cell overwrites the columns in place rather
# than creating `_x` / `_y` duplicates and then failing with a KeyError.

# Number of distinct days on which the student submitted
active_day_count = df.groupby('username')['day_in_year'].nunique().rename('unique_active_days')
feature_df['unique_active_days'] = feature_df['username'].map(active_day_count)

# Mean submission day, normalized by the length of a year
feature_df['mean_day_normalized'] = feature_df['mean_day'] / 365

# Late submissions (after LATE_DAY_THRESHOLD) and their share of all submissions
late_submissions = (
    df[df['day_in_year'] > LATE_DAY_THRESHOLD]
    .groupby('username')
    .size()
    .rename('late_submissions')
)
# Students without any late submission are absent from the Series -> 0.
feature_df['late_submissions'] = feature_df['username'].map(late_submissions).fillna(0)
feature_df['late_submission_rate'] = feature_df['late_submissions'] / feature_df['total_submissions']

# Number of final submissions that did not reach the maximum score
final_below_max = (
    df[(df['is_final'] == 1) & (df['final_score'] < 10)]
    .groupby('username')
    .size()
    .rename('final_below_max_count')
)
feature_df['final_below_max_count'] = feature_df['username'].map(final_below_max).fillna(0)

# Maximum-score count relative to the number of distinct problems attempted
feature_df['max_score_rate'] = feature_df['max_score_problems'] / feature_df['distinct_problems']

# 1 if the student used more than one language, else 0
feature_df['uses_multiple_languages_flag'] = (feature_df['used_multiple_languages'] > 1).astype(int)

# Total errors: the two status rates are turned back into counts, then the
# verdict counts and the fatal-error flag (adds 0 or 1) are added on top.
feature_df['total_errors'] = (
    feature_df['compilation_error_rate'] * feature_df['total_submissions'] +
    feature_df['syntax_error_rate'] * feature_df['total_submissions'] +
    feature_df['verdict_WRONG_count'] +
    feature_df['time_limit_exceeded_count'] +
    feature_df['has_fatal_error']
)

# Errors per submission
feature_df['error_rate'] = feature_df['total_errors'] / feature_df['total_submissions']

# Average number of submissions per distinct problem
feature_df['submissions_per_problem'] = feature_df['total_submissions'] / feature_df['distinct_problems']


In [None]:
# Attach the score to predict: diem_ck's `hash` column identifies the student
# and `CK` holds the score. Any `target` column left over from a previous run
# of this cell is dropped first; otherwise the merge would produce
# target_x / target_y and the lines below would raise KeyError.
feature_df = feature_df.drop(columns=['target'], errors='ignore').merge(
    diem_ck.rename(columns={'hash': 'username', 'CK': 'target'}),
    on='username',
    how='left',
)

# Clean the raw score text: strip non-breaking spaces and surrounding
# whitespace, then convert to a number. Anything that is not numeric
# (including students without a score) becomes NaN.
feature_df['target'] = (
    feature_df['target'].astype(str).str.replace('\xa0', '', regex=False).str.strip()
)
feature_df['target'] = pd.to_numeric(feature_df['target'], errors='coerce')

# Labelled students are used for modelling; students without a score are kept
# aside in test_df (not used again in the visible part of the notebook).
train_df = feature_df[feature_df['target'].notnull()]
test_df = feature_df[feature_df['target'].isnull()]

X = train_df.drop(columns=['username', 'target'])
y = train_df['target']

# Hold out 20% of the labelled students for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random-forest regressor with fixed hyper-parameters and seed.
model = RandomForestRegressor(
    n_estimators=91,
    max_depth=10,
    min_samples_split=13,
    random_state=42,
).fit(X_train, y_train)

# Step 13: evaluate on the held-out split
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R² trên tập test: {r2:.4f}')


R² trên tập test: 0.2601


In [None]:
# CatBoost regressor trained on the same split as the random forest.
cat_model = CatBoostRegressor(
    **{
        'iterations': 500,
        'learning_rate': 0.05,
        'depth': 6,
        'verbose': 0,  # no training log output
        'random_seed': 42,
    }
)
cat_model.fit(X_train, y_train)

# Score the held-out split.
y_pred_cat = cat_model.predict(X_test)
r2_cat = r2_score(y_true=y_test, y_pred=y_pred_cat)
print(f'R² CatBoost: {r2_cat:.4f}')


R² CatBoost: 0.2761


In [None]:
# Simple ensemble: unweighted mean of the random-forest and CatBoost predictions.
avg_pred = 0.5 * (y_pred + y_pred_cat)

# Note: this reuses (overwrites) the `r2` variable set by the random-forest cell.
r2 = r2_score(y_true=y_test, y_pred=avg_pred)
print(f"R² của Averaging: {r2:.4f}")

R² của Averaging: 0.2874
