In [None]:
import sys  # System-specific parameters and functions
import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
from datetime import datetime  # Basic date and time types
import warnings  # Warning control
warnings.filterwarnings('ignore')  # Ignore warnings

In [None]:
# Load the raw submission log.
df = pd.read_csv('/kaggle/input/ml-data/annonimized.csv')
# Discard only the rows that have both is_final == 0 and pre_score == 10000;
# every other row is kept.
is_discarded = (df['is_final'] == 0) & (df['pre_score'] == 10000)
df = df[~is_discarded]
df.info()

In [None]:
# Replace the SQL-expression column headers with plain names in one pass.
df = df.rename(columns={
    "concat('it001',`assignment_id`)": 'assignment_id',
    "concat('it001',`problem_id`)": 'problem_id',
    "concat('it001', username)": 'username',
})

In [None]:
# The language id and the update timestamp are not used by any later cell.
df = df.drop(columns=["concat('it001',`language_id`)", 'updated_at'])

In [None]:
def calculate_frequency_vector(student_df):
    """Count one student's submissions per hour of the day.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student. Must contain a ``created_at`` column holding
        either year-less strings in ``MM-DD HH:MM:SS`` form or already-parsed
        datetimes.

    Returns
    -------
    numpy.ndarray
        Float array of length 24; element ``h`` is the number of rows whose
        timestamp falls in hour ``h``. Rows whose timestamp cannot be parsed
        are ignored.
    """
    created = student_df['created_at']
    if pd.api.types.is_datetime64_any_dtype(created):
        # Column was already converted (e.g. the cell is re-run later on).
        hours = created.dt.hour
    else:
        # The raw strings carry no year. Parsing them with a year-less format
        # falls back to the year 1900, which is not a leap year, so every
        # "02-29 ..." submission became NaT and was silently dropped.
        # Prefixing a leap year keeps those rows; only the hour is used anyway.
        hours = pd.to_datetime(
            '2000-' + created.astype(str),
            format='%Y-%m-%d %H:%M:%S',
            errors='coerce',
        ).dt.hour

    # NaT rows surface as NaN hours; drop them before counting.
    valid_hours = hours.dropna().astype(int).to_numpy()
    return np.bincount(valid_hours, minlength=24).astype(float)

# One 24-slot hour histogram per student, keyed by username
# (usernames are visited in order of first appearance).
frequency_vectors = {
    student: calculate_frequency_vector(df[df["username"] == student])
    for student in df["username"].unique()
}

In [None]:
# Turn the {username: 24-vector} mapping into a wide frame with an explicit
# "username" column so it can be merged back onto the submissions.
frequency_df = (
    pd.DataFrame.from_dict(
        frequency_vectors,
        orient='index',
        columns=[f'hour_{i}' for i in range(24)],
    )
    .rename_axis("username")
    .reset_index()
)

In [None]:
# Broadcast each student's hour histogram onto all of that student's rows.
df = pd.merge(df, frequency_df, on="username")

In [None]:
# Placeholder year glued in front of the year-less "created_at" strings.
# 2024 is a leap year, so "02-29" timestamps stay valid dates.
fixed_year = 2024
df['created_at'] = df['created_at'].map(lambda stamp: f"{fixed_year}-{stamp}")

In [None]:
# Parse the "<year>-MM-DD HH:MM:SS" strings into real timestamps.
df = df.assign(created_at=pd.to_datetime(df['created_at']))
def adjust_year(date):
    """Return ``date`` moved one year back when its month is September or later.

    Timestamps in January through August are returned unchanged.
    """
    falls_in_sep_to_dec = date.month >= 9
    return date.replace(year=date.year - 1) if falls_in_sep_to_dec else date

# Run adjust_year over every timestamp in the 'created_at' column.
df['created_at'] = df['created_at'].map(adjust_year)

# Xử lý theo assignment id

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the set of assignment ids, then map each id to its integer code.
label_encoder = LabelEncoder()
label_encoder.fit(df['assignment_id'])
df['assignment_id_encoded'] = label_encoder.transform(df['assignment_id'])

# Show the original ids next to their codes.
print(df[['username', 'assignment_id', 'assignment_id_encoded']])

In [None]:
def calculate_count_assignment_vector(student_df, n_assignments=203):
    """Count one student's rows per encoded assignment.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs an integer ``assignment_id_encoded``
        column.
    n_assignments : int, default 203
        Length of the returned vector. The default keeps the size that was
        previously hard-coded; pass ``len(label_encoder.classes_)`` to follow
        the data instead.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_assignments``; slot ``k`` holds the number
        of rows with ``assignment_id_encoded == k``.

    Raises
    ------
    IndexError
        If an encoded id is not smaller than ``n_assignments``.
    """
    rows_per_assignment = student_df.groupby('assignment_id_encoded').size()
    count_assignment_vector = np.zeros(n_assignments)
    slots = rows_per_assignment.index.to_numpy(dtype=int)
    count_assignment_vector[slots] = rows_per_assignment.to_numpy()
    return count_assignment_vector

In [None]:
def calculate_status_assignment_vector(student_df, n_assignments=203):
    """Count one student's rows per assignment whose status is not ``'SCORE'``.

    The previous implementation wrote each (assignment, status) count into the
    assignment's slot with ``=``, so when an assignment had several different
    non-``'SCORE'`` statuses only the last one written survived. The counts
    are now summed over all such statuses.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs ``assignment_id_encoded`` and
        ``status`` columns.
    n_assignments : int, default 203
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_assignments``; slot ``k`` holds the number
        of rows of assignment ``k`` with a non-missing status other than
        ``'SCORE'``.
    """
    status = student_df['status']
    # Missing statuses were never counted (value_counts skips them); keep that.
    is_non_score = status.notna() & (status != 'SCORE')
    rows_per_assignment = student_df[is_non_score].groupby('assignment_id_encoded').size()

    status_counts_vector = np.zeros(n_assignments)
    slots = rows_per_assignment.index.to_numpy(dtype=int)
    status_counts_vector[slots] = rows_per_assignment.to_numpy()
    return status_counts_vector

In [None]:
def calculate_count_problem_vector(student_df, n_assignments=203):
    """Count the distinct problems one student touched in each assignment.

    NOTE(review): a second function with this exact name is defined further
    down in this notebook (it counts rows per problem). Whichever cell ran
    last wins, so re-running cells out of order silently swaps the
    implementation. Consider giving the two functions distinct names.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs ``assignment_id_encoded`` and
        ``problem_id`` columns.
    n_assignments : int, default 203
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_assignments``; slot ``k`` holds the number
        of unique ``problem_id`` values among the rows of assignment ``k``.
    """
    distinct_problems = student_df.groupby('assignment_id_encoded')['problem_id'].nunique()
    problem_counts_vector = np.zeros(n_assignments)
    slots = distinct_problems.index.to_numpy(dtype=int)
    problem_counts_vector[slots] = distinct_problems.to_numpy()
    return problem_counts_vector
    

In [None]:
# Per-student assignment-level feature vectors, keyed by username
# (one dict per feature; filled by the loop in the next cell).
count_assignment_vector = {}
status_counts_vector = {}
problem_counts_vector = {}


In [None]:
# Fill the three assignment-level dictionaries, one student at a time.
for username in df["username"].unique():
    student_df = df.loc[df["username"] == username]
    for store, build in (
        (count_assignment_vector, calculate_count_assignment_vector),
        (status_counts_vector, calculate_status_assignment_vector),
        (problem_counts_vector, calculate_count_problem_vector),
    ):
        store[username] = build(student_df)


In [None]:
# Each {username: vector} mapping becomes a wide frame with one row per
# student and an explicit "username" column for merging.
count_assignment_df = (
    pd.DataFrame.from_dict(
        count_assignment_vector,
        orient='index',
        columns=[f'count_assignment_vector{i}' for i in range(203)],
    )
    .rename_axis("username")
    .reset_index()
)

status_counts_df = (
    pd.DataFrame.from_dict(
        status_counts_vector,
        orient='index',
        columns=[f'status_counts_vector{i}' for i in range(203)],
    )
    .rename_axis("username")
    .reset_index()
)

problem_counts_df = (
    pd.DataFrame.from_dict(
        problem_counts_vector,
        orient='index',
        columns=[f'problem_counts_vector{i}' for i in range(203)],
    )
    .rename_axis("username")
    .reset_index()
)


In [None]:

# Attach the three assignment-level feature blocks to every row of the student.
df = (
    df.merge(count_assignment_df, on="username")
      .merge(status_counts_df, on="username")
      .merge(problem_counts_df, on="username")
)


# Xử lý theo problem id

In [None]:
label_encoder1 = LabelEncoder()

# Fit on the problem_id column and map every id to its integer code.
df['problem_id_encoded'] = label_encoder1.fit_transform(df['problem_id'])

# Show each problem_id next to its code. This used to print assignment_id
# beside problem_id_encoded (copied from the assignment cell), which did not
# let the reader check the problem mapping at all.
print(df[['username', 'problem_id', 'problem_id_encoded']])

In [None]:
def calculate_problem_vector(student_df, n_problems=468):
    """Build a 0/1 indicator of the problems one student has rows for.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs an integer ``problem_id_encoded``
        column.
    n_problems : int, default 468
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_problems`` with 1 at every encoded problem
        id present in ``student_df`` and 0 elsewhere.
    """
    seen = np.asarray(student_df['problem_id_encoded'].unique(), dtype=int)
    problem_vector = np.zeros(n_problems)
    problem_vector[seen] = 1
    return problem_vector

In [None]:
def calculate_count_problem_vector(student_df, n_problems=468):
    """Count one student's rows per encoded problem.

    NOTE(review): this definition replaces an earlier function of the same
    name in this notebook (distinct problems per assignment). The earlier one
    is no longer reachable once this cell has run, so re-running the
    assignment-feature cell afterwards would use this version by mistake.
    Consider renaming one of the two.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs an integer ``problem_id_encoded``
        column.
    n_problems : int, default 468
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_problems``; slot ``k`` holds the number of
        rows with ``problem_id_encoded == k``.
    """
    rows_per_problem = student_df.groupby('problem_id_encoded').size()
    count_problem_vector = np.zeros(n_problems)
    slots = rows_per_problem.index.to_numpy(dtype=int)
    count_problem_vector[slots] = rows_per_problem.to_numpy()
    return count_problem_vector

In [None]:
def calculate_time_problem_vector(student_df, n_problems=468):
    """Measure, per problem, the whole hours between a student's first and last row.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs an integer ``problem_id_encoded``
        column and a datetime ``created_at`` column.
    n_problems : int, default 468
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_problems``; slot ``k`` holds
        ``(max(created_at) - min(created_at))`` of problem ``k`` in seconds,
        floor-divided by 3600. Problems with a single row get 0.
    """
    stamps = student_df.groupby('problem_id_encoded')['created_at']
    # Built-in max/min aggregations replace the per-group Python lambda.
    span_hours = (stamps.max() - stamps.min()).dt.total_seconds() // 3600

    time_problem_vector = np.zeros(n_problems)
    slots = span_hours.index.to_numpy(dtype=int)
    time_problem_vector[slots] = span_hours.to_numpy(dtype=float)
    return time_problem_vector

In [None]:
def calculate_count_problem_0_vector(student_df, n_problems=468):
    """Count one student's rows per problem that have ``is_final == 0``.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs ``problem_id_encoded`` and
        ``is_final`` columns.
    n_problems : int, default 468
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_problems``; slot ``k`` holds the number of
        rows of problem ``k`` whose ``is_final`` equals 0.
    """
    non_final_rows = student_df[student_df['is_final'] == 0]
    rows_per_problem = non_final_rows.groupby('problem_id_encoded').size()

    count_problem_0_vector = np.zeros(n_problems)
    slots = rows_per_problem.index.to_numpy(dtype=int)
    count_problem_0_vector[slots] = rows_per_problem.to_numpy()
    return count_problem_0_vector

In [None]:
def calculate_mean_prescrore_problem_vector(student_df, n_problems=468):
    """Compute, per problem, the natural log of one student's mean ``pre_score``.

    Parameters
    ----------
    student_df : pandas.DataFrame
        Rows of a single student; needs ``problem_id_encoded`` and a numeric
        ``pre_score`` column.
    n_problems : int, default 468
        Length of the returned vector (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_problems``; slot ``k`` holds
        ``log(mean(pre_score))`` for problem ``k`` when that mean is strictly
        positive, and 0 otherwise (non-positive or missing mean, or no rows
        for the problem).
    """
    mean_scores = student_df.groupby('problem_id_encoded')['pre_score'].mean()
    values = mean_scores.to_numpy(dtype=float)

    # Take the log only where it is defined; everything else stays at 0.
    logged = np.zeros(len(values))
    is_positive = values > 0
    logged[is_positive] = np.log(values[is_positive])

    mean_prescrore_problem_vector = np.zeros(n_problems)
    slots = mean_scores.index.to_numpy(dtype=int)
    mean_prescrore_problem_vector[slots] = logged
    return mean_prescrore_problem_vector

In [None]:
# Per-student problem-level feature vectors, keyed by username
# (one dict per feature; filled by the loop in the next cell).
problem_vector = {}
count_problem_vector = {}
time_problem_vector = {}
count_problem_0_vector = {}
mean_prescrore_problem_vector = {}


In [None]:
# Fill the five problem-level dictionaries, one student at a time.
for username in df["username"].unique():
    student_df = df.loc[df["username"] == username]
    for store, build in (
        (problem_vector, calculate_problem_vector),
        (count_problem_vector, calculate_count_problem_vector),
        (time_problem_vector, calculate_time_problem_vector),
        (count_problem_0_vector, calculate_count_problem_0_vector),
        (mean_prescrore_problem_vector, calculate_mean_prescrore_problem_vector),
    ):
        store[username] = build(student_df)

In [None]:
# Each {username: vector} mapping becomes a wide frame with one row per
# student and an explicit "username" column for merging.
problem_df = (
    pd.DataFrame.from_dict(
        problem_vector,
        orient='index',
        columns=[f'problem_vector{i}' for i in range(468)],
    )
    .rename_axis("username")
    .reset_index()
)

count_problem_df = (
    pd.DataFrame.from_dict(
        count_problem_vector,
        orient='index',
        columns=[f'count_problem_vector{i}' for i in range(468)],
    )
    .rename_axis("username")
    .reset_index()
)

time_problem_df = (
    pd.DataFrame.from_dict(
        time_problem_vector,
        orient='index',
        columns=[f'time_problem_vector{i}' for i in range(468)],
    )
    .rename_axis("username")
    .reset_index()
)

count_problem_0_df = (
    pd.DataFrame.from_dict(
        count_problem_0_vector,
        orient='index',
        columns=[f'count_problem_0_vector{i}' for i in range(468)],
    )
    .rename_axis("username")
    .reset_index()
)

mean_prescrore_problem_df = (
    pd.DataFrame.from_dict(
        mean_prescrore_problem_vector,
        orient='index',
        columns=[f'mean_prescrore_problem_vector{i}' for i in range(468)],
    )
    .rename_axis("username")
    .reset_index()
)

In [None]:
# Attach the five problem-level feature blocks to every row of the student.
df = (
    df.merge(problem_df, on="username")
      .merge(count_problem_df, on="username")
      .merge(time_problem_df, on="username")
      .merge(count_problem_0_df, on="username")
      .merge(mean_prescrore_problem_df, on="username")
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds them.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Only columns whose dtype is int16/int32/int64 or
        float16/float32/float64 are touched; all others are left as they are.
    verbose : bool, default True
        Print the memory footprint after the conversion and the relative
        saving. This flag used to be ignored (the report was always printed);
        ``verbose=False`` now really is silent.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    A candidate dtype is accepted only if the column's min and max lie
    strictly inside its representable range. Float columns may end up as
    float16, which keeps roughly three significant decimal digits, so large
    or fine-grained values are rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # First integer width whose open range contains the data; if none
            # qualifies the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits (this includes all-NaN
                # columns, whose comparisons are False).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the numeric columns of the wide feature frame before modelling
# (reduce_mem_usage changes df in place and returns it).
df = reduce_mem_usage(df)

In [None]:
!pip install lightgbm catboost

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
# Strip the raw per-submission columns, leaving username plus the engineered features.
# NOTE(review): 'assignment_id_encoded' and 'problem_id_encoded' are not in this
# list, so they stay in the frame and end up as model features — confirm that
# is intended.
train_term = df.drop(columns=['assignment_id','problem_id','is_final','status','pre_score','coefficient','created_at','judgement'])

In [None]:
# Same column selection as train_term: the prediction frame is derived from the
# very same submissions and is reduced to one row per student in the next cell.
# NOTE(review): 'assignment_id_encoded' and 'problem_id_encoded' are not in this
# list, so they stay in the frame — confirm that is intended.
test_term = df.drop(columns=['assignment_id','problem_id','is_final','status','pre_score','coefficient','created_at','judgement'])

In [None]:
# Keep the first row of every student.
test_term = test_term.drop_duplicates(subset='username', keep='first')
print(f"Number of rows: {len(test_term)}")

In [None]:
# Read the four public grade files, one per target.
qt_train, th_train, ck_train, tbtl_train = map(pd.read_csv, (
    '/kaggle/input/ml-data/qt-public.csv',
    '/kaggle/input/ml-data/th-public.csv',
    '/kaggle/input/ml-data/ck-public.csv',
    '/kaggle/input/ml-data/tbtl-public.csv',
))

# Give the QT grade column the same name as its target key.
qt_train = qt_train.rename({"diemqt": "QT"}, axis="columns")

In [None]:
# Remove every row that has a missing value in any column.
qt_train = qt_train.dropna()
th_train = th_train.dropna()
ck_train = ck_train.dropna()
tbtl_train = tbtl_train.dropna()

In [None]:
# Grade frame per target; each key is also the name of the grade column.
target_data = dict(
    QT=qt_train,
    TH=th_train,
    CK=ck_train,
    TBTL=tbtl_train,
)

In [None]:
# Build one training frame per target: a single feature row per student plus
# that student's grade as a float column named after the target.
processed_data = {}

# Only the first row per username survived the old post-merge de-duplication,
# so that row is picked *before* merging. This avoids materialising a merged
# copy of the whole submission-level frame for each of the targets.
features_per_student = train_term.drop_duplicates(subset='username', keep='first')

for target, data in target_data.items():
    grades = data.drop_duplicates(subset='username', keep='first')
    merged_data = features_per_student.merge(grades, on='username', how='inner')

    scores = merged_data[target]
    if not pd.api.types.is_numeric_dtype(scores):
        # Grades read as text: turn non-breaking spaces into plain spaces,
        # trim, and treat blank cells (any amount of whitespace, not only a
        # single space) as missing. Anything else that is not a number still
        # raises in astype(float) below instead of being silently discarded.
        scores = (
            scores.str.replace('\xa0', ' ', regex=False)
                  .str.strip()
                  .replace('', np.nan)
        )
    merged_data[target] = scores.astype(float)

    # Students without a usable grade cannot be trained on.
    merged_data = merged_data.dropna(subset=[target])

    processed_data[target] = merged_data

In [None]:
# (feature frame, grade array) per target; the grades are plain NumPy arrays.
X_train_y_dict = {
    target: (
        term_data.drop(columns=[target, "username"]),
        term_data[target].values,
    )
    for target, term_data in processed_data.items()
}

X_train_QT, y_QT = X_train_y_dict["QT"]
X_train_TH, y_TH = X_train_y_dict["TH"]
X_train_CK, y_CK = X_train_y_dict["CK"]
X_train_TBTL, y_TBTL = X_train_y_dict["TBTL"]

In [None]:
# NumPy versions of the feature matrix and grade vector for each target.
# Despite the "pca" in the names, no dimensionality reduction happens here.
X_pca_dict = {
    target: np.asarray(term_data.drop(columns=[target, "username"]))
    for target, term_data in processed_data.items()
}
y_dict = {
    target: np.asarray(term_data[target].values)
    for target, term_data in processed_data.items()
}

X_pca_QT, y_QT = X_pca_dict["QT"], y_dict["QT"]
X_pca_TH, y_TH = X_pca_dict["TH"], y_dict["TH"]
X_pca_CK, y_CK = X_pca_dict["CK"], y_dict["CK"]
X_pca_TBTL, y_TBTL = X_pca_dict["TBTL"], y_dict["TBTL"]

In [None]:
def objective_lgb(trial, target):
    """Optuna objective: mean 5-fold R² of a LightGBM regressor for ``target``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies learning_rate, num_leaves, subsample, colsample_bytree and
        min_data_in_leaf.
    target : str
        Key into the module-level ``processed_data`` mapping and name of the
        grade column inside that frame.

    Returns
    -------
    float
        Average R² over the five shuffled folds (split seed 42).
    """
    # Settings that are not searched.
    params = dict(
        objective="regression",
        metric="rmse",
        n_estimators=1000,
        verbosity=-1,
        bagging_freq=1,
    )
    # Searched settings; the suggest calls run in the order written here.
    params.update(
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        num_leaves=trial.suggest_int("num_leaves", 2, 2**10),
        subsample=trial.suggest_float("subsample", 0.05, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.05, 1.0),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
    )

    # Training data of this target without the grade and the student id.
    term_data = processed_data[target]
    features = term_data.drop(columns=[target, "username"]).values
    grades = term_data[target].values

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for fit_idx, eval_idx in splitter.split(features, grades):
        model = lgb.LGBMRegressor(**params)
        model.fit(features[fit_idx], grades[fit_idx])
        predicted = model.predict(features[eval_idx])
        fold_scores.append(r2_score(grades[eval_idx], predicted))

    return np.mean(fold_scores)

# Huấn luyện tuần tự cho từng target
targets = list(processed_data.keys())  # ["QT", "TH", "CK", "TBTL"]
for target in targets:
    print(f"Optimizing for {target}...")
    
    # Tạo một thử nghiệm (study) mới cho mỗi mục tiêu
    study_lgb = optuna.create_study(direction="maximize")  # Tối ưu hóa R2 score (maximize)
    
    # Tối ưu hóa tham số cho mô hình của mỗi target
    study_lgb.optimize(lambda trial: objective_lgb(trial, target), n_trials=30)
    
    # In ra tham số tối ưu cho mỗi mục tiêu
    print(f"Best parameters for {target}: {study_lgb.best_params}")
    print(f"Best R2 score for {target}: {study_lgb.best_value}")


In [None]:
def objective_cat(trial, target):
    # Cập nhật tham số cho model
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 3, 8),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': False,
        'loss_function': 'RMSE'
    }

    # Lấy dữ liệu train cho target từ processed_data
    term_data = processed_data[target]
    X_train = term_data.drop(columns=[target, "username"]).values  # Xóa cột target và username
    y = term_data[target].values

    # Cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    r2_list = []

    for train_index, test_index in cv.split(X_train, y):
        X_train_fold, X_test = X_train[train_index], X_train[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = CatBoostRegressor(**params)
        model.fit(X_train_fold, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        r2_list.append(r2)

    return np.mean(r2_list)

# Huấn luyện tuần tự cho từng target
targets = list(processed_data.keys())  # ["QT", "TH", "CK", "TBTL"]
for target in targets:
    print(f"Optimizing for {target}...")

    # Tạo một thử nghiệm (study) mới cho mỗi mục tiêu
    study_cat = optuna.create_study(direction='maximize')  # Tối ưu hóa R2 score (maximize)
    
    # Tối ưu hóa tham số cho mô hình của mỗi target
    study_cat.optimize(lambda trial: objective_cat(trial, target), n_trials=30)
    
    # In ra tham số tối ưu cho mỗi mục tiêu
    print(f"Best parameters for {target}: {study_cat.best_params}")
    print(f"Best R2 score for {target}: {study_cat.best_value}")


In [None]:
voting_r2_dict = {}

# Huấn luyện mô hình cho từng target
for target in targets:
    print(f"Training Voting Regressor for {target}...")

    # Lấy các tham số tối ưu cho mỗi target
    lgb_best_params = study_lgb.best_params  # Đảm bảo các tham số từ LightGBM được lưu
    cat_best_params = study_cat.best_params  # Đảm bảo các tham số từ CatBoost được lưu

    # Lấy dữ liệu train cho target từ processed_data
    term_data = processed_data[target]
    X_train = term_data.drop(columns=[target, "username"]).values  # Loại bỏ cột target và username
    y = term_data[target].values

    # Cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    voting_r2 = []

    for train_index, test_index in kf.split(X_train):
        X_train_fold, X_test = X_train[train_index], X_train[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Khởi tạo mô hình LightGBM và CatBoost
        lgb_model = lgb.LGBMRegressor(**lgb_best_params)
        cat_model = CatBoostRegressor(**cat_best_params)

        # Khởi tạo Voting Regressor với các mô hình
        voting_model = VotingRegressor(estimators=[
            ('lgb', lgb_model),
            ('cat', cat_model)
        ])

        # Huấn luyện các mô hình
        cat_model.fit(X_train_fold, y_train)
        lgb_model.fit(X_train_fold, y_train)
        voting_model.fit(X_train_fold, y_train)

        # Dự đoán và tính R^2
        y_pred = voting_model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        voting_r2.append(r2)

    # Lưu trữ kết quả R^2 của Voting Regressor cho target
    voting_r2_dict[target] = {
        'mean_r2': np.mean(voting_r2),
        'std_r2': np.std(voting_r2)
    }

    print(f'Voting Regressor R^2 for {target}: {voting_r2_dict[target]["mean_r2"]:.4f} ± {voting_r2_dict[target]["std_r2"]:.4f}')


# Test

In [None]:
for target in ['QT', 'TH', 'CK', 'TBTL']:
    print(f"Processing target: {target}")
    
    # Lấy dữ liệu đã xử lý từ processed_data cho target
    processed_train_data = processed_data[target]
    
    # Tìm các cột chung giữa processed_train_data và test_term
    common_cols = processed_train_data.columns.intersection(test_term.columns)

    # Tạo DataFrame mới chỉ chứa các cột chung
    train_term_common = processed_train_data[common_cols]
    test_term_common = test_term[common_cols]
    
    # Tìm các hàng khác nhau
    different_rows = pd.concat([train_term_common, test_term_common]).drop_duplicates(keep=False)
    different_rows = different_rows.drop_duplicates(subset=['username'], keep='first')
    different_rows.reset_index(drop=True, inplace=True)
    
    # In ra số lượng các hàng khác nhau
    print(f"Number of rows in different_rows: {len(different_rows)}")
    
    # Hiển thị 10 dòng đầu tiên của các hàng khác nhau
    print(different_rows.head(10))  # Hiển thị 10 dòng đầu tiên


In [None]:
for target in targets:
    X_test = different_rows.drop(columns=["username"])
    

In [None]:
for target in targets:
    X_pca = np.asarray(X_test)


In [None]:
for target in targets:
    y_pre = voting_model.predict(X_pca)

In [None]:
for target in targets:
    username = different_rows['username'].to_list()
    results = [(username[i], value) for i, value in enumerate(y_pre)]
    df = pd.DataFrame(results, columns=['file_name', 'label'])
    df.to_csv(f'output_{target}.csv', index=False, header=False)
