# PISA of UIRT

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data preprocessing

In [None]:
# Load the raw SPSS (.sav) data file into a pandas DataFrame.
raw_df = pd.read_spss("CY07_MSU_STU_COG_testlet.sav")

In [None]:
# The whole preprocessing step is carried out with pandas.

# Keep only columns 13..64 (by position); presumably these are the item
# response columns -- TODO confirm against the layout of the data file.
fil1 = raw_df.iloc[:, 13:65]

# Recode the credit labels as binary codes (1: correct, 0: incorrect).
fil2 = fil1.replace(['Full credit', '1 - Full credit', '2 - Full credit', 'No credit', '0 - No credit'], [1, 1, 1, 0, 0])

# Drop item 'CM955Q03S' (per the original note, a multiple-choice item).
fil3 = fil2.drop('CM955Q03S', axis=1)
# Drop students (rows) that have no response at all.
# NOTE(review): dropna() works on rows by default, so items (columns) that are
# entirely empty are NOT removed by this call.
fil4 = fil3.dropna(how='all')

In [None]:
# Convert the cleaned response table to a NumPy array.
num_np = fil4.to_numpy()

# Replace exact 1 with 0.99 and exact 0 with 0.01 so that the ratios and
# logarithms taken later (see set_D_KL) never hit 0 or a division by zero.
# Missing responses (NaN) match neither condition and stay NaN.
scarub_np = np.where(num_np == 1, 0.99, num_np)
scourge_np = np.where(scarub_np == 0, 0.01, scarub_np)
num_df = scourge_np                                     # Caution: despite its name, num_df is a NumPy array, not a DataFrame.

# print(num_df)

num_dfdf = pd.DataFrame(num_df)                         # pandas version of num_df (default integer row/column labels)
p_solves = num_dfdf.notnull().sum(1)                    # per-row count of non-missing responses

# Data shape: number of rows and number of columns of the response matrix
rows, columns = num_df.shape
# print(rows, columns)


## Selection of Responses for the Test Set

In [None]:
from collections import Counter
import random
import math

### Required Functions

In [None]:
def simple_random(num_residues, num_division):
    """Return a randomly ordered 0/1 NumPy vector of length ``num_division``.

    The vector holds ``num_residues`` ones (at most ``num_division`` of them)
    and zeros everywhere else; the order is randomised with
    ``random.shuffle``.
    """
    # Slots below the threshold become 1, the remaining slots 0.
    flags = [1 if slot < num_residues else 0 for slot in range(num_division)]
    random.shuffle(flags)
    return np.array(flags)

In [None]:
def random_colrow_extractor(df_bf_gagong, df_pray_gagong, rate_sam):
    """Move a random sample of observed responses from the train set to a test set.

    Relies on the module-level ``tot_num_ref`` (total number of responses) and
    on ``simple_random``.

    Parameters
    ----------
    df_bf_gagong : pandas.DataFrame
        Full response table including the 'NS' column.  A copy without 'NS'
        becomes the train set; sampled cells are set to NaN in it.
    df_pray_gagong : pandas.DataFrame
        Rows that may be sampled from (same columns, including 'NS').  Its row
        labels must also exist in ``df_bf_gagong``.
    rate_sam : float
        Target sample size as a fraction of ``tot_num_ref``.

    Returns
    -------
    tuple
        ``(train_df, test_df, coords)`` where ``test_df`` is NaN except at the
        sampled cells and ``coords`` is ``np.array([row_labels, column_labels,
        values])`` describing every sampled cell.
    """
    df_decay_train = df_bf_gagong.drop(['NS'], axis=1)
    df_decay = df_pray_gagong.drop(['NS'], axis=1)

    # Item columns come from the data itself instead of a hard-coded count.
    item_cols = df_decay.columns.tolist()
    row_min, col_min = df_decay.shape

    num_sam = math.trunc(tot_num_ref * rate_sam)     # tot_num_ref is a module-level variable.

    # Per-item quota: spread num_sam evenly, remainder assigned at random.
    col_quota_np = simple_random(num_sam % col_min, col_min) + num_sam // col_min
    col_quota = dict(zip(item_cols, col_quota_np.tolist()))

    # Per-examinee quota, distributed the same way.
    row_quota_np = simple_random(num_sam % row_min, row_min) + num_sam // row_min
    row_quota = dict(zip(df_decay.index.tolist(), row_quota_np.tolist()))

    # Coordinates and values of the sampled cells.
    coord_row = []
    coord_col = []
    row_col_val = []

    # Test set: all NaN, aligned with the train set's rows and the item columns.
    basket_test = pd.DataFrame(np.nan, index=df_decay_train.index, columns=item_cols)

    # Visit examinees in random order.
    shf_index = df_decay.index.tolist()
    random.shuffle(shf_index)

    for mu in shf_index:
        responses = df_decay.loc[mu]

        # An item is eligible if this examinee answered it and the item still
        # has quota left.
        eligible = [
            col for col in item_cols
            if not np.isnan(responses[col]) and col_quota[col] != 0
        ]

        picked = simple_random(row_quota[mu], len(eligible))

        for pos in np.flatnonzero(picked == 1):
            col_picked = eligible[pos]
            value = responses[col_picked]

            coord_row.append(mu)
            coord_col.append(col_picked)
            row_col_val.append(value)

            col_quota[col_picked] -= 1
            # Single-step .at assignment: chained ``df.loc[r][c] = v`` writes to
            # a temporary and is silently lost under pandas Copy-on-Write.
            df_decay_train.at[mu, col_picked] = np.nan
            basket_test.at[mu, col_picked] = value

    data_collect_np = np.array([coord_row, coord_col, row_col_val])

    return df_decay_train, basket_test, data_collect_np      # processed train set, test set and the coordinates of the test set

In [None]:
# Prepare the tables used to sample responses into the test set.

# Response table plus the per-student count of answered items.
basket_ini = pd.concat([num_dfdf, p_solves], axis=1)

# Copy of the response table labelled with the original item names.
num_dfdf_stunt = num_dfdf.copy()
num_dfdf_stunt.columns = fil4.columns.to_list()

basket_column = fil4.columns.to_list()
basket_column.append('NS')                     # NS stands for 'N'umber of the 'S'olved problems

basket_ini.columns = basket_column

gagong_univ1 = basket_ini.copy()

# Row labels of students who answered 15 items or fewer; they are excluded
# from sampling.
less_2 = basket_ini.index[basket_ini['NS'] <= 15].tolist()

print(less_2)
basket_sel = basket_ini.drop(less_2, axis=0)

# Total number of responses in the data set.  Select the 'NS' column by label:
# integer keys such as ``series[-1]`` on a label-indexed Series are deprecated
# positional access.
tot_num_ref = int(gagong_univ1['NS'].sum())

In [None]:
# Draw `num_iter` independent random train/test splits (10% of the responses
# go to the test set) and store the test-set coordinates of each split as CSV.
num_iter = 10          # use 1 for a quick trial run
train_gagongs, test_gagongs = [], []

for trial_no in range(1, num_iter + 1):
    # 'Gagong' refers to the processed data with sampled cells blanked out.
    split = random_colrow_extractor(basket_ini, basket_sel, 0.1)
    train_part, test_part, coords = split

    train_gagongs.append(train_part)
    test_gagongs.append(test_part)
    pd.DataFrame(coords).to_csv("UIRT_0.1testset_{0}_0701.csv".format(trial_no))


In [None]:
# Export every train/test split to CSV (file names use 1-based split numbers).
num_sav1 = 0

for split_no in range(1, num_iter + 1):
    train_copy = train_gagongs[split_no - 1].copy()
    test_copy = test_gagongs[split_no - 1].copy()

    pd.DataFrame(train_copy).to_csv("share_traindf0.1_{0}_1518_0701.csv".format(split_no))
    pd.DataFrame(test_copy).to_csv("share_testdf0.1_1518_{0}_0701.csv".format(split_no))

In [None]:
# Show, per column, how many non-missing responses the test set at index 3
# holds (this requires at least four generated splits).
print(test_gagongs[3].notnull().sum(axis=0))

## List of Required Functions

In [None]:
# Model-implied response probabilities, for comparison with the reference data.
# alpha_let, beta_let and theta_let are NumPy arrays; train_gagong_let is a pandas DataFrame.
def expect_model(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return the model probability for every cell, NaN where no response exists.

    Parameters
    ----------
    alpha_let, beta_let, theta_let : numpy.ndarray
        Parameters combined by broadcasting as
        ``1 / (1 + exp(alpha * (beta - theta)))``.
    train_gagong_let : pandas.DataFrame
        Reference responses; must have the same shape as the broadcast
        probability matrix.  Its missing cells are copied into the result as
        NaN.

    Returns
    -------
    numpy.ndarray
        Probabilities with exact 1 replaced by 0.99 and exact 0 by 0.01, so
        later logarithms and ratios stay finite.
    """
    gagong_let = train_gagong_let.to_numpy()
    before_nan = 1 / (1 + np.exp(alpha_let * (beta_let - theta_let)))

    # Carry the missing-data pattern over in one vectorised step instead of a
    # Python loop over every cell.
    after_nan = np.where(pd.isna(gagong_let), np.nan, before_nan)

    # Comparisons with NaN are False, so missing cells pass through unchanged.
    scarub_e = np.where(after_nan >= 1, 0.99, after_nan)
    result_e = np.where(scarub_e <= 0, 0.01, scarub_e)

    return result_e                  # The result is a NumPy array.


In [None]:
# Factor shared by all chain-rule derivatives of D_KL.
# Only train_gagong_let is a pandas object.
def preprocess_diff(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return (model probability - reference response) for every cell."""
    observed = train_gagong_let.to_numpy()                                      # q: from the reference data
    predicted = expect_model(alpha_let, beta_let, theta_let, train_gagong_let)  # p: from the model

    # Combine p and q directly.
    return predicted - observed

In [None]:
# Gradient-descent update of alpha.
# Only train_gagong_let is a pandas object.
def set_alpha(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return alpha after one gradient-descent step.

    The step size is the module-level learning rate ``A``.
    """
    residual = preprocess_diff(alpha_let, beta_let, theta_let, train_gagong_let)

    # Per-cell contribution: (theta - beta) * (p - q).
    per_cell = (theta_let - beta_let) * residual

    # Missing cells (NaN) contribute nothing; then sum over examinees.
    gradient = pd.DataFrame(per_cell).fillna(0).to_numpy().sum(axis=0, keepdims=True)

    return alpha_let - A * gradient

In [None]:
# Gradient-descent update of beta.
# Only train_gagong_let is a pandas object.
def set_beta(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return beta after one gradient-descent step, re-centred to mean zero.

    The step size is the module-level learning rate ``A``.
    """
    residual = preprocess_diff(alpha_let, beta_let, theta_let, train_gagong_let)

    # Per-cell contribution: -alpha * (p - q).
    per_cell = (-alpha_let) * residual

    # Missing cells (NaN) contribute nothing; then sum over examinees.
    gradient = pd.DataFrame(per_cell).fillna(0).to_numpy().sum(axis=0, keepdims=True)

    stepped = beta_let - A * gradient
    return stepped - np.mean(stepped)                            # The result is a NumPy array.

In [None]:
# Gradient-descent update of theta.
# Only train_gagong_let is a pandas object.
def set_theta(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return theta after one gradient-descent step.

    The step size is the module-level learning rate ``A``.
    """
    residual = preprocess_diff(alpha_let, beta_let, theta_let, train_gagong_let)

    # Per-cell contribution: alpha * (p - q).
    per_cell = alpha_let * residual

    # Missing cells (NaN) contribute nothing; then sum over items.
    gradient = pd.DataFrame(per_cell).fillna(0).to_numpy().sum(axis=1, keepdims=True)

    return theta_let - A * gradient                              # The result is a NumPy array.

In [None]:
# Kullback-Leibler divergence between the reference data and the model.
# train_gagong_let is a pandas object; the other arguments are NumPy arrays.
def set_D_KL(alpha_let, beta_let, theta_let, train_gagong_let):
    """Return the KLD summed over all non-missing cells."""
    observed = train_gagong_let.to_numpy().copy()
    predicted = expect_model(alpha_let, beta_let, theta_let, train_gagong_let)

    # Binary KL divergence per cell: q*log(q/p) + (1-q)*log((1-q)/(1-p)).
    hit_term = observed * np.log((observed) / (predicted))
    miss_term = (1 - observed) * np.log((1 - observed) / (1 - predicted))
    per_cell = hit_term + miss_term

    # Missing cells (NaN) count as zero.
    filled = pd.DataFrame(per_cell).fillna(0).to_numpy()

    # Sum over items first, then over examinees.
    return filled.sum(axis=1).sum(axis=0)

In [None]:
# Iteration function for model optimization

def opt_model(alpha_let, beta_let, theta_let, train_gagong_let, test_gagong_let, num_iter):
    """Fit alpha, beta and theta by alternating gradient-descent updates.

    Each iteration updates alpha, then beta, then theta (each update sees the
    values produced just before it) and recomputes the train-set KLD.  A step
    is accepted only if it lowers the train-set KLD; the first step that does
    not ends the optimisation.

    Parameters
    ----------
    alpha_let, beta_let, theta_let : numpy.ndarray
        Initial parameter values (copied, not modified).
    train_gagong_let, test_gagong_let : pandas.DataFrame
        Train and test response tables.
    num_iter : int
        Maximum number of iterations.

    Returns
    -------
    tuple
        ``(alpha, beta, theta, KLD_Trains, KLD_Tests)``.  The parameters are
        those of the last accepted step, i.e. they correspond to the last
        entry of ``KLD_Trains``; both lists start with the KLD of the initial
        parameters.
    """
    # Initialization of parameters and variables
    alpha_test = alpha_let.copy()
    beta_test = beta_let.copy()
    theta_test = theta_let.copy()

    KLD_train = set_D_KL(alpha_test, beta_test, theta_test, train_gagong_let)
    KLD_Trains = [KLD_train]
    KLD_Tests = [set_D_KL(alpha_test, beta_test, theta_test, test_gagong_let)]

    for _ in tqdm(range(num_iter)):
        # Candidate parameters for this step.
        alpha_carrier = set_alpha(alpha_test, beta_test, theta_test, train_gagong_let)
        beta_carrier = set_beta(alpha_carrier, beta_test, theta_test, train_gagong_let)
        theta_carrier = set_theta(alpha_carrier, beta_carrier, theta_test, train_gagong_let)

        KLD_carrier = set_D_KL(alpha_carrier, beta_carrier, theta_carrier, train_gagong_let)

        # Stop on the first step that does not improve the train-set KLD (a
        # NaN also stops) and discard it, so the returned parameters match the
        # reported divergence.
        if not KLD_carrier < KLD_train:
            break

        alpha_test, beta_test, theta_test = alpha_carrier, beta_carrier, theta_carrier
        KLD_train = KLD_carrier
        KLD_Trains.append(KLD_train)        # store KLD of train set
        KLD_Tests.append(set_D_KL(alpha_test, beta_test, theta_test, test_gagong_let))   # store KLD of test set

    print("Final Kullback-Leibler Divergence: ", KLD_train)

    return alpha_test, beta_test, theta_test, KLD_Trains, KLD_Tests

## Now it is time to play the real game!

In [None]:
from tqdm import tqdm

### real training

In [None]:
# Fit the model once per train/test split and collect the results.
albetheKLD = []
num_iter = 0            # position of the split currently being trained

for gagong_carrier in train_gagongs:

    p_df = gagong_carrier.copy()
    num_np = p_df.to_numpy()

    # theta initialization: logit of each row's mean response, as a column vector
    row_prob = p_df.mean(axis=1).to_numpy().reshape(rows, 1)
    theta = np.log(row_prob / (1 - row_prob))

    # beta initialization: mean logit minus each column's logit, as a row vector
    col_prob = p_df.mean(axis=0).to_numpy()[np.newaxis, :]
    beta0 = np.log(col_prob / (1 - col_prob))
    beta = np.mean(beta0) - beta0

    # alpha initialization: all ones
    alpha = np.ones((1, columns))

    A = 0.002   # learning rate (read as a global by set_alpha/set_beta/set_theta)

    fitted = opt_model(alpha, beta, theta, p_df, test_gagongs[num_iter], 500)
    alpha_mod, beta_mod, theta_mod, KLDs_mod, KLDs_test_mod = fitted

    # Layout: 0 train KLDs, 1 test KLDs, 2 alpha, 3 beta, 4 theta
    albetheKLD.append([KLDs_mod, KLDs_test_mod, alpha_mod, beta_mod, theta_mod])
    num_iter += 1



In [None]:
# Number of non-missing responses held by each test set.
num_tests = [vect.notnull().sum().sum() for vect in test_gagongs]

print(num_tests)

## Data Storage

In [None]:
# Save the train/test KLD histories of every split (1-based file numbering).
num_save = 0
for num_save, carrying in enumerate(albetheKLD, start=1):
    train_history, test_history = carrying[0], carrying[1]

    pd.DataFrame(train_history).to_csv("UIRT{0}_KLDs_0.1train_0720.csv".format(num_save))
    pd.DataFrame(test_history).to_csv("UIRT{0}_KLDs_0.1test_0720.csv".format(num_save))

In [None]:
# Save the fitted alpha, beta and theta of every split (1-based file numbering).
num_save = 0
for num_save, carrying in enumerate(albetheKLD, start=1):
    alpha_fit, beta_fit, theta_fit = carrying[2], carrying[3], carrying[4]

    pd.DataFrame(alpha_fit).to_csv("UIRT{0}_alpha_0.1_0701.csv".format(num_save))
    pd.DataFrame(beta_fit).to_csv("UIRT{0}_beta_0.1_0701.csv".format(num_save))
    pd.DataFrame(theta_fit).to_csv("UIRT{0}_theta_0.1_0701.csv".format(num_save))

## Test set vs Train set and Save the Final Data

In [None]:
def expect_simple_cal(alpha_let, beta_let, theta_let):
    """Return the model probability clamped to the range [0.01, 0.99].

    Computes ``exp(z) / (1 + exp(z))`` with ``z = alpha * (theta - beta)``.
    Works for scalars and, by broadcasting, for NumPy arrays.
    """
    # Limit the exponent before exponentiating: for large z the original form
    # evaluates inf / inf = NaN, which slips through both clamps.  Beyond
    # |z| = 50 the result is clamped anyway, so in-range values are unchanged.
    exponent = np.clip(alpha_let * (theta_let - beta_let), -50.0, 50.0)
    odds = np.exp(exponent)

    # Remove the extremes near 0 and 1.
    return np.clip(odds / (1 + odds), 0.01, 0.99)

In [None]:
# Score every held-out response with the fitted parameters of its split,
# report the hit rate and save the detailed and summarised verdicts as CSV.
basket_final = []
num_unit = 0
for gagong_unit in test_gagongs:
    # Fitted results of this split: [2] alpha, [3] beta, [4] theta.
    basket_picks = albetheKLD[num_unit]

    # From here on num_unit is the 1-based split number used in file names.
    num_unit += 1

    # theta is labelled by row position.
    theta_fin_df = pd.DataFrame(basket_picks[4])

    # alpha and beta get the item names as column labels.
    alpha_fin_df = pd.DataFrame(basket_picks[2])
    beta_fin_df = pd.DataFrame(basket_picks[3])

    alpha_fin_df.columns = fil4.columns.to_list()
    beta_fin_df.columns = fil4.columns.to_list()

    # Test-set coordinates: row 0 = student label, row 1 = item name,
    # row 2 = recorded response; one column per sampled response.
    coord_pd = pd.read_csv("UIRT_0.1testset_{0}_0701.csv".format(num_unit))
    coord_pd.drop(['Unnamed: 0'], axis=1, inplace=True)
    sampl_len = coord_pd.shape[1]

    # One record per sampled response: student label, item name, response,
    # theta, alpha, beta, model probability.
    threshed_theta = []

    for th in range(sampl_len):
        # Purely positional access: the CSV column labels are strings, so an
        # integer key on a row Series would rely on deprecated fallback.
        coord_x = int(coord_pd.iloc[0, th])
        coord_col = coord_pd.iloc[1, th]
        correctness = float(coord_pd.iloc[2, th])

        theta_piece = theta_fin_df.loc[coord_x, 0]
        alpha_piece = alpha_fin_df.loc[0, coord_col]
        beta_piece = beta_fin_df.loc[0, coord_col]
        expect_cal = expect_simple_cal(alpha_piece, beta_piece, theta_piece)

        threshed_theta.append([
            coord_x,         # 0
            coord_col,       # 1
            correctness,     # 2
            theta_piece,     # 3
            alpha_piece,     # 4
            beta_piece,      # 5
            expect_cal,      # 6
        ])

    threshed_pick = threshed_theta.copy()

    # Check whether the model judged each response correctly.
    stud_info_simple = []
    num_R = 0
    num_W = 0
    num_tot = 0

    for student in threshed_pick:
        data_real = student[2]
        data_cal = student[6]

        # Round the model probability to the nearest coded response
        # (0.99 = correct, 0.01 = incorrect) with a 0.5 threshold.
        data_jud = 0.99 if data_cal >= 0.5 else 0.01

        if data_real == data_jud:
            data_RW = 'O'                   # 'O': the model matched the actual response
            num_R += 1
        else:
            data_RW = 'X'                   # 'X': the model did not match the actual response
            num_W += 1
        num_tot += 1

        stud_info_simple.append([student[0], student[1], data_RW])

    # Avoid a ZeroDivisionError when a split holds no sampled responses.
    success_rate = num_R / num_tot * 100 if num_tot else float("nan")
    print("{0}번째 판정 성공률: {1}".format(num_unit, success_rate))

    # Save the detailed judgement data.
    threshed_pick_df = pd.DataFrame(threshed_pick)
    threshed_pick_df.to_csv("Judgement_mid_UIRT{0}_0701.csv".format(num_unit))

    # Rearrange into one column per response for readability.
    stud_info_np = np.array(stud_info_simple)
    stud_info_T = np.transpose(stud_info_np)
    stud_info_df = pd.DataFrame(stud_info_T)
    stud_info_df.rename(index={0: "Stud #", 1: "Prob #", 2: "Judge"}, inplace=True)

    # Keep the summary and save it.
    basket_final.append(stud_info_df)
    stud_info_df.to_csv("Judgement_fin_UIRT{0}_0701.csv".format(num_unit))
    