In [16]:
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [17]:
# Function to split data into training and testing sets
def split_data(datb,size_num,N,I):
    # Randomly select samples for training and testing
    num = np.random.choice(N, size_num, replace=False)
    training_data = datb.iloc[num, :]
    testing_data = datb.drop(num, axis=0)

    training_dat = training_data.iloc[:, :I]
    training_alpha1 = training_data.iloc[:, I:]
    testing_dat = testing_data.iloc[:, :I]
    testing_alpha1 = testing_data.iloc[:, I:]
        
    training_alpha = training_alpha1 + 1
    testing_alpha = testing_alpha1 + 1

    return training_dat,training_alpha,testing_dat,testing_alpha

In [18]:
# Function to train the model using a two-step grid search
def train_model_twostep(X_train,y_train,X_test,y_test,attri_num):
    csestA = np.zeros_like(y_test)
    param_info_list = []

    for j in range(attri_num):
        svc = SVC(kernel='rbf')
        # First step grid search
        C_values = 2.0**np.arange(-5, 15, 2)
        gamma_values = 2.0**np.arange(-15, 3, 2)
        param_grid = {'C':C_values,'gamma':gamma_values}

        skf = RepeatedStratifiedKFold(n_splits=2,n_repeats=10)
        grid = GridSearchCV(svc, param_grid, refit=True, cv=skf)
        grid.fit(X_train,y_train.iloc[:,j])

        best_params = grid.best_params_
        best_C = best_params['C']
        best_gamma = best_params['gamma']

        # Second step grid search with finer granularity
        fine_C_values = 2.0**np.arange(np.log2(best_C) - 2, np.log2(best_C) + 2, 0.5)
        fine_gamma_values = 2.0**np.arange(np.log2(best_gamma) - 1, np.log2(best_gamma) + 2, 0.5)
        fine_param_grid = {'C': fine_C_values, 'gamma': fine_gamma_values}
        fine_grid = GridSearchCV(svc, fine_param_grid, refit=True, cv=skf)
        fine_grid.fit(X_train,y_train.iloc[:,j])

        y_pred = fine_grid.predict(X_test)
        csestA[:, j] = y_pred

        param_info_list.append({
            'j': j,
            'best_C': best_C,
            'best_gamma': best_gamma
        })
    S_Wnk = (y_test == csestA).astype(int)

    SACCR = S_Wnk.mean(axis=0)
    SPCCR = np.prod(S_Wnk, axis=1).mean()

    return SACCR,SPCCR,param_info_list, csestA, y_test, S_Wnk

In [26]:
attri_num = 3           #number of attributes
attri_level = [3,3,2]   #number of attribute levels
Item = 40               #number of items
N = 200                 #number of examinees
size_num = 100          #training sample size: [20,30,50,100,150]
cycles = 5              #number of iterations: 30
datb = pd.read_csv('../data/databind.csv')
datb = datb.iloc[:,1:-2]
Qm = pd.read_csv('../data/Qm.csv')

In [20]:
datb

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,I34,I35,I36,I37,I38,I39,I40,a1,a2,a3
0,0,0,0,1,1,1,1,1,1,1,...,1,1,0,1,1,0,1,3,2,1
1,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,0,1,1,1,1,2
2,0,1,0,0,1,1,1,1,1,1,...,0,1,1,0,1,1,1,3,2,1
3,0,1,0,1,1,1,0,1,1,1,...,0,1,0,0,1,1,0,2,2,1
4,1,1,0,1,1,0,1,1,1,1,...,1,0,1,1,1,0,1,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,1,0,1,1,0,1,1,1,1,...,0,1,0,0,1,1,0,1,2,2
196,0,1,0,0,1,1,1,0,1,1,...,1,1,0,0,0,1,0,2,2,2
197,0,1,0,1,1,0,1,1,1,1,...,0,0,0,0,0,1,1,1,3,1
198,0,1,0,1,1,1,0,0,1,1,...,0,0,0,0,1,1,1,2,2,1


In [21]:
Qm

Unnamed: 0,att1 lev,att2 lev,att3 lev
0,0,3,2
1,1,1,1
2,2,0,2
3,1,0,2
4,1,1,0
5,2,1,0
6,3,0,1
7,0,0,2
8,0,2,0
9,1,2,1


In [27]:
saccr_results = pd.DataFrame()
spccr_results = pd.DataFrame()
all_param_info_list = []
for i in range(cycles):
    X_train,y_train,X_test,y_test = split_data(datb,size_num,N,Item)
    SACCR,SPCCR,param_info_list, csestA, y_test, S_Wnk = train_model_twostep(X_train,y_train,X_test,y_test,attri_num)
    np.savetxt('../result/csestA-'+str(size_num)+'-'+str(i+1)+'.csv', csestA, delimiter=",")
    np.savetxt('../result/y_test-'+str(size_num)+'-'+str(i+1)+'.csv', y_test, delimiter=",")
    np.savetxt('../result/S_Wnk-'+str(size_num)+'-'+str(i+1)+'.csv', S_Wnk, delimiter=",")

    saccr_results = pd.concat([saccr_results, SACCR], axis=1)
    spccr_results = pd.concat([spccr_results, pd.Series([SPCCR], name=f'Run_{i+1}')], axis=1)
    
    for param_info in param_info_list:
        param_info['size_num'] = size_num
        all_param_info_list.append(param_info)

In [28]:
saccr_results = saccr_results.T
spccr_results = spccr_results.T
param_info_df = pd.DataFrame(all_param_info_list)

filename_accr = f"../result/apresult/fSVMACCR-{size_num}.csv"
filename_pccr = f"../result/apresult/fSVMPCCR-{size_num}.csv"
filename_grid = f"../result/gridresult/grid_realdata-{size_num}.csv"
saccr_results.to_csv(filename_accr, index=True)
spccr_results.to_csv(filename_pccr, index=True) 
param_info_df.to_csv(filename_grid, index=True)

In [29]:
saccr_results

Unnamed: 0,a1,a2,a3
0,0.87,0.87,0.97
0,0.9,0.87,0.89
0,0.92,0.91,0.96
0,0.91,0.87,0.9
0,0.86,0.83,0.88


In [30]:
spccr_results

Unnamed: 0,0
Run_1,0.77
Run_2,0.72
Run_3,0.81
Run_4,0.74
Run_5,0.66


In [31]:
param_info_df

Unnamed: 0,j,best_C,best_gamma,size_num
0,0,2.0,0.125,100
1,1,8.0,0.125,100
2,2,8.0,0.125,100
3,0,8.0,0.125,100
4,1,32.0,0.001953,100
5,2,8.0,0.03125,100
6,0,2.0,0.125,100
7,1,2.0,0.125,100
8,2,2.0,0.125,100
9,0,2.0,0.125,100
