In [2]:
%cd '/content/drive/MyDrive/MTP I/code'

/content/drive/MyDrive/MTP I/code


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# List the contents of the current working directory.
!ls

task3_question_evaluate.ipynb


In [4]:
import pandas as pd
import scipy as sp
from scipy.stats import multinomial
import os
import numpy as np
import math

In [5]:
def entropy(c):
    """Return the Shannon entropy, in bits, of the probabilities in ``c``.

    Parameters
    ----------
    c : sized iterable of numbers
        Probabilities of the outcomes. Entries equal to zero contribute
        nothing, following the usual convention 0 * log(0) = 0.

    Returns
    -------
    number
        ``sum(-x * log2(x))`` over the non-zero entries, or the sentinel
        ``-1`` when ``c`` is empty.

    Raises
    ------
    ValueError
        If an entry is negative (``math.log`` rejects it).
    """
    # Empty input keeps the sentinel value callers may already test for.
    if len(c) == 0:
        return -1
    # Skip exact zeros: math.log(0, 2) would raise ValueError even though the
    # term's limit is 0. Negative entries are deliberately still passed to
    # math.log so invalid input keeps failing loudly.
    return sum(-x * math.log(x, 2) for x in c if x != 0)

In [6]:
# Training responses are read into `df`; the three side-information tables
# (answer, question and student metadata) are read in one pass below.
df = pd.read_csv('/content/drive/MyDrive/MTP I/data/train_data/train_task_3_4.csv')
answer_meta_data, question_meta_data, student_meta_data = [
    pd.read_csv(metadata_path)
    for metadata_path in (
        '/content/drive/MyDrive/MTP I/data/metadata/answer_metadata_task_3_4.csv',
        '/content/drive/MyDrive/MTP I/data/metadata/question_metadata_task_3_4.csv',
        '/content/drive/MyDrive/MTP I/data/metadata/student_metadata_task_3_4.csv',
    )
]

In [7]:
# Submission template: later cells look up its 'QuestionId' column and add
# the feature, score and ranking columns to it.
submission_file = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/MTP I/data/train_data/submission_task_3.csv'
)

### Calculate the entropy of the chosen answers (per question)

In [10]:
import pandas as pd
from scipy.stats import entropy
import numpy as np

# Sample DataFrames for illustration
# df = pd.DataFrame({
#     'QuestionId': [1, 1, 1, 2, 2, 3, 3, 3, 3],
#     'AnswerValue': ['a', 'b', 'a', 'b', 'b', 'a', 'c', 'c', 'a']
# })

# Per-question entropy of the empirical distribution over selected answers.
# value_counts(normalize=True) turns the raw picks into relative frequencies,
# so `entropy` always receives a proper probability vector.
choice_entropy = df.groupby('QuestionId')['AnswerValue'].agg(
    lambda picks: entropy(picks.value_counts(normalize=True).values)
)

# Look up each submission question's entropy by QuestionId.
submission_file['entropy_choice'] = submission_file['QuestionId'].map(choice_entropy)

# Standardise to a z-score (np.std uses its default ddof=0, i.e. population std).
submission_file['z_entropy_choice'] = submission_file['entropy_choice'].pipe(
    lambda col: (col - np.mean(col)) / np.std(col)
)


### calculate the question confidence

In [11]:
# Minimum number of confidence-rated answers a question needs before its
# average confidence is used; questions below it get the fallback value.
MIN_CONFIDENCE_ANSWERS = 20

# Left-join the answer metadata onto every response; 'Confidence' is read from
# the merged frame (presumably supplied by answer_meta_data - confirm).
new_df = df.merge(answer_meta_data, on='AnswerId', how='left')
notnull_confidence = new_df[new_df['Confidence'].notnull()]

# Number of confidence-rated answers per question.
que_num = notnull_confidence.groupby('QuestionId').size()
num_confid = que_num.to_dict()

# Keep only the questions that reach the minimum rating count.
valid_que = que_num[que_num >= MIN_CONFIDENCE_ANSWERS].index.tolist()
notnull_confidence = notnull_confidence[notnull_confidence['QuestionId'].isin(valid_que)]

# Mean confidence per retained question, computed in one grouped pass instead
# of re-filtering the whole frame once per question.
que_avg_confid = notnull_confidence.groupby('QuestionId')['Confidence'].mean().to_dict()
all_que_confid = list(que_avg_confid.values())

# Questions without a usable average fall back to the mean of the per-question
# averages; compute it once rather than once per row.
fallback_confid = np.mean(all_que_confid)
submission_file['confidence'] = submission_file['QuestionId'].map(que_avg_confid).fillna(fallback_confid)

# Standardise to a z-score (np.std uses its default ddof=0, i.e. population std).
submission_file['z_confidence'] = (submission_file['confidence']-np.mean(submission_file['confidence']))/np.std(submission_file['confidence'])

### Calculate the entropy of answer correctness (right vs. wrong)

In [13]:
import pandas as pd
from scipy.stats import entropy
import numpy as np

# Sample DataFrames for illustration
# df = pd.DataFrame({
#     'QuestionId': [1, 1, 1, 2, 2, 3, 3, 3, 3],
#     'IsCorrect': [1, 0, 1, 0, 0, 1, 0, 0, 1]
# })
# submission_file = pd.DataFrame({
#     'QuestionId': [1, 2, 3]
# })

# Per-question entropy of the correct/incorrect split: normalised value
# counts of IsCorrect are handed straight to `entropy`.
right_entropy = df.groupby('QuestionId')['IsCorrect'].agg(
    lambda outcomes: entropy(outcomes.value_counts(normalize=True))
)

# Look up each submission question's entropy by QuestionId.
submission_file['right_entropy'] = submission_file['QuestionId'].map(right_entropy)

# Standardise to a z-score (np.std uses its default ddof=0, i.e. population std).
submission_file['z_entropy_right'] = submission_file['right_entropy'].pipe(
    lambda col: (col - np.mean(col)) / np.std(col)
)

### calculate the entropy conditioned on group
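$H(\text{IsCorrect} \mid \text{GroupId}) = \sum_{g} \frac{n_g}{n}\, H(\text{IsCorrect} \mid \text{GroupId} = g)$, where $n_g$ is the number of answers to the question from group $g$ and $n$ is the total number of answers to the question.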

In [20]:
from scipy.stats import entropy
import numpy as np

def get_one_condition_entropy(one_df, group_col='GroupId'):
    """Return the conditional entropy of ``IsCorrect`` given ``group_col``.

    The score is the size-weighted average of the per-group entropies:
    ``sum_g (n_g / n) * H(IsCorrect | group_col == g)``, where ``n_g`` is the
    number of rows in group ``g`` and ``n`` is the number of rows in
    ``one_df``. Entropy is computed by the module-level ``entropy`` function
    on each group's normalised ``IsCorrect`` value counts.

    Parameters
    ----------
    one_df : pandas.DataFrame
        Rows to score; must contain ``group_col`` and ``'IsCorrect'``.
    group_col : str, default 'GroupId'
        Column to condition on. The default keeps existing callers unchanged.

    Returns
    -------
    number
        The weighted entropy, or ``0`` when ``one_df`` has no rows. Rows whose
        ``group_col`` is missing are dropped by ``groupby`` but still count
        towards ``n``.
    """
    total_rows = one_df.shape[0]
    # Nothing to weight; also avoids dividing by zero below.
    if total_rows == 0:
        return 0

    is_correct_by_group = one_df.groupby(group_col)['IsCorrect']
    group_entropy = is_correct_by_group.agg(lambda x: entropy(x.value_counts(normalize=True)))
    # Group sizes come from the same groupby instead of re-filtering one_df
    # once per group.
    group_weight = is_correct_by_group.size() / total_rows

    return (group_entropy * group_weight).sum()

In [21]:
# Conditional entropy H(IsCorrect | GroupId) for every question. groupby splits
# new_df once instead of re-filtering the whole frame for each question id;
# sort=False keeps the questions in first-appearance order.
cond_right_group_entropy = {
    one_que_id: get_one_condition_entropy(one_df)
    for one_que_id, one_df in new_df.groupby('QuestionId', sort=False)
}

# Direct dict lookup on purpose: a submission question with no rows in new_df
# raises KeyError here rather than silently producing NaN.
submission_file['cond_entropy_group'] = submission_file['QuestionId'].apply(lambda x: cond_right_group_entropy[x])

# Population standard deviation (ddof=0) so this z-score is scaled like the
# np.std-based z-scores of the other features; Series.std() defaults to ddof=1.
submission_file['z_cond_entropy'] = (submission_file['cond_entropy_group'] - submission_file['cond_entropy_group'].mean()) / submission_file['cond_entropy_group'].std(ddof=0)


### calculate the entropy conditioned on quiz

In [23]:
from scipy.stats import entropy as multinomial_entropy
import pandas as pd

def get_one_quiz_condition_entropy(one_df):
    """Return the conditional entropy of ``IsCorrect`` given ``QuizId``.

    The score is the size-weighted average of the per-quiz entropies:
    ``sum_q (n_q / n) * H(IsCorrect | QuizId == q)``, where ``n_q`` is the
    number of rows for quiz ``q`` and ``n`` is the number of rows in
    ``one_df``. Entropy is computed by ``multinomial_entropy`` on each quiz's
    normalised ``IsCorrect`` value counts.

    Parameters
    ----------
    one_df : pandas.DataFrame
        Rows to score; must contain ``'QuizId'`` and ``'IsCorrect'``.

    Returns
    -------
    number
        The weighted entropy, or ``0`` when ``one_df`` has no rows. Rows whose
        ``QuizId`` is missing are dropped by ``groupby`` but still count
        towards ``n``.

    Raises
    ------
    ValueError
        If the per-quiz aggregation does not produce a pandas Series.
    """
    total_rows = one_df.shape[0]
    # Nothing to weight; also avoids dividing by zero below.
    if total_rows == 0:
        return 0

    is_correct_by_quiz = one_df.groupby('QuizId')['IsCorrect']
    some_res = is_correct_by_quiz.agg(
        lambda x: multinomial_entropy(x.value_counts(normalize=True))
    )

    # Defensive check kept from the original: one scalar entropy per quiz.
    if not isinstance(some_res, pd.Series):
        raise ValueError("some_res should be a pandas Series")

    # Quiz sizes come from the same groupby instead of re-filtering one_df
    # once per quiz.
    quiz_weight = is_correct_by_quiz.size() / total_rows

    return (some_res * quiz_weight).sum()

# Main computation: conditional entropy H(IsCorrect | QuizId) for every
# question. groupby splits new_df once instead of re-filtering the whole frame
# for each question id; sort=False keeps first-appearance order.
cond_right_quiz_entropy = {
    one_que_id: get_one_quiz_condition_entropy(one_df)
    for one_que_id, one_df in new_df.groupby('QuestionId', sort=False)
}

# Apply results to submission file. Direct dict lookup on purpose: a question
# with no rows in new_df raises KeyError rather than silently producing NaN.
submission_file['cond_entropy_quiz'] = submission_file['QuestionId'].apply(lambda x: cond_right_quiz_entropy[x])

# Population standard deviation (ddof=0) so this z-score is scaled like the
# np.std-based z-scores of the other features; Series.std() defaults to ddof=1.
submission_file['z_cond_quiz_entropy'] = (submission_file['cond_entropy_quiz'] - submission_file['cond_entropy_quiz'].mean()) / submission_file['cond_entropy_quiz'].std(ddof=0)


### final ranking

In [24]:
# Weights for the group-conditional entropy, the quiz-conditional entropy and
# the right/wrong entropy. Choice entropy enters with weight 1 and confidence
# is subtracted with weight 1.
float0, float_1, float_2 = 0.7, 0.1, 1

# Terms are combined left to right in the same order as a plain
# `a + b + c + d - e` expression would evaluate them.
submission_file['final_score'] = (
    submission_file['z_entropy_choice']
    .add(float0 * submission_file['z_cond_entropy'])
    .add(float_1 * submission_file['z_cond_quiz_entropy'])
    .add(float_2 * submission_file['z_entropy_right'])
    .sub(submission_file['z_confidence'])
)

In [26]:
# Rank 1 goes to the highest final_score; method='first' breaks ties by row
# order so every rank is unique. Cast to int64 rather than int16: int16 tops
# out at 32767 and larger ranks would overflow in the cast.
ranking = submission_file['final_score'].rank(method='first', ascending=False).astype('int64')
submission_file['ranking'] = ranking

# Write the (QuestionId, ranking) pairs in submission-file order.
submission_file[['QuestionId','ranking']].to_csv('/content/drive/MyDrive/MTP I/data/train_data/final_report.csv',index=False)

# Re-read that report and write a second copy sorted from rank 1 downwards.
first_try = pd.read_csv('/content/drive/MyDrive/MTP I/data/train_data/final_report.csv')
first_try_zip = first_try.sort_values("ranking", ascending=True)
first_try_zip.to_csv('/content/drive/MyDrive/MTP I/data/train_data/submission_task_3_report.csv',index=False)