In [289]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cosine
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression, Ridge
from scipy.stats import pearsonr

from ast import literal_eval


In [290]:
# Load the pretrained embeddings
embeddings = model


In [291]:
def create_answer_embeddings(answer_text):
    words = literal_eval(answer_text)
    answer_vector = np.zeros(embeddings.vector_size)
    for word in words:
        # if word in embeddings:
        #     answer_vector += embeddings.get_vector(f"/c/af/{word}")

        try:
            answer_vector += embeddings[f"/c/en/{word}"]
        except:
            answer_vector += np.zeros((embeddings.vector_size,), dtype=np.float32)
    return answer_vector


In [302]:
# Load the dataset
data = pd.read_csv("data/processed/data/stemmed_data/datasets/beetle.csv")

# Create embeddings for student and reference answers
data['student_answer_embedding'] = data['student_answer'].apply(create_answer_embeddings)
data['reference_answer_embedding'] = data['reference_answer'].apply(create_answer_embeddings)


In [303]:
def cosine_similarity(vec1, vec2):

    return 1 - cosine(vec1, vec2)

data['cosine_similarity'] = data.apply(lambda row: cosine_similarity(row['student_answer_embedding'], row['reference_answer_embedding']), axis=1)


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [304]:
X = data['cosine_similarity'].values.reshape(-1, 1)
y = data['assigned_points']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train regression models
isotonic_regression = IsotonicRegression().fit(X_train, y_train)
linear_regression = LinearRegression().fit(X_train, y_train)
ridge_regression = Ridge().fit(X_train, y_train)


In [305]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    pearson_corr, _ = pearsonr(y_test, y_pred)
    return rmse, pearson_corr

isotonic_rmse, isotonic_pearson = evaluate_model(isotonic_regression, X_test, y_test)
linear_rmse, linear_pearson = evaluate_model(linear_regression, X_test, y_test)
ridge_rmse, ridge_pearson = evaluate_model(ridge_regression, X_test, y_test)

print("Isotonic Regression: RMSE =", isotonic_rmse, "Pearson correlation =", isotonic_pearson)
print("Linear Regression: RMSE =", linear_rmse, "Pearson correlation =", linear_pearson)
print("Ridge Regression: RMSE =", ridge_rmse, "Pearson correlation =", ridge_pearson)


Isotonic Regression: RMSE = 0.45627136608342167 Pearson correlation = 0.37715228432930203
Linear Regression: RMSE = 0.4633593913700743 Pearson correlation = 0.3386137299477546
Ridge Regression: RMSE = 0.46335502927691646 Pearson correlation = 0.3386137299477546


In [306]:
def evaluate_model_accuracy(model, X_test, y_test, threshold=0.5):
    y_pred = model.predict(X_test)
    y_pred_rounded = np.round(y_pred).clip(min=0, max=max_points)  # Round predictions and ensure they are within the valid range
    correct_predictions = (y_pred_rounded == y_test).sum()
    accuracy = correct_predictions / len(y_test)
    return accuracy


In [307]:
isotonic_accuracy = evaluate_model_accuracy(isotonic_regression, X_test, y_test)
linear_accuracy = evaluate_model_accuracy(linear_regression, X_test, y_test)
ridge_accuracy = evaluate_model_accuracy(ridge_regression, X_test, y_test)

print("Isotonic Regression Accuracy:", isotonic_accuracy)
print("Linear Regression Accuracy:", linear_accuracy)
print("Ridge Regression Accuracy:", ridge_accuracy)


Isotonic Regression Accuracy: 0.6832829808660624
Linear Regression Accuracy: 0.6792547834843907
Ridge Regression Accuracy: 0.6797583081570997


In [155]:
data[["assigned_points", "cosine_similarity"]]
data.loc[6617]

row_id                                                                     6617
question                                          ['explain', 'your', 'reason']
question_id                                                                 130
student_answer                ['remov', 'either', 'a', 'or', 'i', 'will', 't...
reference_answer              ['a', 'and', 'i', 'are', 'in', 'the', 'same', ...
assigned_points                                                               0
max_points                                                                    1
domain                                                                      NaN
dataset_name                                                             beetle
student_answer_embedding      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
reference_answer_embedding    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
cosine_similarity                                                             1
Name: 6617, dtype: object

In [42]:

# import dfs
df = pd.read_parquet("word_embedding/data/stemmed_data/glove/texas.parquet")
# df = pd.read_csv("data/processed/data/raw_data/datasets/beetle.csv")

In [92]:
embeddings["/c/af/aag"]

array([ 0.0226,  0.0745,  0.0331,  0.0765,  0.0921, -0.0657, -0.139 ,
       -0.0472, -0.0178, -0.0736,  0.0704, -0.0176,  0.0032,  0.237 ,
        0.031 , -0.1603,  0.1673, -0.0316,  0.1149, -0.0041,  0.0202,
        0.0915, -0.1503, -0.017 ,  0.0365, -0.0741, -0.0301, -0.0741,
        0.0945,  0.0716, -0.0032, -0.1123, -0.0131, -0.051 ,  0.0027,
        0.1037,  0.0082,  0.0777,  0.0051, -0.0321,  0.0438, -0.0461,
       -0.1083, -0.0712, -0.0657, -0.0612,  0.0858,  0.0103, -0.0125,
       -0.0703, -0.0069, -0.0402,  0.0583, -0.0472,  0.0133, -0.0494,
       -0.0308, -0.1488, -0.0525,  0.0102, -0.0505,  0.0354, -0.1183,
        0.0746,  0.0045,  0.0175,  0.0539,  0.0127, -0.0227, -0.0104,
        0.0053, -0.0061,  0.0358,  0.0066,  0.1255,  0.1093,  0.0614,
       -0.0521,  0.0847, -0.0658,  0.0364, -0.0892,  0.0539, -0.0189,
       -0.0092,  0.0876, -0.0075,  0.0147, -0.0616,  0.0876, -0.0151,
        0.0243, -0.004 , -0.0531,  0.0074,  0.0146, -0.0635,  0.0309,
       -0.0058,  0.0

In [91]:
embeddings.index_to_key

['/c/af/1_konings',
 '/c/af/2_konings',
 '/c/af/a.s',
 '/c/af/a_foei_tog',
 '/c/af/a_ja_a',
 '/c/af/a_nee_a',
 '/c/af/aag',
 '/c/af/aai',
 '/c/af/aak',
 '/c/af/aaklig',
 '/c/af/aakligheid',
 '/c/af/aal',
 '/c/af/aalmoes',
 '/c/af/aalmoesenier',
 '/c/af/aalmoeseniershuis',
 '/c/af/aalmoesgewer',
 '/c/af/aaltjie',
 '/c/af/aalwee',
 '/c/af/aalwurm',
 '/c/af/aalwyn',
 '/c/af/aambeeld',
 '/c/af/aambeeldvoël',
 '/c/af/aambei',
 '/c/af/aamborstig',
 '/c/af/aamborstigheid',
 '/c/af/aan',
 '/c/af/aanbehoort',
 '/c/af/aanbel',
 '/c/af/aanbestee',
 '/c/af/aanbetref',
 '/c/af/aanbetrou',
 '/c/af/aanbeveel',
 '/c/af/aanbeveelbaar',
 '/c/af/aanbevelenswaardig',
 '/c/af/aanbeveling',
 '/c/af/aanbid',
 '/c/af/aanbidbaar',
 '/c/af/aanbidbaarheid',
 '/c/af/aanbiddelik',
 '/c/af/aanbiddelikheid',
 '/c/af/aanbied',
 '/c/af/aanbieder',
 '/c/af/aanbieding',
 '/c/af/aanbind',
 '/c/af/aanblik',
 '/c/af/aanbly',
 '/c/af/aanbod',
 '/c/af/aanboor',
 '/c/af/aanbots',
 '/c/af/aanbou',
 '/c/af/aanbreek',
 '/c/af/aa

In [134]:
data.head(1)

Unnamed: 0,row_id,question,question_id,student_answer,reference_answer,assigned_points,max_points,domain,dataset_name,student_answer_embedding,reference_answer_embedding,cosine_similarity
0,0,What role does the path play in determining wh...,0,if that switch is with the path between that b...,If a bulb and a switch are in the same path th...,1,1,,beetle,"[0.08319999650120735, -0.005600005388259888, -...","[0.09849999845027924, 0.13850000500679016, 0.0...",0.023739


In [267]:
def create_answer_embeddings_test(answer_text):
    words = literal_eval(answer_text)
    answer_vector = np.zeros(embeddings.vector_size)
    for word in words:
        try:
            answer_vector += embeddings[f"/c/en/{word}"]
        except:
            answer_vector += np.zeros((embeddings.vector_size,), dtype=np.float32)
    return answer_vector


In [288]:
row_id = 8
st = create_answer_embeddings_test(data.loc[row_id, "student_answer"])
refa = create_answer_embeddings_test(data.loc[row_id, "reference_answer"])
points = data.loc[0, "assigned_points"]
max_points = data.loc[0, "max_points"]

print(1 - cosine(st, refa), "assigned points:", points, "max points:", max_points)


0.9460464900904633 assigned points: 1 max points: 1


In [280]:
# embeddings[f"/c/en/bulb"]

In [252]:
literal_eval(data.loc[row_id, "student_answer"])


['the', 'path', 'can', 'not', 'be', 'open']

In [220]:
# data.loc[0, "reference_answer"]


In [19]:
my_string = df.loc[0, "student_answer"]
word_list = my_string.split()
print(word_list)


['if', 'that', 'switch', 'is', 'with', 'the', 'path', 'between', 'that', 'bulb', 'and', 'the', 'battery']


In [23]:
# import model
# model = KeyedVectors.load("word_embedding/models/saved_models/conceptnet.bin")
model = gensim_api.load("conceptnet-numberbatch-17-06-300")


In [33]:
model.index_to_key

['/c/af/1_konings',
 '/c/af/2_konings',
 '/c/af/a.s',
 '/c/af/a_foei_tog',
 '/c/af/a_ja_a',
 '/c/af/a_nee_a',
 '/c/af/aag',
 '/c/af/aai',
 '/c/af/aak',
 '/c/af/aaklig',
 '/c/af/aakligheid',
 '/c/af/aal',
 '/c/af/aalmoes',
 '/c/af/aalmoesenier',
 '/c/af/aalmoeseniershuis',
 '/c/af/aalmoesgewer',
 '/c/af/aaltjie',
 '/c/af/aalwee',
 '/c/af/aalwurm',
 '/c/af/aalwyn',
 '/c/af/aambeeld',
 '/c/af/aambeeldvoël',
 '/c/af/aambei',
 '/c/af/aamborstig',
 '/c/af/aamborstigheid',
 '/c/af/aan',
 '/c/af/aanbehoort',
 '/c/af/aanbel',
 '/c/af/aanbestee',
 '/c/af/aanbetref',
 '/c/af/aanbetrou',
 '/c/af/aanbeveel',
 '/c/af/aanbeveelbaar',
 '/c/af/aanbevelenswaardig',
 '/c/af/aanbeveling',
 '/c/af/aanbid',
 '/c/af/aanbidbaar',
 '/c/af/aanbidbaarheid',
 '/c/af/aanbiddelik',
 '/c/af/aanbiddelikheid',
 '/c/af/aanbied',
 '/c/af/aanbieder',
 '/c/af/aanbieding',
 '/c/af/aanbind',
 '/c/af/aanblik',
 '/c/af/aanbly',
 '/c/af/aanbod',
 '/c/af/aanboor',
 '/c/af/aanbots',
 '/c/af/aanbou',
 '/c/af/aanbreek',
 '/c/af/aa

In [28]:
word_list

['if',
 'that',
 'switch',
 'is',
 'with',
 'the',
 'path',
 'between',
 'that',
 'bulb',
 'and',
 'the',
 'battery']

In [37]:
model.get_vector(f"/c/af/{word_list[-1]}")


array([-0.0363, -0.1358, -0.0527,  0.0226, -0.1354, -0.0417, -0.1199,
       -0.0663,  0.0333, -0.0071, -0.1109, -0.0812,  0.0902,  0.1449,
        0.0822,  0.0695,  0.0428, -0.1033, -0.0348, -0.0261, -0.0357,
        0.0167,  0.0755,  0.0849,  0.0576,  0.097 ,  0.0927, -0.0047,
        0.1087,  0.0689,  0.0276, -0.0383,  0.1006, -0.0454, -0.1038,
        0.0451,  0.0385, -0.012 , -0.0018,  0.0457,  0.0059, -0.0763,
        0.0521, -0.0638, -0.0055,  0.0523, -0.001 ,  0.1151,  0.1275,
       -0.0042, -0.1334, -0.0396,  0.0987,  0.0104, -0.0799,  0.0468,
       -0.0086,  0.0639, -0.0344,  0.0105, -0.037 , -0.0048,  0.0825,
        0.0618,  0.0059, -0.0591,  0.0457, -0.0053,  0.0383,  0.0211,
        0.0271, -0.125 , -0.022 , -0.0462, -0.0887, -0.0643,  0.0019,
       -0.0164,  0.0467,  0.035 , -0.0168,  0.0141,  0.032 , -0.0069,
        0.0392,  0.0301,  0.0567,  0.0112, -0.0016, -0.037 , -0.0316,
       -0.0751,  0.0417, -0.0253, -0.0689, -0.0895,  0.0417, -0.06  ,
        0.044 , -0.0