In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

from keras.utils import to_categorical
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dense, Dropout
from keras.models import Sequential

from sklearn.decomposition import PCA

from sklearn.neural_network import MLPRegressor

import numpy as np

import xgboost as xgb

In [2]:
url = "https://raw.githubusercontent.com/Tejanikhil/Shared-ASAG/main"
df = pd.read_csv(url + "/SemEvalData.csv")

In [3]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords

# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
def extract_pos(text):
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    pos_words = [word for word, pos in pos_tags if pos.startswith('VB') or pos.startswith('JJ') or pos.startswith('NN')]
    return ' '.join(pos_words)

def process_dataset(text):
    lower_cased = text.lower()
    tokens = nltk.word_tokenize(lower_cased)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    preprocessed = ' '.join(filtered_tokens)
    return preprocessed

In [5]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(df.loc[:,["Actual Answer","Student Answer"]], df["Grade"], test_size=0.1, random_state=42)

In [6]:
TRAIN = train_X
TRAIN["LABELS"] = train_y
TEST = test_X
TEST["LABELS"] = test_y

In [7]:
train_X["Actual Answer"] = train_X["Actual Answer"].apply(extract_pos).apply(process_dataset)
train_X["Student Answer"] = train_X["Student Answer"].apply(extract_pos).apply(process_dataset)

test_X["Actual Answer"] = test_X["Actual Answer"].apply(extract_pos).apply(process_dataset)
test_X["Student Answer"] = test_X["Student Answer"].apply(extract_pos).apply(process_dataset)

In [8]:
print(TRAIN.shape)
TRAIN = TRAIN.replace("", pd.NA).dropna()
print(TRAIN.shape)
TEST = TEST.replace("", pd.NA).dropna()

(2197, 3)
(2184, 3)


In [9]:
train_X = TRAIN.drop("LABELS", axis = 1)
train_y = TRAIN.iloc[:,-1]
test_X = TEST.drop("LABELS", axis = 1)
test_y = TEST.iloc[:,-1]

In [10]:
sentences = list(train_X["Actual Answer"]) + list(train_X["Student Answer"])
sentences = [i.split() for i in sentences]

In [11]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [12]:
train_Embeddings_df = pd.DataFrame({"Student_Answer_Embeddings":train_X["Student Answer"], "Reference_Answer_Embeddings":train_X["Actual Answer"]})
test_Embeddings_df = pd.DataFrame({"Student_Answer_Embeddings":test_X["Student Answer"], "Reference_Answer_Embeddings":test_X["Actual Answer"]})

In [None]:
vocab = []
for words_list in sentences:
  vocab.extend(words_list)

In [13]:
model = Word2Vec(sentences, min_count=1)

In [14]:
def get_wordvec_embeddings(data, default_embedding=np.zeros(100)):

  words = data.split()
  embedding = []
  for word in words:
      if word in vocab:
          embedding.append(model.wv[word])
      else:
          embedding.append(default_embedding)

  return np.mean(np.array(embedding),axis = 0)

In [15]:
train_Embeddings_df["Student_Answer_Embeddings"] = train_Embeddings_df["Student_Answer_Embeddings"].apply(get_wordvec_embeddings)
train_Embeddings_df["Reference_Answer_Embeddings"] = train_Embeddings_df["Reference_Answer_Embeddings"].apply(get_wordvec_embeddings)

test_Embeddings_df["Student_Answer_Embeddings"] = test_Embeddings_df["Student_Answer_Embeddings"].apply(get_wordvec_embeddings)
test_Embeddings_df["Reference_Answer_Embeddings"] = test_Embeddings_df["Reference_Answer_Embeddings"].apply(get_wordvec_embeddings)

In [16]:
train_X["Subtracted_Embeddings"] = train_Embeddings_df.apply(lambda x:x["Student_Answer_Embeddings"]-x["Reference_Answer_Embeddings"], axis = 1)
test_X["Subtracted_Embeddings"] = test_Embeddings_df.apply(lambda x:x["Student_Answer_Embeddings"]-x["Reference_Answer_Embeddings"], axis = 1)

In [17]:
train_df = pd.DataFrame(list(train_X["Subtracted_Embeddings"]))
test_df = pd.DataFrame(list(test_X["Subtracted_Embeddings"]))

In [18]:
pca = PCA(n_components=100)  # Specify the number of components you want to keep
train_df_pca = pca.fit_transform(train_df)

In [19]:
test_df_pca = pca.transform(test_df)

In [20]:
ncols = train_df_pca.shape[1]

In [21]:
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(ncols,1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(train_df_pca, train_y, epochs=10, batch_size=32, verbose=1)

y_true = test_y
y_preds = model.predict(test_df_pca)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
cnn_mae = mean_absolute_error(y_true, np.round(y_preds, 1))
cnn_mse = mean_squared_error(y_true, np.round(y_preds, 1))
cnn_rmse = mean_squared_error(y_true, np.round(y_preds, 1), squared=False)
cnn_mape = mean_absolute_percentage_error(y_true, y_preds, multioutput = "uniform_average")
cnn_r2 = r2_score(y_true, y_preds)

print("Mean Absolute Error (MAE): ", np.round(cnn_mae, 2))
print("Mean Squared Error (MSE): ", np.round(cnn_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(cnn_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(cnn_mape, 2))
print("R2 Score: ", np.round(cnn_r2, 2))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Absolute Error (MAE):  1.21
Mean Squared Error (MSE):  1.96
Root Mean Squared Error (RMSE):  1.4
Mean Absolute Percentage Error (MAPE):  540054709899849.6
R2 Score:  0.01


In [22]:
mlp = MLPRegressor(hidden_layer_sizes=(100,50,25,1), activation='identity', solver="")

# Train the MLP regressor
mlp.fit(train_df_pca, train_y)

y_preds = mlp.predict(test_df_pca)

y_true = test_y

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
mlp_mae = mean_absolute_error(y_true, np.round(y_preds, 1))
mlp_mse = mean_squared_error(y_true, np.round(y_preds, 1))
mlp_rmse = mean_squared_error(y_true, np.round(y_preds, 1), squared=False)
mlp_mape = mean_absolute_percentage_error(y_true, y_preds, multioutput = "uniform_average")
mlp_r2 = r2_score(y_true, y_preds)

print("Mean Absolute Error (MAE): ", np.round(mlp_mae, 2))
print("Mean Squared Error (MSE): ", np.round(mlp_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(mlp_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(mlp_mape, 2))
print("R2 Score: ", np.round(mlp_r2, 2))

Mean Absolute Error (MAE):  1.09
Mean Squared Error (MSE):  1.76
Root Mean Squared Error (RMSE):  1.33
Mean Absolute Percentage Error (MAPE):  479137527743748.94
R2 Score:  0.11


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [23]:
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(train_df_pca, train_y)

preds = LR_model.predict(test_df_pca)
y_preds = [round(y_val, 1) for y_val in preds]
actuals = test_y
y_true = [round(y_val, 1) for y_val in actuals]

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
LR_mae = mean_absolute_error(y_true, np.round(y_preds, 1))
LR_mse = mean_squared_error(y_true, np.round(y_preds, 1))
LR_rmse = mean_squared_error(y_true, np.round(y_preds, 1), squared=False)
LR_mape = mean_absolute_percentage_error(y_true, y_preds, multioutput = "uniform_average")
LR_r2 = r2_score(y_true, y_preds)

print("Mean Absolute Error (MAE): ", np.round(LR_mae, 2))
print("Mean Squared Error (MSE): ", np.round(LR_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(LR_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(LR_mape, 2))
print("R2 Score: ", np.round(LR_r2, 2))

Mean Absolute Error (MAE):  1.09
Mean Squared Error (MSE):  1.77
Root Mean Squared Error (RMSE):  1.33
Mean Absolute Percentage Error (MAPE):  468741998197647.5
R2 Score:  0.12


In [24]:
from sklearn import neighbors
rmse_val = []
mae_val = []
mse_val = []
r2_val = []
for K in range(2,20):
    knn_model = neighbors.KNeighborsRegressor(n_neighbors = K)
    knn_model.fit(train_df_pca, train_y)  #fit the model
    pred=knn_model.predict(test_df_pca) #make prediction on test set
    
    mae = mean_absolute_error(y_true, np.round(pred, 1))
    mae_val.append(mae)
    
    mse = mean_squared_error(y_true, np.round(pred, 1))
    mse_val.append(mse)
    
    rmse = mean_squared_error(y_true, np.round(pred, 1), squared=False)
    rmse_val.append(rmse)
    
    r2 = r2_score(y_true, pred)
    r2_val.append(r2)

y_true = test_y
knn_preds = knn_model.predict(test_df_pca)

min_val = min(r2_val)
index = r2_val.index(min_val)
knn_r2 = min_val
knn_rmse = rmse_val[index]
knn_mae = mae_val[index]
knn_mse = mse_val[index]

print("Mean Absolute Error (MAE): ", np.round(knn_mae, 2))
print("Mean Squared Error (MSE): ", np.round(knn_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(knn_rmse, 2))
print("R2 Score: ", np.round(knn_r2, 2))

Mean Absolute Error (MAE):  1.21
Mean Squared Error (MSE):  2.77
Root Mean Squared Error (RMSE):  1.66
R2 Score:  -0.37


In [25]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
ada_reg = AdaBoostRegressor(n_estimators = 100, learning_rate=0.5)

ada_reg.fit(train_df, train_y)

ada_reg_pred = ada_reg.predict(pd.DataFrame(test_df))
ada_reg_mae = mean_absolute_error(test_y, np.round(ada_reg_pred, 1))
ada_reg_mse = mean_squared_error(test_y, np.round(ada_reg_pred, 1))
ada_reg_mape = mean_absolute_percentage_error(test_y, np.round(ada_reg_pred,1), multioutput = "uniform_average")
ada_reg_rmse = mean_squared_error(test_y, np.round(ada_reg_pred, 1), squared=False)
ada_reg_r2 = r2_score(test_y, ada_reg_pred)

print("Mean Absolute Error (MAE): ", np.round(ada_reg_mae, 2))
print("Mean Squared Error (MSE): ", np.round(ada_reg_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(ada_reg_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(ada_reg_mape, 2))
print("R2 Score: ", np.round(ada_reg_r2, 2))

Mean Absolute Error (MAE):  1.35
Mean Squared Error (MSE):  2.3
Root Mean Squared Error (RMSE):  1.52
Mean Absolute Percentage Error (MAPE):  485285837398290.5
R2 Score:  -0.17


In [26]:
gb_reg = GradientBoostingRegressor(n_estimators = 50, max_depth=5)

gb_reg.fit(train_df_pca, train_y)

gb_reg_pred = gb_reg.predict(pd.DataFrame(test_df_pca))
gb_reg_mae = mean_absolute_error(test_y, np.round(gb_reg_pred, 1))
gb_reg_mse = mean_squared_error(test_y, np.round(gb_reg_pred, 1))
gb_reg_mape = mean_absolute_percentage_error(test_y, np.round(gb_reg_pred,1), multioutput = "uniform_average")
gb_reg_rmse = mean_squared_error(test_y, np.round(gb_reg_pred, 1), squared=False)
gb_reg_r2 = r2_score(test_y, gb_reg_pred)

print("Mean Absolute Error (MAE): ", np.round(gb_reg_mae, 2))
print("Mean Squared Error (MSE): ", np.round(gb_reg_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(gb_reg_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(gb_reg_mape, 2))
print("R2 Score: ", np.round(gb_reg_r2, 2))

Mean Absolute Error (MAE):  1.08
Mean Squared Error (MSE):  1.76
Root Mean Squared Error (RMSE):  1.33
Mean Absolute Percentage Error (MAPE):  457712778455205.8
R2 Score:  0.11


In [27]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', seed = 123)

xg_reg.fit(train_df_pca, train_y)

# print("Best parameters: ", xg_boost_grid_search.best_params_)
# print("Best score: ", xg_boost_grid_search.best_score_)
xg_reg_pred = xg_reg.predict(pd.DataFrame(test_df_pca))
xg_reg_mae = mean_absolute_error(test_y, np.round(xg_reg_pred, 1))
xg_reg_mse = mean_squared_error(test_y, np.round(xg_reg_pred, 1))
xg_reg_mape = mean_absolute_percentage_error(test_y, np.round(xg_reg_pred,1), multioutput = "uniform_average")
xg_reg_rmse = mean_squared_error(test_y, np.round(xg_reg_pred, 1), squared=False)
xg_reg_r2 = r2_score(test_y, xg_reg_pred)

print("Mean Absolute Error (MAE): ", np.round(xg_reg_mae, 2))
print("Mean Squared Error (MSE): ", np.round(xg_reg_mse, 2))
print("Root Mean Squared Error (RMSE): ", np.round(xg_reg_rmse, 2))
print("Mean Absolute Percentage Error (MAPE): ", np.round(xg_reg_mape, 2))
print("R2 Score: ", np.round(xg_reg_r2, 2))

Mean Absolute Error (MAE):  1.11
Mean Squared Error (MSE):  2.11
Root Mean Squared Error (RMSE):  1.45
Mean Absolute Percentage Error (MAPE):  437492532162443.25
R2 Score:  -0.06
