In [1]:
#installing required libraries
!pip install openpyxl

#importing required libraries

import numpy as np 
import pandas as pd 

In [2]:
# Read sheet "Q7" of the Tutorial 1 workbook, keeping only the four columns
# selected by position in `usecols`.
# Other workbooks this notebook has been pointed at (kept for reference):
# df = pd.read_excel("../input/auto-eval/Re-evaluated Automatic Research Stuff.xlsx", "Endsem G1 Q7", usecols=[0,2,5,6,7], engine="openpyxl")
# df = pd.read_excel("../input/auto-eval/Re-evaluated Automatic Research Stuff.xlsx", "Q11", usecols=[0,3,5,6,7], engine="openpyxl")
df_tutorial1 = pd.read_excel(
    "../input/auto-eval/Tutorial 1_corrected(1-1174) - Re Eval.xlsx",
    sheet_name="Q7",
    usecols=[0, 3, 5, 7],
    engine="openpyxl",
)

# Give the selected columns the fixed labels that every later cell refers to.
# NOTE(review): the explanation label says "question 11" although the sheet read
# here is "Q7"; it is left untouched because later cells use it as a lookup key.
# df.columns =['username', 'Explanation for question 11 here', 'Rationale(1.5)', 'Assigned Points - Anjali', 'Assigned Points - Sanskriti']
df_tutorial1.columns = [
    'username',
    'Explanation for question 11 here',
    'Rationale(1.5)',
    'Assigned Points - Ranjani',
]

# Pre Processing

In [3]:
# Show column names, non-null counts and dtypes of the sheet as loaded, before any cleaning.
df_tutorial1.info()

In [4]:
explanation_col = "Explanation for question 11 here"

# Remove rows where no explanation is given: a 0 in the explanation column is
# turned into NaN so that the dropna() below discards the row. The result is
# assigned back to the column instead of using a chained `inplace=True` call,
# which is not guaranteed to reach the parent frame.
df_tutorial1[explanation_col] = df_tutorial1[explanation_col].replace(0, np.nan)
# df["Rationale(1.5)"].replace(-1.5, 0, inplace = True)
# df["Rationale(1.5)"].replace(-0.5, 1, inplace = True)

# dropna() drops a row when ANY of the selected columns is missing.
answered_df_t1 = df_tutorial1.dropna()
# TODO: could add a punctuation-stripped column to delete answers that only contain symbols

# Remove word-for-word matches (compared case-insensitively) - obviously copied.
# keep=False discards every member of a duplicated group, not just the repeats.
lowered_explanations = answered_df_t1[explanation_col].astype(str).str.lower()
is_copied = lowered_explanations.duplicated(keep=False)

# Build the cleaned frame as a new object with a fresh 0..n-1 index rather than
# mutating the dropna() result in place.
cleaned_df_t1 = answered_df_t1[~is_copied].reset_index(drop=True)
cleaned_df_t1.info()

In [5]:
# Non-null counts of every other column, grouped by each distinct 'Rationale(1.5)' value in the cleaned data.
cleaned_df_t1.groupby('Rationale(1.5)').count()

# Text Vectorization - TF IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every cleaned explanation into a TF-IDF vector; values are cast to str
# first so that non-string cells can still be tokenised.
answer_texts = cleaned_df_t1["Explanation for question 11 here"].astype(str)

vectorizer = TfidfVectorizer()
text_vec = vectorizer.fit_transform(answer_texts)

# (number of answers, vocabulary size)
print(text_vec.shape)

# Similarity Analysis

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of the TF-IDF vectors: row i holds the similarity of
# answer i to every answer (itself included). With the second argument omitted
# the vectors are compared against themselves.
similarities = cosine_similarity(text_vec)

# Both statistics are taken over the whole matrix, so each answer's similarity
# to itself is part of them.
print("Mean:", similarities.mean())
print("Standard Deviation", similarities.std())

# Visualisations

In [9]:
from sklearn.decomposition import PCA  # no longer used in this cell; import kept in case other cells rely on it
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

# Reduce the multi-dimensional TF-IDF vectors to 2 dimensions so they can be
# plotted on an x-y plane. TruncatedSVD takes the sparse matrix directly. The
# former PCA fit on the same matrix was dropped: its output was overwritten by
# the SVD projection straight away, and older scikit-learn releases reject
# sparse input to PCA.
svd = TruncatedSVD(n_components=2, random_state=42)
reduced_features = svd.fit_transform(text_vec)

# Visualise the answers colour coded by assigned score. The integer codes only
# select a colour; an assigned score missing from this table raises KeyError.
color_dict = {-0.5:-2, -1.5:-1,  0.0:0, 0.5:1, 1.0:2, 1.5:3, 2.0:4,2.5:5,3.0:6}
assigned_points = cleaned_df_t1['Assigned Points - Ranjani']
point_colors = [color_dict[score] for score in assigned_points]

fig, ax = plt.subplots()
scatter = ax.scatter(reduced_features[:,0], reduced_features[:,1], c=point_colors)

# legend_elements() returns one handle per distinct colour code in ascending
# order, so the labels are derived from the codes actually present instead of a
# fixed list that can fall out of step with the data.
score_of_code = {code: score for score, code in color_dict.items()}
classes = [f"{score_of_code[code]:g}" for code in sorted(set(point_colors))]

ax.legend(handles=scatter.legend_elements()[0], labels=classes)
ax.set_title("Q7 Tutorial 1 - Assigned Points Ranjani")
plt.show()

# Clustering

In [10]:
def cluster(cluster_type, text_vec, minsamples=1, n_clusters=4, epsilon=0.25):
    """Cluster the vectorised answers with the requested algorithm.

    Parameters
    ----------
    cluster_type : str
        One of "DBSCAN", "OPTICS" or "K Means".
    text_vec : array-like or sparse matrix
        Feature matrix with one row per answer.
    minsamples : int, default 1
        ``min_samples`` passed to DBSCAN (not used by the other algorithms).
    n_clusters : int, default 4
        Number of clusters for K Means (not used by the other algorithms).
    epsilon : float, default 0.25
        ``eps`` passed to DBSCAN (not used by the other algorithms).

    Returns
    -------
    The cluster label of every row as produced by the estimator, or an empty
    list when ``cluster_type`` is not one of the supported names.
    """
    # The estimators are imported where they are needed: the module-level
    # imports of this cell were commented out, which left all three names
    # undefined.
    if cluster_type == "DBSCAN":
        from sklearn.cluster import DBSCAN
        return DBSCAN(eps=epsilon, min_samples=minsamples).fit_predict(text_vec)

    if cluster_type == "OPTICS":
        from sklearn.cluster import OPTICS
        # min_samples is deliberately fixed at 7 here, independent of `minsamples`.
        return OPTICS(min_samples=7).fit_predict(text_vec)

    if cluster_type == "K Means":
        from sklearn.cluster import KMeans
        # Honour the n_clusters argument (its default matches the former hard-coded 4).
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(text_vec)
        return kmeans.labels_

    # Unknown algorithm name: keep the historical "no predictions" result.
    return []


titles = ["DBSCAN", "OPTICS", "K Means"]

# Run every algorithm, store its labels in a 'labels_<name>' column of the
# cleaned frame, and plot its clusters.
# NOTE: `title` keeps its last value ("K Means") after the loop; the summary and
# scoring cells further down read it to pick the label column.
for title in titles:
    preds = cluster(title, text_vec)
    cleaned_df_t1['labels_'+title] = preds

    # Plot the predictions - colour coded by cluster number. The plot is drawn
    # inside the loop so every algorithm gets its own figure, and the legend
    # entries come from the scatter itself so they follow however many
    # clusters the algorithm produced.
    fig, ax = plt.subplots()
    scatter = ax.scatter(reduced_features[:,0], reduced_features[:,1], c=preds)
    ax.set_title(title + " Clusters")
    ax.legend(*scatter.legend_elements(), title="Cluster")
    plt.show()

# Summary Generation - Generate a summary of all answers in a cluster

In [None]:
!pip install bert-extractive-summarizer
!pip install neuralcoref
!pip install spacy

In [None]:
from summarizer import Summarizer

# The summarizer has to exist before it is called below.
model = Summarizer()

# `title` is the loop variable left over from the clustering cell, so the
# summaries are built for the last algorithm that was run.
label_col = 'labels_' + title

# Cluster label -> summary slot. Label 3 and the label -1 share the last slot
# (written out under "SUMMARY 3/-1"); any other label is left out.
slot_of_label = {0: 0, 1: 1, 2: 2, 3: 3, -1: 3}

# Concatenate the answers of each slot into one space-separated body of text.
bodies = ['', '', '', '']
for label, answer in zip(cleaned_df_t1[label_col],
                         cleaned_df_t1['Explanation for question 11 here']):
    slot = slot_of_label.get(label)
    if slot is not None:
        bodies[slot] += " " + str(answer)
body1, body2, body3, body4 = bodies

# Summarise each body in 5 sentences; a slot without any answers gets an empty
# summary without calling the model.
summaries = [model(body, num_sentences=5) if body.strip() else "" for body in bodies]
result1, result2, result3, result4 = summaries

# Write the four summaries to a text file; the context manager closes the file
# even if a write fails.
headings = ["SUMMARY 0", "SUMMARY 1", "SUMMARY 2", "SUMMARY 3/-1"]
with open("temp_summaries_"+title+".txt", "w", encoding="utf-8") as file:
    for heading, summary in zip(headings, summaries):
        file.write(heading + "\n")
        file.write(summary + "\n\n")

# Summary Analysis - Kappa Score

In [None]:
#assigning a score to each Summary
# Score given to every answer of a cluster: cluster 0 -> 0.5, cluster 1 -> 1.5,
# cluster 2 -> 1; any other label gets 0.
score_of_cluster = {0: 0.5, 1: 1.5, 2: 1}

# `title` is the loop variable left over from the clustering cell.
assigned_score = [score_of_cluster.get(label, 0) for label in cleaned_df_t1['labels_'+title]]

cleaned_df_t1[title+'_assigned_score'] = assigned_score

In [None]:
#find kappa score to assess the agreement between the two score columns below
from sklearn.metrics import cohen_kappa_score

# Scores are compared as strings so that half-point values are treated as
# discrete labels.
# NOTE(review): both columns come from the spreadsheet; the cluster-derived
# '<title>_assigned_score' column is not part of this comparison - confirm that
# this is the intended pair.
cohen_kappa_score(
    cleaned_df_t1['Assigned Points - Ranjani'].astype("str"),
    cleaned_df_t1['Rationale(1.5)'].astype("str"),
)