In [39]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

The dataset used in this project is a modified version of the dataset created by Paul Clough and Mark Stevenson. The data generation process is described in full in their [research article](https://link.springer.com/article/10.1007/s10579-009-9112-1) (Clough, P., Stevenson, M. Developing a corpus of plagiarised short answers, 2011).

### Description of the dataset

<li> The dataset contains several txt files whose characteristics are summarized in the file_information.csv file. </li>
<li> The dataset has 100 files, of which 5 are the original texts. The participants therefore contributed 95 answers, spread across 5 tasks and 4 answer categories (cut, light, heavy, non). </li>
<li> The <b>File</b> column in the file_information.csv file contains the name of the txt file. </li>
<li> The <b>Task</b> column indicates which of the five learning tasks (A-E) each txt file answers. </li>
<li> The <b>Category</b> column indicates if the participant was asked to use a Near copy (cut), Light revision (light), Heavy revision (heavy) or Non-plagiarised (non) method to answer the question. This column also contains the 'orig' category, which marks the original texts on which participants based their answers.</li>

In [40]:
#location of the corpus - set the CTDS_DATA_DIR environment variable to point at
#your own copy of the data; if it is unset, the original local folder is used
data_dir = Path(os.environ.get("CTDS_DATA_DIR", r"C:/Users/snehi/Documents/CTDS/data"))

#load metadata (File, Task and Category for each txt file)
summary_data = pd.read_csv(data_dir / "file_information.csv")
summary_data.head()

Unnamed: 0,File,Task,Category
0,g0pA_taska.txt,a,non
1,g0pA_taskb.txt,b,cut
2,g0pA_taskc.txt,c,light
3,g0pA_taskd.txt,d,heavy
4,g0pA_taske.txt,e,non


In [41]:
#numeric ids for the 5 categories - handy later when evaluating how well the
#similarity or clustering methods separate plagiarised from non-plagiarised answers
category_to_id = dict(non=0, heavy=1, light=2, cut=3, orig=-1)

#look up each row's Category in the table above
summary_data["category_id"] = summary_data["Category"].map(category_to_id)

#binary plagiarism label: 1 = plagiarised (cut / light / heavy), 0 = non, -1 = original
def to_plag_label(cat):
    """Collapse a Category value into a binary plagiarism label.

    Parameters
    ----------
    cat : str
        One of 'non', 'orig', 'cut', 'light' or 'heavy'.

    Returns
    -------
    int
        0 for 'non', -1 for 'orig', 1 for 'cut', 'light' and 'heavy'.

    Raises
    ------
    ValueError
        If ``cat`` is not one of the five known categories. Without this check
        any unrecognised value (typo, different casing, missing value) would be
        silently labelled as plagiarised.
    """
    if cat == 'non':
        return 0
    if cat == 'orig':
        return -1
    #only the three revision levels count as plagiarised
    if cat in ('cut', 'light', 'heavy'):
        return 1
    raise ValueError(f"Unknown category: {cat!r}")

#derive the binary label for every file from its Category value
summary_data["plag_label"] = summary_data["Category"].map(to_plag_label)

summary_data.head()


Unnamed: 0,File,Task,Category,category_id,plag_label
0,g0pA_taska.txt,a,non,0,0
1,g0pA_taskb.txt,b,cut,3,1
2,g0pA_taskc.txt,c,light,2,1
3,g0pA_taskd.txt,d,heavy,1,1
4,g0pA_taske.txt,e,non,0,0


In [42]:
#preprocessing functions

#load raw text and create a cleaned version
def read_file_text(path):
    """Return the whole content of the text file at ``path`` as one string.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``). The text is returned exactly as read, with no
    cleaning, so it can be used as raw input for sentence embeddings.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

def clean_text(text):
    """Normalise text for tfidf and hashing.

    Lowercases the text, replaces every punctuation character (including
    underscores) with a space, collapses runs of whitespace into a single
    space and strips leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    #\w treats "_" as a word character, so [^\w\s] alone would leave underscores
    #in place; list "_" explicitly so it is removed like other punctuation
    text = re.sub(r"[^\w\s]|_", " ", text)   #remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  #normalising whitespace
    return text

#raw text (punctuation intact) - input for sentence embeddings
summary_data["raw_text"] = summary_data["File"].map(
    lambda file_name: read_file_text(data_dir / file_name)
)

#cleaned text - input for tfidf and hashing
summary_data["clean_text"] = summary_data["raw_text"].map(clean_text)

#write the enriched table to csv in the data folder
summary_data.to_csv(data_dir / "preprocessed_dataset.csv", index=False)

summary_data.head()


Unnamed: 0,File,Task,Category,category_id,plag_label,raw_text,clean_text
0,g0pA_taska.txt,a,non,0,0,Inheritance is a basic concept of Object-Orien...,inheritance is a basic concept of object orien...
1,g0pA_taskb.txt,b,cut,3,1,PageRank is a link analysis algorithm used by ...,pagerank is a link analysis algorithm used by ...
2,g0pA_taskc.txt,c,light,2,1,"The vector space model (also called, term vect...",the vector space model also called term vector...
3,g0pA_taskd.txt,d,heavy,1,1,Bayes’ theorem was names after Rev Thomas Baye...,bayes theorem was names after rev thomas bayes...
4,g0pA_taske.txt,e,non,0,0,Dynamic Programming is an algorithm design tec...,dynamic programming is an algorithm design tec...
