In [56]:
import pandas as pd
import re
import spacy
import gensim.downloader as api

# Load spaCy's small English pipeline once; `nlp` is the shared pipeline object
# that preprocess_text() calls for every comment.
# NOTE(review): the visible cells only read token.lemma_, token.is_stop and
# token.is_alpha — consider disabling unused components (e.g. parser/ner) for
# speed, after confirming nothing else in the project relies on them.
nlp = spacy.load("en_core_web_sm")

In [57]:
# Location of the raw annotation export, relative to this notebook.
annotation_csv = "../YouTube_Datasets/annotation.csv"
data = pd.read_csv(annotation_csv)

In [58]:
# Discard the first six columns (selected by position) and keep the rest.
leading_columns = data.columns[[0, 1, 2, 3, 4, 5]]
data = data.drop(columns=leading_columns)

In [59]:
# Remove rows with missing values, then exact duplicate rows, and finally
# renumber the index from 0 so it is contiguous again.
data = (
    data
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)


In [60]:
# Class-balance check: number of rows for each distinct label value.
data['label'].value_counts()

label
0.0    792
1.0     71
Name: count, dtype: int64

In [61]:
# Labels were read as floats (0.0 / 1.0); store them as integers instead.
data = data.astype({'label': int})

### Checking for label inconsistencies in the data overlap (25%)

In [62]:
# Count how many distinct labels each unique comment text received.
labels_per_comment = data.groupby('comment_text')['label'].nunique()

# Boolean flag per comment text: True when the same text carries more than one label.
duplicate_inconsistencies = labels_per_comment.gt(1)

# Collect the flagged texts and pull every row of `data` that uses one of them.
inconsistent_texts = duplicate_inconsistencies.index[duplicate_inconsistencies.to_numpy()]
conflicting_duplicates = data.loc[data['comment_text'].isin(inconsistent_texts)]

print(conflicting_duplicates)

Empty DataFrame
Columns: [comment_text, label]
Index: []


#### Confirmed: there are no label inconsistencies in the annotation overlap

### Discarding the duplicate overlaps that are used for validation

In [63]:
# Drop exact duplicate rows (rows identical in every column).
# NOTE(review): drop_duplicates() was already applied in an earlier cell and the
# only change since is the int cast of `label`, so this looks like a no-op for
# the 0.0/1.0 labels shown above — confirm. The index is not reset here.
data = data.drop_duplicates()

### Function to clean text


In [64]:
def clean_text(text):
    """Normalize a raw comment into lowercase, ASCII-letters-only text.

    Parameters
    ----------
    text : object
        The raw cell value. Anything that is not a ``str`` (None, NaN,
        numbers, lists, arrays, ...) is treated as missing.

    Returns
    -------
    str
        The input with every character other than ``a-z``/``A-Z`` and
        whitespace removed, whitespace runs collapsed to single spaces,
        leading/trailing whitespace stripped, and letters lowercased.
        Returns ``""`` for non-string input.
    """
    # isinstance() alone already rejects None and NaN. Calling pd.isnull()
    # first returned an array for list-like input, and using that array in a
    # boolean `or` raised ValueError instead of reaching this guard.
    if not isinstance(text, str):
        return ""
    # Characters are deleted, not replaced by a space, so "don't" -> "dont".
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text)
    # Collapse any whitespace run (spaces, tabs, newlines) into one space.
    return re.sub(r"\s+", " ", letters_only).strip().lower()

### Tokenization, lemmatization, and stopword removal using spaCy


In [65]:
def preprocess_text(text):
    """Run `text` through the spaCy pipeline and return its filtered lemmas.

    Tokens that spaCy flags as stop words (``is_stop``) or that are not purely
    alphabetic (``is_alpha`` is False) are skipped; the lemmas of the remaining
    tokens are joined with single spaces, in their original order.
    """
    kept_lemmas = []
    for token in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

In [66]:
# Stage 1: regex-based cleaning of the raw comment text.
cleaned_comments = data['comment_text'].apply(clean_text)

# Stage 2: spaCy preprocessing of the cleaned text; both results are appended
# as new columns, in this order.
data = data.assign(
    Cleaned_Content=cleaned_comments,
    Preprocessed_content=cleaned_comments.apply(preprocess_text),
)

In [67]:
# Re-create the label column as `Label` at the end of the frame, then remove
# the original lowercase `label` column.
data = data.assign(Label=data['label']).drop(columns=['label'])

In [68]:
# Write the processed frame to CSV, omitting the index column.
# NOTE(review): any comment whose cleaned/preprocessed text ends up as an empty
# string is written as an empty field, which pandas.read_csv parses as NaN by
# default — confirm that downstream loaders handle this.
data.to_csv("../YouTube_Datasets/preprocessed_youtube.csv", index=False)