In [13]:
import csv
import nltk
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Make sure the NLTK 'punkt' package is available locally; the call is a no-op
# when the package is already up to date.
# NOTE(review): presumably required by word_tokenize, used later in this notebook - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/osama/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Compiled regular expressions shared by the cleaning helpers.
# All of them use the default flags, so matching is case-sensitive and
# '.' does not cross a '\n'.

# Verdict acronyms, each matched as a whole lowercase word.
yta_regex = r'\byta\b'
esh_regex = r'\besh\b'
nta_regex = r'\bnta\b'
nah_regex = r'\bnah\b'

yta_pattern = re.compile(yta_regex)
esh_pattern = re.compile(esh_regex)
nta_pattern = re.compile(nta_regex)
nah_pattern = re.compile(nah_regex)

# "edit:" / "update:" together with everything after it up to the next '\n'.
remove_edit_regex = r'edit:.*|update:.*'
remove_edit_pattern = re.compile(remove_edit_regex)

# A run of CRLF pairs, carriage returns, line feeds, or tabs.
newline_regex = r'(\r\n)+|\r+|\n+|\t+'
newline_pattern = re.compile(newline_regex)

In [4]:
# Helper functions
def removeTextEdit(text):
    """Return `text` with every match of `remove_edit_pattern` deleted.

    The pattern matches the literal "edit:" or "update:" plus whatever
    follows it on the same line, so those spans are dropped and the rest
    of the text is returned unchanged.
    """
    return re.sub(remove_edit_pattern, "", text)

def removeNewlineChars(text):
    """Return `text` with each match of `newline_pattern` replaced by one space.

    A match is a run of CRLF pairs, of carriage returns, of line feeds, or
    of tabs; each run collapses to a single " ".
    """
    single_space = " "
    return re.sub(newline_pattern, single_space, text)

def getCommentVerdict(comment):
    """Return the verdict acronym found in `comment`, or None if there is none.

    Newline/tab runs are first replaced by spaces. The acronyms are then
    looked for as whole words in a fixed priority order - "yta", "nta",
    "esh", "nah" - and the first one present is returned. Matching is
    case-sensitive against the lowercase acronyms.
    """
    flattened = newline_pattern.sub(" ", comment)

    # Order matters: when several acronyms occur, the earliest entry wins.
    ordered_checks = (
        ("yta", yta_pattern),
        ("nta", nta_pattern),
        ("esh", esh_pattern),
        ("nah", nah_pattern),
    )
    for label, pattern in ordered_checks:
        if pattern.search(flattened):
            return label

    return None

In [14]:
# Read file
df = pd.read_csv('/Users/osama/Documents/finalNlu/data/dataPreProcessing/pre_processed_dataset.csv')

# Bookkeeping names from the original cell. Nothing in this cell updates them;
# they are kept only so that any later cell referring to them still finds them.
counter = 0
yta_count = 0
nta_count = 0

text_sizes = []
comment1_sizes = []
comment2_sizes = []

# Minimum number of tokens for a comment and post (loop-invariant, so set once)
min_comment_tokens = 5
min_post_tokens = 10

# Prepare write file. The context manager guarantees the file is flushed and
# closed even if a row raises part-way through the (long) loop.
with open('cleaned_dataset.csv', "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file, quoting=csv.QUOTE_ALL)
    writer.writerow(["id", "title", "text", "verdict", "comment1", "comment2", "score"])

    # Iterate over file; total= lets tqdm show a percentage and an ETA
    for index, row in tqdm(df.iterrows(), total=len(df)):

        # Only the columns that end up in the output (or drive the filters)
        # are read. `post_id` avoids shadowing the builtin `id`.
        # NOTE: the four text fields are lowercased directly, so a non-string
        # value (e.g. NaN from an empty CSV cell) raises AttributeError here.
        post_id = row['id']
        title = row['title'].lower()
        text = row['text'].lower()
        comment1 = row['comment1'].lower()
        comment2 = row['comment2'].lower()
        score = row['score']

        # If not edited the value is "False" else it is a timestamp.
        # str() makes a value that pandas parsed as the boolean False count
        # as "not edited" too, instead of comparing unequal to the string.
        if str(row['edited']) != "False":
            text = removeTextEdit(text)

        comment1 = removeTextEdit(comment1)
        comment2 = removeTextEdit(comment2)

        # Remove newline characters
        title = removeNewlineChars(title)
        text = removeNewlineChars(text)
        comment1 = removeNewlineChars(comment1)
        comment2 = removeNewlineChars(comment2)

        # Derive a verdict from each of the top 2 comments
        verdict1 = getCommentVerdict(comment1)
        verdict2 = getCommentVerdict(comment2)

        # Keep the post only when both comments carry a verdict and they agree
        if verdict1 is None or verdict2 is None or verdict1 != verdict2:
            continue

        # The written verdict is the one shared by the top 2 comments
        verdict = verdict1

        # Skip posts with comments shorter than min_comment_tokens
        comment1_size = len(word_tokenize(comment1))
        comment2_size = len(word_tokenize(comment2))
        if comment1_size < min_comment_tokens or comment2_size < min_comment_tokens:
            continue

        # Skip posts whose text is shorter than min_post_tokens. The text is
        # tokenized only after the comment checks so rejected rows skip the
        # most expensive tokenization.
        text_size = len(word_tokenize(text))
        if text_size < min_post_tokens:
            continue

        writer.writerow([post_id, title, text, verdict, comment1, comment2, score])


329349it [08:39, 633.39it/s]
