This notebook has two parts. First, the data is cleaned using standard procedures: pre-processing, removing URLs, expanding contractions, fixing misspellings, etc. Second, the data is cleaned using custom stopwords (which were identified by manual observation using pyLDAvis). The final version of our research (for LDA) uses both parts of the cleaning pipeline.

# Imports, Installs, and Downloads

In [None]:
!pip install gensim



In [None]:
from gensim.corpora import Dictionary
from gensim.models import LsiModel, Phrases, LdaModel, TfidfModel
from gensim.utils import simple_preprocess
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import spacy
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Load your Data

Method 1: Download it directly from the FYP Folder

In [None]:
# Download Pets_submission.csv from Google Drive by file id (gdown saves it
# into the current working directory), then load it into `df`.
!gdown 1NTlDJB5HUKgQHyTo1aRjn6dUvkK42jD6
df = pd.read_csv("Pets_submission.csv")

Downloading...
From: https://drive.google.com/uc?id=1NTlDJB5HUKgQHyTo1aRjn6dUvkK42jD6
To: /content/Pets_submission.csv
100% 57.5M/57.5M [00:00<00:00, 127MB/s]


Method 2: Mount Drive and Load a CSV File there

In [None]:
# Load a CSV stored under the FYP/Data folder on Google Drive.
# NOTE(review): this path only exists once Drive is mounted at /content/drive;
# no mount call is visible in this notebook — confirm it is done beforehand.
file_path = "train-test-data/test-raw.csv"
df = pd.read_csv(f"/content/drive/MyDrive/FYP/Data/{file_path}")

Method 3: Download and unzip the Reddit-All-18-raw.zip file, then load a particular .csv file from it

In [None]:
# Download the archive of raw subreddit dumps from Google Drive and unpack it
# into the working directory (one <subreddit>_submission.csv per subreddit).
!gdown 1-7PQFV7hnpVbaHIArJExOJ7FfR5Vk2mA
!unzip Reddit-All-18-raw.zip

Downloading...
From: https://drive.google.com/uc?id=1-7PQFV7hnpVbaHIArJExOJ7FfR5Vk2mA
To: /content/Reddit-All-18-raw.zip
100% 441M/441M [00:07<00:00, 59.2MB/s]
Archive:  Reddit-All-18-raw.zip
  inflating: MuslimMarriage_submission.csv  
  inflating: science_submission.csv  
  inflating: sports_submission.csv   
  inflating: hinduism_submission.csv  
  inflating: gaming_submission.csv   
  inflating: technology_submission.csv  
  inflating: Hijabis_submission.csv  
  inflating: food_submission.csv     
  inflating: travel_submission.csv   
  inflating: personalfinance_submission.csv  
  inflating: Christianity_submission.csv  
  inflating: television_submission.csv  
  inflating: atheism_submission.csv  
  inflating: AskReddit_submission.csv  
  inflating: exmuslim_submission.csv  
  inflating: Fitness_submission.csv  
  inflating: islam_submission.csv    
  inflating: Muslim_submission.csv   


In [None]:
# Load one of the extracted subreddit files; change the file name to pick a
# different subreddit. The path is relative because the archive is unzipped
# into the current working directory, which is not guaranteed to be /content.
df = pd.read_csv("Christianity_submission.csv")

# Data Cleaning

In [None]:
# Display the loaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,document,subreddit,class
0,I was just talking to an elderly Italian woman...,food,0
1,A few weeks back someone posted a chocolate ca...,food,0
2,* 1 16 oz package spaghetti noodles or angel h...,food,0
3,"I had some friends over this weekend, and we d...",food,0
4,* Serves six.\n* Prep Time—30 minutes (five wi...,food,0
...,...,...,...
109876,Abu Hurairah (RA) reported Allah's Messenger (...,Muslim,1
109877,My mother celebrates pagan holidays (solstices...,Muslim,1
109878,"ASAK, I'm (18) brought to the limit. All I'm g...",Muslim,1
109879,I came here because I wanted to read the Quran...,Muslim,1


In [None]:
# Drop exact duplicate rows and rows with any missing value, reporting the
# row count before and after.
print("Number of rows before:", len(df))
df = (
    df.drop_duplicates()
      .dropna()
      .reset_index(drop=True)
)
print("Number of rows after dropping duplicates and nulls:", len(df))

Number of rows before: 109881
Number of rows after dropping duplicates and nulls: 109881


In [None]:
# Lower-case English contractions mapped to their expanded form. clean()
# builds a single regex alternation from the keys of this dict and substitutes
# the mapped value (padded with spaces) for every match.
# The "quran" spelling variants are added to this same dict further down, so
# despite its name it ends up holding more than contractions.
contractions = {
    r"ain't": "am not",
    r"aren't": "are not",
    r"can't": "cannot",
    r"could've": "could have",
    r"couldn't": "could not",
    r"didn't": "did not",
    r"doesn't": "does not",
    r"don't": "do not",
    r"hadn't": "had not",
    r"hasn't": "has not",
    r"haven't": "have not",
    r"he'd": "he would",
    r"he'll": "he will",
    r"he's": "he is",
    r"how'd": "how did",
    r"how'll": "how will",
    r"how's": "how is",
    r"i'd": "I would",
    r"i'll": "I will",
    r"i'm": "I am",
    r"i've": "I have",
    r"isn't": "is not",
    r"it'd": "it would",
    r"it'll": "it will",
    r"it's": "it is",
    r"let's": "let us",
    r"mightn't": "might not",
    r"mustn't": "must not",
    r"shan't": "shall not",
    r"she'd": "she would",
    r"she'll": "she will",
    r"she's": "she is",
    r"should've": "should have",
    r"shouldn't": "should not",
    r"that's": "that is",
    r"that'd": "that would",
    r"there's": "there is",
    r"they'd": "they would",
    r"they'll": "they will",
    r"they're": "they are",
    r"they've": "they have",
    r"wasn't": "was not",
    r"we'd": "we would",
    r"we'll": "we will",
    r"we're": "we are",
    r"we've": "we have",
    r"weren't": "were not",
    r"what'll": "what will",
    r"what're": "what are",
    r"what's": "what is",
    r"what've": "what have",
    r"where's": "where is",
    r"who'd": "who would",
    r"who'll": "who will",
    r"who're": "who are",
    r"who's": "who is",
    r"who've": "who have",
    r"won't": "will not",
    r"would've": "would have",
    r"wouldn't": "would not",
    r"you'd": "you would",
    r"you'll": "you will",
    r"you're": "you are",
    r"you've": "you have"
}

# Variant, transliterated and mis-encoded spellings that should all be
# normalised to "quran" (presumably collected from the corpus — TODO confirm).
# All entries are lower-case; clean() lower-cases the text before matching.
# NOTE(review): these are raw strings, so r'qur\x01an' and r'qur\\`an' match a
# literal backslash sequence rather than a control character / single
# backslash — confirm that is what was intended.
quran_spellings = [
    r'qur-an/qur-an-guidan',
    r'qurahn',
    r'qurin',
    r'quraan',
    r'qur’ān',
    r'qurnayn',
    r'qur’an/sunn',
    r'qurân',
    r'qur’aan',
    r'qurʾan',
    r'qurâ€™an',
    r'qurän',
    r'qur;an',
    r'qurʻán',
    r'quruan',
    r'qurʼān',
    r'qur-an',
    r'qurr-on',
    r'qur?an',
    r'quraaniyyoon',
    r'qurran',
    r'qur1an',
    r'quraun',
    r'quraniyyun',
    r'qurýan',
    r'quraaan',
    r'quraaniyoon',
    r'qurʼan',
    r'qur‘ān',
    r'qur`ân',
    r'qur`an',
    r'qurann',
    r'qur`ān',
    r'qurān',
    r'qur’aniyyoon',
    r'qur\x01an',
    r'qur%27an',
    r'quràan',
    r'qura’n',
    r'qur´an',
    r'quranen',
    r'qur‘an',
    r'qur’an',
    r'qurani/quran',
    r'quranan',
    r'quraniyoon',
    r'quran(an',
    r'quran/kuran',
    r'qurʾān',
    r'qur"an',
    r'quraydhan',
    r'quran-an',
    r'quran_an',
    r'qur\\`an'
]

# Register every spelling variant in the replacement table so that clean()
# rewrites all of them to the canonical "quran".
contractions.update({spelling: "quran" for spelling in quran_spellings})

In [None]:
def clean(doc):
    """Normalise one raw post into a space-separated string of lemmas.

    Steps, in order: lower-case; blank out URLs and e-mail addresses; expand
    contractions and normalise the spelling of "quran" (both driven by the
    module-level ``contractions`` dict); strip the possessive ``'s``; drop the
    remaining apostrophes; replace every non-letter with a space; lemmatise
    with the module-level spaCy pipeline ``nlp`` while dropping spaCy stop
    words; remove one- and two-character words; collapse whitespace.

    Args:
        doc: The raw document text. Must be a ``str`` (``.lower()`` is called
            on it), so missing values have to be dropped beforehand.

    Returns:
        The cleaned document as a single string. It is empty when nothing
        survives the filtering.
    """
    # Make all letters lowercase
    text = doc.lower()

    # Remove URLs and emails (patterns are applied in this order)
    url_and_email_patterns = (
        r'\S*://\S*',
        r'www\.\S*',
        r'\S*\.com\S*',
        r'\S*\.gov\S*',
        r'\S*\.org\S*',
        r'\S*\.net\S*',
        r'\S*\.co\.\S*',
        r'\S*\.lib\.\S*',
        r'\S*@\S*',
    )
    for junk in url_and_email_patterns:
        text = re.sub(junk, ' ', text)

    # Expand contractions and also normalise the spelling of quran, adding a
    # space before and after the replaced word.
    # Keys are tried longest-first: a regex alternation takes the first
    # alternative that matches, so a short key such as "quraan" would otherwise
    # pre-empt a longer key that starts with it ("quraaniyyoon") and leave the
    # tail of the word behind.
    keys_longest_first = sorted(contractions, key=len, reverse=True)
    pattern = r'(' + '|'.join(re.escape(key) for key in keys_longest_first) + r')'
    text = re.sub(pattern, lambda match: ' ' + contractions[match.group()] + ' ', text)

    # Remove the 's from possessive nouns while keeping the noun itself
    # ("allah's" -> "allah"). This runs after the contraction step so that
    # "it's", "he's", "let's", ... are expanded by the table first; the
    # previous pattern deleted the entire word and ran before the expansion.
    text = re.sub(r"(?<=\S)'s\b", '', text)

    # Might be useful to normalise the arabic words with apostrophes; it MUST
    # be run after fixing contractions
    text = re.sub(r'\'', '', text)

    # Replace every punctuation mark, special character, whitespace character
    # and digit with a space
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    parsed = nlp(text)
    # Remove stopwords and lemmatize the tokens
    lemmas = [token.lemma_ for token in parsed if not token.is_stop]
    # Join the filtered tokens back into a document/string
    filtered_document = ' '.join(lemmas)
    # Remove single/double lettered words
    filtered_document = re.sub(r'\b\w{1,2}\b', '', filtered_document)
    # Remove any excess whitespace
    filtered_document = re.sub(r'\s+', ' ', filtered_document).strip()
    return filtered_document

In [None]:
import copy, multiprocessing

# Work on a deep copy so the raw text in `df` stays available.
df2 = copy.deepcopy(df)

# Run clean() over every document using two worker processes.
with multiprocessing.Pool(processes=2) as workers:
    df2['document'] = workers.map(clean, df['document'])

# Documents that cleaned down to '' become NaN so that dropna() discards them.
df2 = df2.replace('', np.nan).dropna().reset_index(drop=True)

# Save the cleaned dataset
df2.to_csv("clean.csv", index=False)

In [None]:
# Reload the cleaned data written by the previous step. The path is relative,
# matching the to_csv("clean.csv") call, so the file is found whatever the
# working directory is (the old absolute /content path only worked on Colab
# with an unchanged working directory).
df = pd.read_csv("clean.csv")
df

# Extra Data Cleaning

In [None]:
# Corpus-specific stopwords removed in addition to the stop words already
# filtered inside clean(). Matching is exact and case-sensitive.
# NOTE(review): "TRUE" is upper-case, but clean() lower-cases the text before
# this filter is applied, so this entry presumably never matches — confirm
# whether "true" was meant.
custom_stopwords = {"feel", "come", "thing", "think", "want", "don", "like",
                    "know", "good", "ask", "com", "say", "way",
                    "find", "tell", "year", "help", "try", "amp", "gt", "https", "look",
                    "jippe", "jireen", "jirfvfe", "jiveprinte", "jjfsdmhxzbf", "jjdhf",
                    "zzz", "see", "www", "get", "http", "comment", "nbsp", "net", "pdf",
                    "org", "post", "app", "link", "jitxi", "watch", "zzzcb", "thc", "edu",
                    "people", "non", "render", "use", "itune", "detail", "bring", "store",
                    "etc", "let", "wiki", "wikipedia", "didn", "give",
                    "lot", "not", "sure", "png", "audio", "thy",
                    "jpg", "webp", "kjv", "ksu", "series", "unto", "cuz",
                    "laa", "happen", "TRUE", "ago", "yes", "soon", "till", "got", "actually",
                    "html", "hey", "php", "thou", "preview", "lil", "doesn", "have", "anymore",
                    "reddit", "video", "youtube", "amazon", "islamqa", "facebook", "youtuber",
                    "episode", "youtu", "redd", "imgur", "islamawakene", "islamawakened",
                    "medium", "discord", "thread", "twitter", "podcast", "subreddit", "edit",
                    "server", "pjpg", "afaik", "idk", "sub", "purposely", "utm",
                    "zoom", "sim", "thee", "mod", "respectively", "podbean", "utf", "xkuh",
                    # The comma after "yah" was missing, which silently fused it
                    # with "mean" into the single entry "yahmean".
                    "wordpress", "lest", "htm", "user", "gets", "yeah", "lol", "yah",
                    "mean", "take", "need", "make", "consider", "ugh"}

def remove_custom_stopwords(document, custom_stopwords):
    """Return ``document`` with every token found in ``custom_stopwords`` removed.

    The text is split with NLTK's ``word_tokenize``; the surviving tokens are
    re-joined with single spaces. Membership is an exact, case-sensitive match.
    """
    return ' '.join(
        token
        for token in word_tokenize(document)
        if token not in custom_stopwords
    )

# Strip the custom stopwords from every document.
df['document'] = df['document'].apply(
    lambda text: remove_custom_stopwords(text, custom_stopwords)
)

# Removing stopwords can make documents identical, so de-duplicate again.
# NOTE(review): a document reduced to '' is not NaN, so dropna() keeps it
# here — presumably it only turns into NaN once the frame is written to CSV
# and read back; confirm.
print("Number of rows before:", len(df))
df = (
    df.drop_duplicates()
      .dropna()
      .reset_index(drop=True)
)
print("Number of rows after dropping duplicates and nulls:", len(df))

Number of rows before: 109848
Number of rows after dropping duplicates and nulls: 109573


In [None]:
# Persist the stopword-filtered dataset (row index is not written).
df.to_csv("clean-customstopwords.csv", index=False)

In [None]:
# Read the stopword-filtered dataset back from disk.
df = pd.read_csv("clean-customstopwords.csv")

In [None]:
# Display the reloaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,document,subreddit,class
0,talk elderly italian woman dish mother cook po...,food,0
1,week chocolate cake mug immediately intrigue p...,food,0
2,package spaghetti noodle angel hair ground bee...,food,0
3,friend weekend bit cooking frustrated disorgan...,food,0
4,serve prep time minute vegetable mandolin cook...,food,0
...,...,...,...
109564,abu hurairah report messenger peace offer pray...,Muslim,1
109565,mother celebrate pagan holiday solstice yule c...,Muslim,1
109566,asak limit option parent basically stop care b...,Muslim,1
109567,read quran figure question maybe million quest...,Muslim,1


In [None]:
# NOTE(review): drops the row labelled 13069 with no stated reason. The label
# is only meaningful for the exact frame loaded above and, because the index
# is reset in the following cell, re-running this cell removes a different
# row each time — confirm which document was meant and select it by content.
df = df.drop(13069)

In [None]:
# Final pass: remove duplicate rows and rows with missing values, then
# renumber the index from zero.
df = (
    df.drop_duplicates()
      .dropna()
      .reset_index(drop=True)
)

In [None]:
# Sanity check: list any rows that still contain a missing value
# (an empty result means none are left).
df[df.isna().any(axis=1)]

Unnamed: 0,document,subreddit,class
