# Blabber Cleaning
#### (By: Mark Ehab Aziz)
#### (Built Under: Python 3.11.4)
Filtering and cleaning text data,
as tasked in 'to do.txt'.

In [10]:
# Importing Libraries
import pandas as pd     # Loading Data
import re
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize     # To Tokenize Words
from nltk.stem import PorterStemmer         # Stemming words
from nltk.stem import WordNetLemmatizer     # Lemmatizing words

In [11]:
# Read the training set into the kernel.
# Two path layouts are supported (same convention as my previous projects):
#   1. Inside my git repo, where the data sits in "../dataset/" (used below).
#   2. Data in the same folder as the notebook: use "./train.csv" instead.
blab = pd.read_csv(filepath_or_buffer="../dataset/train.csv")

# Data Exploration
Using `.head(n)` to show the first $n$ rows of the dataset.

In [12]:
# How many leading rows to preview (reused by later preview cells)
n = 5

# Preview the first n rows of the raw data
blab.head(n=n)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


As stated by our todo list, we are only tasked with cleaning of the text, so we'll be focusing on `comment_text`.

Referring to our todo list once again, we will be dropping `id`, `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, and `identity_hate`, as we are not concerned with classifying the sentiment or the meaning behind any of the comments.

Reminder of what needs to be done:
- Read Text
- Clean Text (Capitalisation, punctuation)
- Remove Stop Words
- Tokenization
- Stemming

None of the tasks above makes use of the columns I have listed for dropping.

In [13]:
# Columns that play no part in the text-cleaning tasks
col_droppable = [
    "id",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Build a new frame without those columns; `blab` itself is left untouched
txt_blab = blab.drop(col_droppable, axis="columns")

# Preview the first n rows of the reduced frame
txt_blab.head(n)

Unnamed: 0,comment_text
0,Explanation\r\nWhy the edits made under my use...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\r\nMore\r\nI can't make any real suggestions..."
4,"You, sir, are my hero. Any chance you remember..."


In [14]:
# Two in-place regex substitutions over the whole frame, applied in order:
#   1. Each '\r', '\n' or '\t' character becomes a single space.
#   2. URLs written with a full scheme (http:// or https://) are deleted.
#      Stray occurrences of a bare 'http' / 'https' are left alone; they
#      are treated as ordinary words and tokenized like everything else.
for _pattern, _replacement in (
    (r'[\r\n\t]', ' '),
    (r'http://\S+|https://\S+', ''),
):
    txt_blab.replace(_pattern, _replacement, regex=True, inplace=True)

# Cleaning Above Sentences
Using NLTK and Regex.

In [16]:
# Regex patterns used for tokenizing and cleaning.
#
# NOTE: patterns meant for regexp_tokenize must not contain capturing
# parentheses. The tokenizer relies on re.findall, which returns only the
# captured group (not the whole match) when the pattern has one.

# Match words starting with an uppercase letter (two or more characters).
# Previously written as r"([A-Z])\w+", whose capturing group made findall
# return just the first letter of each word instead of the word itself.
upper_words = r"[A-Z]\w+"

# Match words starting with an upper- or lowercase letter.
# The trailing \w+ requires at least one more word character, so
# single-letter words such as "I" or "a" are not matched.
upper_lower_words = r"[A-Za-z]\w+"

# Match emails (the final dot-separated part is limited to 2-4 word characters)
email_pattern = r"\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b"

# Match URLs with an http, ftp or https scheme.
# NOTE(review): the square brackets in the host part form a character class
# (word characters, '+', '?', '.') rather than a grouped sequence, and the
# pattern contains capturing groups, so re.findall yields tuples of groups.
# Use re.search / re.sub (whole-match based) with this pattern.
url_pattern = r"(http|ftp|https):\/\/([\w+?\.\w+])+([a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*)?"

In [17]:
# Build a bag-of-words list: one list of tokens per comment.
# (A dict/Counter would be a better fit if token counts matter later.)
# Still to do in later cells: lowercase every token, stem, and strip the
# remaining special characters.

# Number of rows tokenized while prototyping.
# Raise to len(txt_blab) to process the whole file.
rows_to_tokenize = 50

# Slicing the first column (instead of indexing row by row over a fixed
# range(0, 50)) stops at the end of the frame, so a frame with fewer rows
# than the limit no longer raises IndexError.
token_per_row = [
    regexp_tokenize(line, upper_lower_words)
    for line in txt_blab.iloc[:rows_to_tokenize, 0]
]

In [18]:
# Lowercase every word