# Blabber Cleaning
#### (By: Mark Ehab Aziz)
#### (Built Under: Python 3.11.4)
Filtering out and cleaning text data, as tasked in 'to do.txt'.

In [65]:
# Importing Libraries
import pandas as pd     # Loading Data
import re
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize     # To Tokenize Words
from nltk.stem import PorterStemmer         # Stemming words
from nltk.stem import WordNetLemmatizer     # Lemmatizing words

In [66]:
# Reading the training set into a DataFrame
# Two possible locations (same convention as in my previous projects)
# 1. Relative path that works inside my git repo (active)
blab = pd.read_csv(filepath_or_buffer="../dataset/train.csv")

# 2. Relative path when the data sits in the same folder as this notebook
#blab = pd.read_csv("./train.csv")

# Data Exploration
Using `.head(n)` to show the first $n$ rows of the dataset.

In [67]:
# Number of rows to preview
n = 5

# Previewing the top of the raw dataset
blab.head(n=n)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


As stated by our todo list, we are only tasked with cleaning of the text, so we'll be focusing on `comment_text`.

Referring to our todo list once again, we will be dropping `id`, `toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, and `identity_hate`; as we are not concerned with classifying the sentiment or the meaning behind any of the comments.

Reminder of what is to be done:
- Read Text
- Clean Text (Capitalisation, punctuation)
- Remove Stop Words
- Tokenization
- Stemming

None of the aforementioned tasks requires the columns I have listed for dropping.

In [68]:
# Columns that play no part in text cleaning (the id and the label columns)
col_droppable = [
    "id",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Building a new frame without those columns; `blab` itself is left untouched
txt_blab = blab.drop(labels=col_droppable, axis="columns")

# Previewing the result
txt_blab.head(n=n)

Unnamed: 0,comment_text
0,Explanation\r\nWhy the edits made under my use...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\r\nMore\r\nI can't make any real suggestions..."
4,"You, sir, are my hero. Any chance you remember..."


In [69]:
# Swapping every carriage return, newline and tab for a single space.
# The replacement is applied in place, so `txt_blab` itself is modified.
txt_blab.replace(to_replace=r'[\r\n\t]', value=' ', regex=True, inplace=True)

# Cleaning Above Sentences
Using NLTK and Regex.

In [87]:
# Defining Regex patterns
# NOTE: none of these patterns contains a capturing group. `re.findall`
# (which NLTK's `regexp_tokenize` relies on) returns only the captured
# groups when a pattern has any, instead of the whole matched text.

# Match words starting with an uppercase ASCII letter.
# `\w+` after the first letter means a word needs at least 2 characters,
# so single-letter words such as "I" are not matched.
upper_words = r"[A-Z]\w+"

# Match words that start with either an upper- or lowercase ASCII letter
# (same 2-character minimum as above).
upper_lower_words = r"[A-Za-z]\w+"

# Match emails. The top-level domain may be any length >= 2, so addresses
# such as "name@site.online" are matched as well as "name@site.com".
email_pattern = r"\b[\w\.-]+@[\w\.-]+\.\w{2,}\b"

# Match URLs: a scheme (http, https or ftp), "://", a host made of word
# characters, dots and hyphens, then an optional run of path/query characters.
url_pattern = r"(?:https?|ftp):\/\/[\w.-]+[a-zA-Z0-9\~\!\@\#\$\%\^\&\*\(\)_\-\=\+\\\/\?\.\:\;\'\,]*"

In [88]:
# One entry is appended per processed row
bag_o_words = []

# Only the first five rows are tokenised here
for i in range(5):
    # Render the row as plain text, without the header or the index label
    row_text = txt_blab.iloc[i].to_string(header=False, index=False)
    print(row_text)

    # Do not remove: splitting on the literal two-character sequence
    # backslash + "n" turns the text into a list of segments. Without it the
    # comprehension below would walk over single characters instead.
    line = row_text.split('\\n')

    # Tokenise each segment and store the per-segment token lists for this row
    bag_o_words.append(
        [regexp_tokenize(segment, upper_lower_words) for segment in line]
    )

Explanation  Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
"  More  I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or wa

In [89]:
# Inspecting the collected tokens: one entry per processed row
bag_o_words

[[['Explanation',
   'Why',
   'the',
   'edits',
   'made',
   'under',
   'my',
   'username',
   'Hardcore',
   'Metallica',
   'Fan',
   'were',
   'reverted',
   'They',
   'weren',
   'vandalisms',
   'just',
   'closure',
   'on',
   'some',
   'GAs',
   'after',
   'voted',
   'at',
   'New',
   'York',
   'Dolls',
   'FAC',
   'And',
   'please',
   'don',
   'remove',
   'the',
   'template',
   'from',
   'the',
   'talk',
   'page',
   'since',
   'retired',
   'now']],
 [['aww',
   'He',
   'matches',
   'this',
   'background',
   'colour',
   'seemingly',
   'stuck',
   'with',
   'Thanks',
   'talk',
   'January',
   'UTC']],
 [['Hey',
   'man',
   'really',
   'not',
   'trying',
   'to',
   'edit',
   'war',
   'It',
   'just',
   'that',
   'this',
   'guy',
   'is',
   'constantly',
   'removing',
   'relevant',
   'information',
   'and',
   'talking',
   'to',
   'me',
   'through',
   'edits',
   'instead',
   'of',
   'my',
   'talk',
   'page',
   'He',
   'see