# Text Preprocessing and Cleaning

### 1.1 Tokenization and Normalization

In [1]:
from nltk.tokenize import word_tokenize

text = 'Hello, my name is Hamad, and i am an instructor of DS Bootcamp!'

tokens = word_tokenize(text.lower())

print(tokens)

['hello', ',', 'my', 'name', 'is', 'hamad', ',', 'and', 'i', 'am', 'an', 'instructor', 'of', 'ds', 'bootcamp', '!']


### 1.2 Why Tokenization Instead of Normal Split?

In [2]:
from nltk.tokenize import word_tokenize

text = 'Hello, my name is Hamad!'

split = text.lower().split(' ')

print(f'Using Python Split Method :\n{split}\n')

tokens = word_tokenize(text.lower())

print(f'Using NLTK word_tokenize  :\n{tokens}')

Using Python Split Method :
['hello,', 'my', 'name', 'is', 'hamad!']

Using NLTK word_tokenize  :
['hello', ',', 'my', 'name', 'is', 'hamad', '!']


### 2 Sentence Segmentation

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize

document = "Mr. Hamad is the best. U.S.A is a north American country."
sentences = sent_tokenize(document)

print(f'The sentences of the document :\n{sentences}\n')

for i, sent in enumerate(sentences):
    tokens = word_tokenize(sent.lower())
    print(f'The sentence {i+1} tokens : {tokens}')

The sentences of the document :
['Mr. Hamad is the best.', 'U.S.A is a north American country.']

The sentence 1 tokens : ['mr.', 'hamad', 'is', 'the', 'best', '.']
The sentence 2 tokens : ['u.s.a', 'is', 'a', 'north', 'american', 'country', '.']


### 3.1 Stemming and Lemmatization : Example One

In [4]:
import nltk
from nltk.corpus import udhr
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("udhr")

udhr_data = udhr.sents('English-Latin1')

tokens = udhr_data[0][:10]
print(f'The tokens before stemming and lemmatization :-\n{tokens}')
print()

stemmer = PorterStemmer()
stems = [stemmer.stem(token.lower()) for token in tokens]
print(f'The tokens after stemming :-\n{stems}')
print()

lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]
print(f'The tokens after lemmatization :-\n{lemmas}')

[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\Hamad\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!


The tokens before stemming and lemmatization :-
['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the']

The tokens after stemming :-
['univers', 'declar', 'of', 'human', 'right', 'preambl', 'wherea', 'recognit', 'of', 'the']

The tokens after lemmatization :-
['universal', 'declaration', 'of', 'human', 'right', 'preamble', 'whereas', 'recognition', 'of', 'the']


### 4 Regular Expressions

In [5]:
import re

text = "Dude!! check this video, it is the funniest video ever " \
        "https://www.youtube.com/watch?v=MN-cmJ7T1Do"
print(f'The original text :\n{text}\n')

s = re.sub('http://\S+|https://\S+', '', text)

print(f'The text after using RE :\n{s}')

The original text :
Dude!! check this video, it is the funniest video ever https://www.youtube.com/watch?v=MN-cmJ7T1Do

The text after using RE :
Dude!! check this video, it is the funniest video ever 


### 5.1 Remove Stop Words

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = 'Hamad is an assistant instructor at Tuwaiq Academy.'
tokens = word_tokenize(text)

print(f'The tokens before removing stop words :\n{tokens}\n')

stop_words = set(stopwords.words('english'))

filtered_tokens = [token for token in tokens if not token in stop_words]

print(f'The tokens after removing stop words :\n{filtered_tokens}')

The tokens before removing stop words :
['Hamad', 'is', 'an', 'assistant', 'instructor', 'at', 'Tuwaiq', 'Academy', '.']

The tokens after removing stop words :
['Hamad', 'assistant', 'instructor', 'Tuwaiq', 'Academy', '.']


### 5.2 Remove Punctuation Marks

In [7]:
from nltk.tokenize import word_tokenize

text = "I can't belief that NLP is to easy."

tokens = word_tokenize(text)

print(f'Tokens before removing punctuation marks :\n{tokens}\n')

tokens = [token for token in tokens if token.isalpha()]

print(f'Tokens before removing punctuation marks :\n{tokens}')

Tokens before removing punctuation marks :
['I', 'ca', "n't", 'belief', 'that', 'NLP', 'is', 'to', 'easy', '.']

Tokens before removing punctuation marks :
['I', 'ca', 'belief', 'that', 'NLP', 'is', 'to', 'easy']


# Generalized Text Preprocessing Function

In [8]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')
import string

def process_text(text):
    
    # Here before tokenization, you can remove starts by text cleaning.
    
    tokens = word_tokenize(text)
    
    ps = PorterStemmer()
    
    final_text = []
    for token in tokens:
        if (token not in stopwords_english and token not in string.punctuation):
            stem_token = ps.stem(token)
            final_text.append(stem_token)

    return final_text

# Numerical Representations

### 1 Count Vectorizer (Bag of Words)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

original_documents = [
    'Hello, World!',
    'Hamad is the Best',
    'I am the Danger!'
]

preprocessed_documents = []
for i, text in enumerate(original_documents):
    preprocessed_documents.append(' '.join(process_text(text)))

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(preprocessed_documents)

count_vectorized = pd.DataFrame(X.toarray(), index=original_documents, columns=vectorizer.get_feature_names_out())
count_vectorized

Unnamed: 0,best,danger,hamad,hello,world
"Hello, World!",0,0,0,1,1
Hamad is the Best,1,0,1,0,0
I am the Danger!,0,1,0,0,0


### 2 TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

original_documents = [
    'Hello, World!',
    'Hamad is the Best',
    'I am the Danger!',
    'Hello people!',
    'Do not smoke'
]

preprocessed_documents = []
for i, text in enumerate(original_documents):
    preprocessed_documents.append(' '.join(process_text(text)))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_documents)

count_vectorized = pd.DataFrame(X.toarray(), index=original_documents, columns=vectorizer.get_feature_names_out())
count_vectorized

Unnamed: 0,best,danger,do,hamad,hello,peopl,smoke,world
"Hello, World!",0.0,0.0,0.0,0.0,0.627914,0.0,0.0,0.778283
Hamad is the Best,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0
I am the Danger!,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
Hello people!,0.0,0.0,0.0,0.0,0.627914,0.778283,0.0,0.0
Do not smoke,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0


### 3 Positive and Negative Frequency

In [11]:
corpus = [
    'i am happy because i am learning NLP',
    'i love you so much much',
    'i am Sad because i am not learning NLP',
    'i hate you so much'
]

labels = [1, 1, 0, 0]

#### 3.1 Vocabulary and Count frequencies

In [12]:
def count_frequencies(corpus, labels):
    freqs = {}

    for text, y in zip(corpus, labels):

        cleaned_text = process_text(text)

        for token in cleaned_text:
            pair = (token, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    
    return freqs

In [13]:
freqs = count_frequencies(corpus, labels)

for pair in freqs:
    sentiment = 'Positive' if pair[1] == 1 else 'Negative'
    print(f'Sentiment : {sentiment} - Word : {pair[0]} - Appeared : {freqs[pair]}')

Sentiment : Positive - Word : happi - Appeared : 1
Sentiment : Positive - Word : learn - Appeared : 1
Sentiment : Positive - Word : nlp - Appeared : 1
Sentiment : Positive - Word : love - Appeared : 1
Sentiment : Positive - Word : much - Appeared : 2
Sentiment : Negative - Word : sad - Appeared : 1
Sentiment : Negative - Word : learn - Appeared : 1
Sentiment : Negative - Word : nlp - Appeared : 1
Sentiment : Negative - Word : hate - Appeared : 1
Sentiment : Negative - Word : much - Appeared : 1


#### 3.2 Feature Extraction

In [14]:
def extract_features(text, freqs, label):
    
    # Reprocess the text again
    tokens = process_text(text)
    
    dic = {'Text':text, 'Pos':0, 'Neg':0, 'Label':label}
    
    for token in tokens:
        dic['Pos'] += freqs.get((token, 1.0), 0)
        dic['Neg'] += freqs.get((token, 0.0), 0)
    
    return dic

In [15]:
import pandas as pd

df = pd.DataFrame({}, columns=['Text', 'Pos', 'Neg', 'Label'])

for i in range(len(corpus)):
    df.loc[i, :] = extract_features(corpus[i], freqs, labels[i])

df

Unnamed: 0,Text,Pos,Neg,Label
0,i am happy because i am learning NLP,3,2,1
1,i love you so much much,5,2,1
2,i am Sad because i am not learning NLP,2,3,0
3,i hate you so much,2,2,0


#### 3.3 FreqDist

In [16]:
positive_corpus = corpus[:2]
print(positive_corpus)
negative_corpus = corpus[2:]
print(negative_corpus)

['i am happy because i am learning NLP', 'i love you so much much']
['i am Sad because i am not learning NLP', 'i hate you so much']


In [17]:
from nltk.probability import FreqDist

In [18]:
positive_frequencies = []
negative_frequencies = []

In [19]:
for txt in positive_corpus:
    lst = process_text(txt)
    for i in lst:
        positive_frequencies.append(i)

print(positive_frequencies)
print(len(positive_frequencies))

['happi', 'learn', 'nlp', 'love', 'much', 'much']
6


In [20]:
for txt in negative_corpus:
    lst = process_text(txt)
    for i in lst:
        negative_frequencies.append(i)

print(negative_frequencies)
print(len(negative_frequencies))

['sad', 'learn', 'nlp', 'hate', 'much']
5


In [21]:
print(positive_frequencies)
fdist = dict(FreqDist(positive_frequencies))
pos_keys = []
for i in fdist:
    pos_keys.append((i, 1))
    print(i, fdist[i])

positive_frequencies = dict(zip(pos_keys, fdist.values()))
positive_frequencies

['happi', 'learn', 'nlp', 'love', 'much', 'much']
happi 1
learn 1
nlp 1
love 1
much 2


{('happi', 1): 1,
 ('learn', 1): 1,
 ('nlp', 1): 1,
 ('love', 1): 1,
 ('much', 1): 2}

In [22]:
print(negative_frequencies)
fdist = dict(FreqDist(negative_frequencies))
neg_keys = []
for i in fdist:
    neg_keys.append((i, 0))
    print(i, fdist[i])

negative_frequencies = dict(zip(neg_keys, fdist.values()))
negative_frequencies

['sad', 'learn', 'nlp', 'hate', 'much']
sad 1
learn 1
nlp 1
hate 1
much 1


{('sad', 0): 1, ('learn', 0): 1, ('nlp', 0): 1, ('hate', 0): 1, ('much', 0): 1}

In [23]:
freqs2 = dict(zip(
    list(positive_frequencies.keys()) + list(negative_frequencies.keys()),
    list(positive_frequencies.values()) + list(negative_frequencies.values())
))

In [24]:
import pandas as pd

df = pd.DataFrame({}, columns=['Text', 'Pos', 'Neg', 'Label'])

In [25]:
import pandas as pd

df = pd.DataFrame({}, columns=['Text', 'Pos', 'Neg', 'Label'])

for i in range(len(corpus)):
    df.loc[i, :] = extract_features(corpus[i], freqs2, labels[i])

df

Unnamed: 0,Text,Pos,Neg,Label
0,i am happy because i am learning NLP,3,2,1
1,i love you so much much,5,2,1
2,i am Sad because i am not learning NLP,2,3,0
3,i hate you so much,2,2,0
