# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [42]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# One-time download of the NLTK resources used in this notebook.
# Re-running is harmless: an already installed package is only reported as up to date.
nltk.download('stopwords')                   # stopword lists read via nltk.corpus.stopwords
nltk.download('punkt')                       # tokenizer models used by word_tokenize
nltk.download('wordnet')                     # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')                     # Open Multilingual WordNet; presumably required alongside 'wordnet' - verify
nltk.download('averaged_perceptron_tagger')  # tagger model used by pos_tag

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation and pandas chained-assignment warnings.
import warnings
warnings.filterwarnings('ignore')

# Current working directory
# NOTE: the absolute local path printed here is stored in the saved notebook output.
print('Current working directory:', os.getcwd())

Current working directory: c:\Users\vionh\workspace\Data Analysis\data_analytics\Week_11


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vionh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vionh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vionh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vionh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vionh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [43]:
# Define the documents (= sentences) of the example corpus
d1 = 'The car is driven on the road.'
d2 = 'The truck is driven on the highway.'
d3 = 'The bicycle is driven on the bicycle path.'
d4 = 'The airplane is flown in the air.'
d5 = 'The formula 1 car is driven on the racetrack.'

# Concatenate all documents into a single string, separated by one space each
corpus_01 = ' '.join((d1, d2, d3, d4, d5))
corpus_01

'The car is driven on the road. The truck is driven on the highway. The bicycle is driven on the bicycle path. The airplane is flown in the air. The formula 1 car is driven on the racetrack.'

#### b) new documents defined

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuation
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [44]:
# Lowercasing helper
def text_lowercase(text):
    """Return a copy of ``text`` with all characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Lowercase the concatenated corpus and display the result
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the car is driven on the road. the truck is driven on the highway. the bicycle is driven on the bicycle path. the airplane is flown in the air. the formula 1 car is driven on the racetrack.'

### Removing punctuation

In [45]:
# Punctuation-stripping helper
def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so tokens that are joined
    only by punctuation get merged (e.g. 'a-b' becomes 'ab').
    """
    # Translation table mapping each punctuation code point to None (= delete)
    drop_table = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_table)

# Strip punctuation from the lowercased corpus and display the result
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the car is driven on the road the truck is driven on the highway the bicycle is driven on the bicycle path the airplane is flown in the air the formula 1 car is driven on the racetrack'

### Tokenize text & removal of stopwords

In [46]:
# Load NLTK's English stopword list into a set and print it.
# The set has no defined order, so the printed order can differ between runs.
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'against', 'we', 'be', 'm', 'haven', 'hers', 'below', 'under', "you've", 'me', 'll', 'yours', 'because', 'all', 'once', "needn't", 'mustn', 'an', "should've", 'between', 'is', 'this', 'doesn', "didn't", 'theirs', 'that', 'a', 'each', 'hadn', 'whom', 'off', 've', "don't", 'those', "weren't", "you're", 'when', 'nor', 'yourselves', "that'll", 'mightn', 'are', 'aren', 'does', 'needn', 'he', 'in', 'by', 'such', 'while', 'these', 'during', "hasn't", "shouldn't", 'other', 'than', "doesn't", 'about', 'herself', 'ourselves', 'with', 'o', "hadn't", 'has', 'no', 'here', 'why', 'after', "wouldn't", 'themselves', 'was', 'd', 'there', 'own', 'she', 'isn', 'for', 'as', 's', 'some', 'who', 'our', 'or', 'so', 'if', 'above', "you'll", 'doing', "wasn't", 'can', 'any', 'where', "couldn't", 'couldn', 'the', 'few', 'don', 'do', "it's", 'weren', 't', 'himself', 'only', 'too', 'up', 'into', 'through', 'will', 'wasn', 'over', 'i', 'more', "you'd", "haven't", 'again', 'am', 'until', 

In [47]:
# Tokenize a text and drop English stopwords
def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stopwords.

    The stopword check is an exact string match, so the text should already
    be lowercased for stopwords such as 'The' to be removed.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = []
    for token in word_tokenize(text):
        if token in english_stops:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Apply to the punctuation-free corpus and print the remaining tokens
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path', 'airplane', 'flown', 'air', 'formula', '1', 'car', 'driven', 'racetrack']

### Lemmatization

In [48]:
# Shared WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    """Tokenize ``text`` and return the list of lemmas of its tokens.

    Every token is lemmatized with pos='v', i.e. treated as a verb,
    regardless of its actual part of speech.
    """
    return [lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)]

# Lemmatize each remaining token (one list of lemmas per token)
lem = [lemmatize_word(token) for token in corpus_04]

# Nested list to list: join each inner list of lemmas into one string
corpus_05 = [' '.join(map(str, lemmas)) for lemmas in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path', 'airplane', 'flown', 'air', 'formula', '1', 'car', 'driven', 'racetrack'] 

After lemmatization:
['car', 'drive', 'road', 'truck', 'drive', 'highway', 'bicycle', 'drive', 'bicycle', 'path', 'airplane', 'fly', 'air', 'formula', '1', 'car', 'drive', 'racetrack']

## Redefine the text corpus (pre-processed)

In [49]:
# Pre-processed corpus: the lemmatized words from above, regrouped into one
# string per original document (d1-d5).
# NOTE: this list is hard-coded, not derived from corpus_05; if the documents
# change it has to be updated by hand.
corpus = ['car drive road', 
          'truck drive highway', 
          'bicycle drive bicycle path',
          'airplane fly air',
          'formula 1 car drive racetrack']

#### c) corpus adapted using our sentences

## Document-term matrix with ngram_range=(1,1)

In [50]:
# Vectorizer with ngram_range=(1,1): every single token (unigram) is a feature.
# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or
# more alphanumeric characters, so the token '1' of 'formula 1' is dropped
# and gets no column in the matrix below.
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Fit the vocabulary on the corpus and count the terms per document
count = vectorizer.fit_transform(corpus)

# Create dataframe: one row per document, one column per term
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   air  airplane  bicycle  car  drive  fly  formula  highway  path  racetrack  \
0    0         0        0    1      1    0        0        0     0          0   
1    0         0        0    0      1    0        0        1     0          0   
2    0         0        2    0      1    0        0        0     1          0   
3    1         1        0    0      0    1        0        0     0          0   
4    0         0        0    1      1    0        1        0     0          1   

   road  truck  
0     1      0  
1     0      1  
2     0      0  
3     0      0  
4     0      0  


#### d) document-term matrix for both ranges defined

## Document-term matrix with ngram_range=(2,2)

In [51]:
# Vectorizer with ngram_range=(2,2): every pair of consecutive tokens (bigram) is a feature.
# NOTE: the default token_pattern drops the single-character token '1' before
# the n-grams are built, which is why 'formula 1 car' yields the bigram 'formula car'.
# NOTE: vectorizer, count and df_count are rebound here and replace the unigram objects.
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Fit the vocabulary on the corpus and count the bigrams per document
count = vectorizer.fit_transform(corpus)

# Create dataframe: one row per document, one column per bigram
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   airplane fly  bicycle drive  bicycle path  car drive  drive bicycle  \
0             0              0             0          1              0   
1             0              0             0          0              0   
2             0              1             1          0              1   
3             1              0             0          0              0   
4             0              0             0          1              0   

   drive highway  drive racetrack  drive road  fly air  formula car  \
0              0                0           1        0            0   
1              1                0           0        0            0   
2              0                0           0        0            0   
3              0                0           0        1            0   
4              0                1           0        0            1   

   truck drive  
0            0  
1            1  
2            0  
3            0  
4            0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [52]:
# Compute Term Frequency (TF)
# TF(w, d) = (number of occurrences of w in document d) / (number of tokens in d)

# Collect the vocabulary: the set of unique whitespace-separated tokens
words_set = set()
for doc in corpus:
    # split() without an argument never yields empty tokens, even if a
    # document contains repeated or leading/trailing spaces
    words = doc.split()
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

# One row per document, one column per vocabulary word, initialised with zeros
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split()
    for w in words:
        # Write through .loc: chained indexing (df_tf[w][i] = ...) assigns to a
        # temporary object and does not update df_tf under pandas Copy-on-Write
        df_tf.loc[i, w] += 1 / len(words)
        
print(df_tf.round(4))

Number of words in the corpus: 13 

The words in the corpus: 
 {'formula', 'road', 'airplane', 'bicycle', 'fly', 'drive', 'racetrack', 'air', 'path', 'car', 'truck', 'highway', '1'}

Term Frequency (TF):
   formula    road  airplane  bicycle     fly   drive  racetrack     air  \
0      0.0  0.3333    0.0000      0.0  0.0000  0.3333        0.0  0.0000   
1      0.0  0.0000    0.0000      0.0  0.0000  0.3333        0.0  0.0000   
2      0.0  0.0000    0.0000      0.5  0.0000  0.2500        0.0  0.0000   
3      0.0  0.0000    0.3333      0.0  0.3333  0.0000        0.0  0.3333   
4      0.2  0.0000    0.0000      0.0  0.0000  0.2000        0.2  0.0000   

   path     car   truck  highway    1  
0  0.00  0.3333  0.0000   0.0000  0.0  
1  0.00  0.0000  0.3333   0.3333  0.0  
2  0.25  0.0000  0.0000   0.0000  0.0  
3  0.00  0.0000  0.0000   0.0000  0.0  
4  0.00  0.2000  0.0000   0.0000  0.2  


### Inverse Document Frequency (IDF)

In [53]:
# Computing Inverse Document Frequency (IDF)
# IDF(w) = log10(number of documents / number of documents containing w)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:

    # k = number of documents whose token list contains this word
    k = sum(1 for doc in corpus if w in doc.split())

    # Rounded to 4 decimals before being stored
    idf[w] = np.log10(n_docs / k).round(4)

    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
        formula:      0.699
           road:      0.699
       airplane:      0.699
        bicycle:      0.699
            fly:      0.699
          drive:     0.0969
      racetrack:      0.699
            air:      0.699
           path:      0.699
            car:     0.3979
          truck:      0.699
        highway:      0.699
              1:      0.699


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [54]:
# Computing TF-IDF
# TF-IDF(w, d) = TF(w, d) * IDF(w)
df_tf_idf = df_tf.copy()

for w in words_set:
    # Scale the whole TF column by the word's IDF in one assignment.
    # Chained indexing (df_tf_idf[w][i] = ...) would assign to a temporary
    # object and leave df_tf_idf unchanged under pandas Copy-on-Write.
    df_tf_idf[w] = df_tf[w] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   formula   road  airplane  bicycle    fly   drive  racetrack    air    path  \
0   0.0000  0.233     0.000   0.0000  0.000  0.0323     0.0000  0.000  0.0000   
1   0.0000  0.000     0.000   0.0000  0.000  0.0323     0.0000  0.000  0.0000   
2   0.0000  0.000     0.000   0.3495  0.000  0.0242     0.0000  0.000  0.1748   
3   0.0000  0.000     0.233   0.0000  0.233  0.0000     0.0000  0.233  0.0000   
4   0.1398  0.000     0.000   0.0000  0.000  0.0194     0.1398  0.000  0.0000   

      car  truck  highway       1  
0  0.1326  0.000    0.000  0.0000  
1  0.0000  0.233    0.233  0.0000  
2  0.0000  0.000    0.000  0.0000  
3  0.0000  0.000    0.000  0.0000  
4  0.0796  0.000    0.000  0.1398  


#### e) created Term Frequency (TF) matrix, Inverse Document Frequency (IDF) matrix and a Term Frequency - Inverse Document Frequency (TF-IDF) matrix

## Part-of-Speech (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [55]:
text = '''Unfortunately the football team "Manchester United" did not make it to the knockout stages of the Champions League despite their high efforts.'''

def preprocess(sent):
    """Tokenize the sentence ``sent`` and return its (token, POS tag) pairs."""
    tokens = nltk.word_tokenize(sent)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

sent = preprocess(text)

# Chunk grammar: a noun phrase (NP) is an optional determiner, any number of
# adjectives and exactly one singular noun (NN). Plural nouns (NNS) and proper
# nouns (NNP) are not matched by this pattern.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

# Flatten the chunk tree into (token, POS tag, IOB chunk tag) triples
iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Unfortunately', 'RB', 'O'),
 ('the', 'DT', 'B-NP'),
 ('football', 'NN', 'I-NP'),
 ('team', 'NN', 'B-NP'),
 ('``', '``', 'O'),
 ('Manchester', 'NNP', 'O'),
 ('United', 'NNP', 'O'),
 ("''", "''", 'O'),
 ('did', 'VBD', 'O'),
 ('not', 'RB', 'O'),
 ('make', 'VB', 'O'),
 ('it', 'PRP', 'O'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'B-NP'),
 ('knockout', 'NN', 'I-NP'),
 ('stages', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('Champions', 'NNP', 'O'),
 ('League', 'NNP', 'O'),
 ('despite', 'IN', 'O'),
 ('their', 'PRP$', 'O'),
 ('high', 'JJ', 'O'),
 ('efforts', 'NNS', 'O'),
 ('.', '.', 'O')]


#### f) RB = adverb (e.g. very, silently); DT = determiner; NN = noun; NNP = proper noun; VBD = verb, past tense (e.g. took)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [56]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: print the OS name, platform and release, the current timestamp and the Python version
print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2023-12-14 19:01:37
Python Version: 3.11.5
-----------------------------------
