### PROBLEM STATEMENT
Read text from a pdf, tokenize and clean the text. For the extracted tokens, get their synonyms and antonyms.
###SOLUTION


In [44]:
# IMPORTING NECESSARY PACKAGES

# packages to extract text from a pdf
import pdfplumber

# packages to help with word cleaning and tokenising
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# package to work with string functions
import string

# package required to instal the dictionary
from nltk.corpus import wordnet

# package to manipulated data to make data frames
import pandas as pd

In [45]:
# FUNCTION TO EXTRACT TEXT FROM PDF
def extract_text_from_pdf(pdf_path):
    # initializing a variable 'text' as en empty string
    text = ""

    # opeinging the pdf with the path of the pdf passed through the function parameters
    with pdfplumber.open(pdf_path) as pdf:

        # iterating through the pages of the pdf
        for page in pdf.pages:

            # extracting the text from each page and adding it to the variable 'text'
            text += page.extract_text()

    # return the entrire corpus
    return text

In [46]:
# local file path to get the pdf
pdf_path = r"C:\Users\HP\OneDrive\Documents\Post_Grad\Semester_3\NLP\Data\born-a-crime-trevor-noah-extracted.pdf"

# extracting the text in the pdf
text = extract_text_from_pdf(pdf_path)

# displaying the extracted text
print(text)

RUN
Sometimes in big Hollywood movies they’ll have
these crazy chase scenes where somebody jumps or
gets thrown from a moving car. The person hits the
ground and rolls for a bit. Then they come to a stop
and pop up and dust themselves off, like it was no
big deal. Whenever I see that I think, That’s rubbish.
Getting thrown out of a moving car hurts way
worse than that.
I was nine years old when my mother threw me
out of a moving car. It happened on a Sunday. I
know it was on a Sunday because we were coming
home from church, and every Sunday in my
childhood meant church. We never missed church.
My mother was—and still is—a deeply religious
woman. Very Christian. Like indigenous peoplesaround the world, black South Africans adopted the
religion of our colonizers. By “adopt” I mean it was
forced on us. The white man was quite stern with the
native. “You need to pray to Jesus,” he said. “Jesus
will save you.” To which the native replied, “Well, we
do need to be saved—saved from you, but th

In [47]:
# FUNCTION TO CLEAN TEXT
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove punctuation and convert words to lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # return cleaned processed words
    return tokens

In [48]:
# using the defined function to process the text
tokens = preprocess_text(text)

# display the cleaned words
print(type(tokens))

<class 'list'>


In [49]:
#sort tokens in ascending order
tokens.sort()

# display the sorted tokens
print(tokens)

['abandoned', 'accrued', 'action', 'action', 'actually', 'actually', 'adopt', 'adopted', 'african', 'african', 'african', 'africans', 'aisles', 'allowed', 'along', 'also', 'always', 'always', 'american', 'analysis', 'analytical', 'ancestors', 'ancestors', 'andsaturday', 'anything', 'arnold', 'ass', 'back', 'back', 'badass', 'badass', 'balanced', 'band', 'basically', 'beating', 'beliefs', 'beside', 'bible', 'bible', 'bible', 'bible', 'bibles', 'big', 'big', 'big', 'bit', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'blast', 'blessings', 'blessings', 'blouses', 'bouncers', 'boyz', 'boyz', 'burning', 'car', 'car', 'car', 'card', 'cast', 'cast', 'catharsis', 'cathartic', 'changers', 'chapter', 'chase', 'cheerleader', 'childhood', 'childhood', 'christian', 'christian', 'christian', 'christian', 'christian', 'christianity', 'christianity', 'church', 'church', 'church', 'church', 'church', 'church', 'church', 'church', 'church', 'church', 'church',

In [50]:
# downloading the wordnet Dictionary
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
# function to get synonyms and antonyms
def get_synonyms_antonyms(word):
    synonyms = set()
    antonyms = set()

    # Get the synsets for the word
    synsets = wordnet.synsets(word)

    for synset in synsets:
        for lemma in synset.lemmas():
            # Add synonyms (up to three unique)
            if lemma.name() not in synonyms and len(synonyms) < 3:
                synonyms.add(lemma.name())

            # Add antonyms if available (up to three unique)
            if lemma.antonyms() and lemma.antonyms()[0].name() not in antonyms and len(antonyms) < 3:
                antonyms.add(lemma.antonyms()[0].name())

            # Stop collecting if we have three unique synonyms and antonyms
            if len(synonyms) == 3 and len(antonyms) == 3:
                break

    return ', '.join(synonyms), ', '.join(antonyms)

In [52]:
# convert the list of tokens to a data frame
df = pd.DataFrame(tokens, columns=["Tokens"])

In [53]:
# displaying the created data frame
df.head()

Unnamed: 0,Tokens
0,abandoned
1,accrued
2,action
3,action
4,actually


In [54]:
# adding two columns Synonyms and Antonyms to the data frame based on the Tokens Column
df[['Synonyms', 'Antonyms']] = df['Tokens'].apply(lambda x: pd.Series(get_synonyms_antonyms(x)))

In [55]:
# displaying the newly collected data
df.head(10)

Unnamed: 0,Tokens,Synonyms,Antonyms
0,abandoned,"vacate, give_up, abandon",
1,accrued,"fall, accrued, accrue",
2,action,"action, activeness, activity","inactiveness, inactivity, inaction"
3,action,"action, activeness, activity","inactiveness, inactivity, inaction"
4,actually,"in_reality, really, actually",
5,actually,"in_reality, really, actually",
6,adopt,"follow, adopt, espouse",
7,adopted,"follow, adopt, espouse",native
8,african,African,
9,african,African,
