# Install library if needed

In [1]:
# pip install pypdf
# pip install textblob
# pip install nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')
# pip install word2number

# PDF Extraction

In [2]:
from pypdf import PdfReader
pages = []

reader = PdfReader("Hong_Kong.pdf")
for i in range(len(reader.pages)):
    page = reader.pages[i]
    pages.append(page.extract_text())
    
content = "".join(pages)
print(content)

HONG KONG
TRAVEL GUIDEHong Kong is a bustling, truly global destination located on China’s south coast. Set 
amongst beautiful Victoria Harbour, it’s picturesque and enthralling. Temples, monuments and towers go hand-in-hand with world-class restaurants, designer boutiques and musical performances. Hong Kong’s contemporary culture, a mix of Asian languages, and a tolerance for different religions creates a harmonious environment. Hong Kong is just over 9 hours flight from Sydney Airport and is renowned for being a luxurious and rewarding holiday destination for all types of travellers.OVERVIEWHONG KONGThis travel guide is for your general information only and is not intended as advice. You should 
make your own inquiries before making any decisions. Sydney Airport Corporation Limited does 
not in any way represent that we recommend or endorse the ratings, advertisers, products or 
services appearing in this guide. We do not represent or warrant that the material in the guide is 
reliab

# Preprocessing
## Case Conversion

In [3]:
content = content.lower()
print(content)

hong kong
travel guidehong kong is a bustling, truly global destination located on china’s south coast. set 
amongst beautiful victoria harbour, it’s picturesque and enthralling. temples, monuments and towers go hand-in-hand with world-class restaurants, designer boutiques and musical performances. hong kong’s contemporary culture, a mix of asian languages, and a tolerance for different religions creates a harmonious environment. hong kong is just over 9 hours flight from sydney airport and is renowned for being a luxurious and rewarding holiday destination for all types of travellers.overviewhong kongthis travel guide is for your general information only and is not intended as advice. you should 
make your own inquiries before making any decisions. sydney airport corporation limited does 
not in any way represent that we recommend or endorse the ratings, advertisers, products or 
services appearing in this guide. we do not represent or warrant that the material in the guide is 
reliab

## Punctuation Handling (all exclamation points (!) to periods (.))

In [4]:
content = content.replace('!', '.')
print(content)

hong kong
travel guidehong kong is a bustling, truly global destination located on china’s south coast. set 
amongst beautiful victoria harbour, it’s picturesque and enthralling. temples, monuments and towers go hand-in-hand with world-class restaurants, designer boutiques and musical performances. hong kong’s contemporary culture, a mix of asian languages, and a tolerance for different religions creates a harmonious environment. hong kong is just over 9 hours flight from sydney airport and is renowned for being a luxurious and rewarding holiday destination for all types of travellers.overviewhong kongthis travel guide is for your general information only and is not intended as advice. you should 
make your own inquiries before making any decisions. sydney airport corporation limited does 
not in any way represent that we recommend or endorse the ratings, advertisers, products or 
services appearing in this guide. we do not represent or warrant that the material in the guide is 
reliab

## White space removal

In [5]:
content = content.strip()
print(content)

hong kong
travel guidehong kong is a bustling, truly global destination located on china’s south coast. set 
amongst beautiful victoria harbour, it’s picturesque and enthralling. temples, monuments and towers go hand-in-hand with world-class restaurants, designer boutiques and musical performances. hong kong’s contemporary culture, a mix of asian languages, and a tolerance for different religions creates a harmonious environment. hong kong is just over 9 hours flight from sydney airport and is renowned for being a luxurious and rewarding holiday destination for all types of travellers.overviewhong kongthis travel guide is for your general information only and is not intended as advice. you should 
make your own inquiries before making any decisions. sydney airport corporation limited does 
not in any way represent that we recommend or endorse the ratings, advertisers, products or 
services appearing in this guide. we do not represent or warrant that the material in the guide is 
reliab

## Tokenization

In [6]:
import nltk
remover = nltk.RegexpTokenizer(r"\w+")
clean = remover.tokenize(content)

print(clean)

['hong', 'kong', 'travel', 'guidehong', 'kong', 'is', 'a', 'bustling', 'truly', 'global', 'destination', 'located', 'on', 'china', 's', 'south', 'coast', 'set', 'amongst', 'beautiful', 'victoria', 'harbour', 'it', 's', 'picturesque', 'and', 'enthralling', 'temples', 'monuments', 'and', 'towers', 'go', 'hand', 'in', 'hand', 'with', 'world', 'class', 'restaurants', 'designer', 'boutiques', 'and', 'musical', 'performances', 'hong', 'kong', 's', 'contemporary', 'culture', 'a', 'mix', 'of', 'asian', 'languages', 'and', 'a', 'tolerance', 'for', 'different', 'religions', 'creates', 'a', 'harmonious', 'environment', 'hong', 'kong', 'is', 'just', 'over', '9', 'hours', 'flight', 'from', 'sydney', 'airport', 'and', 'is', 'renowned', 'for', 'being', 'a', 'luxurious', 'and', 'rewarding', 'holiday', 'destination', 'for', 'all', 'types', 'of', 'travellers', 'overviewhong', 'kongthis', 'travel', 'guide', 'is', 'for', 'your', 'general', 'information', 'only', 'and', 'is', 'not', 'intended', 'as', 'advi

## Stop Word Removal using NLTK

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text_tokens = word_tokenize(content)

clean = [word for word in text_tokens if not word in stopwords.words()]
content = ' '.join(clean)
print(content)

hong kong travel guidehong kong bustling , global destination located china ’ south coast . set beautiful victoria harbour , ’ picturesque enthralling . temples , monuments towers hand-in-hand world-class restaurants , designer boutiques musical performances . hong kong ’ contemporary culture , mix asian languages , tolerance different religions creates harmonious environment . hong kong 9 hours flight sydney airport renowned luxurious rewarding holiday destination types travellers.overviewhong kongthis travel guide general information intended advice . make inquiries making decisions . sydney airport corporation limited represent recommend endorse ratings , advertisers , products services appearing guide . represent warrant material guide reliable , accurate , complete current , accept responsibility arising errors omissions . liable loss resulting action decision reliance material guide . hong kong bustling , global destination located china ’ south coast . set beautiful victoria har

## Lemmatization

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer=WordNetLemmatizer()
input_str=word_tokenize(content)

lemmatized_content = []

for word in input_str:
    lemmatized_word = lemmatizer.lemmatize(word)
    lemmatized_content.append(lemmatized_word)
    
# Joining the lemmatized words back into a string
content = ' '.join(lemmatized_content)
    
print(content)

hong kong travel guidehong kong bustling , global destination located china ’ south coast . set beautiful victoria harbour , ’ picturesque enthralling . temple , monument tower hand-in-hand world-class restaurant , designer boutique musical performance . hong kong ’ contemporary culture , mix asian language , tolerance different religion creates harmonious environment . hong kong 9 hour flight sydney airport renowned luxurious rewarding holiday destination type travellers.overviewhong kongthis travel guide general information intended advice . make inquiry making decision . sydney airport corporation limited represent recommend endorse rating , advertiser , product service appearing guide . represent warrant material guide reliable , accurate , complete current , accept responsibility arising error omission . liable loss resulting action decision reliance material guide . hong kong bustling , global destination located china ’ south coast . set beautiful victoria harbour , ’ picturesqu

## Extract Hyponyms and Hypernyms from Text

In [9]:
from nltk.corpus import wordnet
# Tokenize the content into words
words = nltk.word_tokenize(content)

# Initialize lists to store hyponyms and hypernyms
hyponyms = []
hypernyms = []

# Iterate over each word in the content
for word in words:
    synsets = wordnet.synsets(word)
    for synset in synsets:
        # Extract hyponyms (more specific words)
        hyponyms.extend([lemma.name() for lemma in synset.hyponyms()])
        # Extract hypernyms (more general words)
        hypernyms.extend([lemma.name() for lemma in synset.hypernyms()])

# Remove duplicate hyponyms and hypernyms
hyponyms = list(set(hyponyms))
hypernyms = list(set(hypernyms))

# Print the extracted hyponyms and hypernyms
print("Hyponyms:", hyponyms)
print("Hypernyms:", hypernyms)

Hyponyms: ['rocket_larkspur.n.01', 'trencher.n.02', 'vanish.v.02', 'daemon.n.02', 'naturalness.n.01', 'isomerize.v.01', 'concert.n.01', 'moonwalk.n.01', 'dessert_wine.n.01', 'virtu.n.03', 'relax.v.07', 'savory.n.04', 'hungarian.n.01', 'sass.n.01', 'slave.v.01', 'backspace.v.01', 'alienate.v.03', 'magnetization.n.02', 'date.n.05', 'deck.v.02', 'appointment.n.01', 'entrance.n.02', 'film_star.n.01', 'corner.v.02', 'demagnetize.v.02', 'sect.n.01', 'fair_chance.n.01', 'lance.v.01', 'strand.n.02', 'deep.n.03', 'impressiveness.n.02', 'emote.v.01', 'pass.n.15', 'book.n.02', 'freshen.v.02', 'pet_shop.n.01', 'papua.n.01', 'ocean_current.n.01', 'cry_out_for.v.01', 'rupture.n.03', 'hit.v.02', 'tip-off.n.01', 'linger.v.04', 'savarin.n.01', 'tetrazzini.n.01', 'tarot_card.n.01', 'pump.v.06', 'enumerate.v.01', 'pathos.n.03', 'mourn.v.02', 'dead_person.n.01', 'echelon.n.01', 'pop.n.03', 'second_base.n.01', 'continue.v.07', 'adventurer.n.01', 'table.v.02', 'vanishing_point.n.02', 'come.v.04', 'lightning

## Numbers Normalisation (has limitation)

In [10]:
from word2number import w2n
import re

def normalize_numbers(content):
    # Find all written numbers in the content
    written_numbers = re.findall(r'\b(one|two|three|four|five|six|seven|eight|nine|ten|\w+teen|\w+tenth|\w+tenths|\w+ty|\w+th)\b', content, flags=re.IGNORECASE)

    for word_number in written_numbers:
        try:
            # Convert to numeric value
            numeric_value = w2n.word_to_num(word_number)
            # Replace with numeric value
            content = content.replace(word_number, str(numeric_value))
        except ValueError:
            pass
    
    return content

content = normalize_numbers(content)
print(content)


hong kong travel guidehong kong bustling , global destination located china ’ south coast . set beautiful victoria harbour , ’ picturesque enthralling . temple , monument tower hand-in-hand world-class restaurant , designer boutique musical performance . hong kong ’ contemporary culture , mix asian language , tolerance different religion creates harmonious environment . hong kong 9 hour flight sydney airport renowned luxurious rewarding holiday destination type travellers.overviewhong kongthis travel guide general information intended advice . make inquiry making decision . sydney airport corporation limited represent recommend endorse rating , advertiser , product service appearing guide . represent warrant material guide reliable , accurate , complete current , accept responsibility arising error omission . liable loss resulting action decision reliance material guide . hong kong bustling , global destination located china ’ south coast . set beautiful victoria harbour , ’ picturesqu