In [1]:
# Import required libraries

import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
pd.options.mode.chained_assignment = None

In [2]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset and
# stop list read by the cells below (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the title dataset from the mounted Drive
TITLE_DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/Research/Data Frames/title-dataset.csv"
df = pd.read_csv(TITLE_DATASET_PATH)
# Preview the first five rows
df.head()

Unnamed: 0,Title,Intervention Area
0,"""Aarogya Setu"": The mobile application that mo...",Digital Information Services
1,"""I'm Not against Online Teaching, but What abo...",Policy & Regulation or Digital Services
2,"""What is the best method of family planning fo...",Digital Information Services
3,2018 Mobile Industry Impact Report: Sustainabl...,Digital Infrastructure Development
4,26 Recommendations on Content Governance: A Gu...,Cybersecurity


In [5]:
# Lowercase every title (vectorised string op); all later cleaning steps
# operate on this lowercased text.
df["Title"] = df["Title"].str.lower()

In [6]:
# remove punctuations

PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)

def remove_punctuation(text):
    """Return `text` with every character in PUNCT_TO_REMOVE deleted.

    Characters are deleted, not replaced by a space, so the pieces on either
    side of a punctuation mark are joined ("covid-19" -> "covid19").

    Args:
        text: String to clean.

    Returns:
        The string without any ASCII punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)

df["Title"] = df["Title"].apply(remove_punctuation)

In [7]:
# remove numbers

def remove_numbers(text):
    """Return `text` with every run of digits deleted (nothing is inserted in their place)."""
    return re.sub(r'\d+', '', text)

df["Title"] = df["Title"].apply(remove_numbers)

In [8]:
# strip everything that is neither an ASCII letter nor whitespace
def remove_non_alpha(text):
    """Return `text` keeping only the characters a-z, A-Z and whitespace."""
    return re.sub(r'[^a-zA-Z\s]', '', text)

df["Title"] = df["Title"].apply(remove_non_alpha)

In [9]:
# collapse runs of whitespace
def remove_extra_whitespaces(text):
    """Replace each run of whitespace in `text` with one space; the ends are not trimmed."""
    return re.sub(r'\s+', ' ', text)

df["Title"] = df["Title"].apply(remove_extra_whitespaces)

In [10]:
# drop one-character tokens such as "e", "a", "l"
def remove_single_letter_words(text):
    """Return the whitespace-separated tokens of `str(text)` longer than one character, joined by single spaces."""
    kept = []
    for token in str(text).split():
        if len(token) > 1:
            kept.append(token)
    return " ".join(kept)

df["Title"] = df["Title"].apply(remove_single_letter_words)

In [11]:
# Check whether any cleaned title occurs more than once.
# NOTE: the recorded output of this cell is True, i.e. duplicates DO exist
# after cleaning; nothing in this notebook drops those rows.
df.duplicated(subset=['Title']).any() # True means at least one duplicated title

True

In [12]:
# preprocessing 

import re #Regular expressions
import nltk #NLP library
import spacy
import string
from bs4 import BeautifulSoup #HTML tags
from nltk.corpus import stopwords #english stopwords
#lematising
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [13]:
# Fetch the NLTK stop-word corpus read by stopwords.words('english') in the stop-word cells below
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Extra stop words: lowercase single-token country-name words, followed by two
# domain terms ('digital', 'development') that are filtered the same way.
# Titles have punctuation stripped before this list is applied, so a hyphenated
# name can only ever appear in its joined form; 'timorleste' is listed for that
# reason ('timor-leste' is kept for backward compatibility but cannot match).
countries=['afghanistan', 'albania', 'algeria', 'andorra', 'angola', 'antigua', 'argentina',
           'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh',
           'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bhutan', 'bolivia', 'bosnia',
           'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cambodia', 'cameroon',
           'canada', 'cape', 'central', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo',
           'costa', 'croatia', 'cuba', 'cyprus', 'czech', 'denmark', 'djibouti', 'dominica',
           'dominican', 'east', 'ecuador', 'egypt', 'el', 'equatorial', 'eritrea', 'estonia',
           'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'gambia', 'georgia',
           'germany', 'ghana', 'greece', 'grenada', 'guatemala', 'guinea', 'guyana', 'haiti',
           'honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland',
           'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'kiribati',
           'korea', 'kosovo', 'kuwait', 'kyrgyzstan', 'laos', 'latvia', 'lebanon', 'lesotho',
           'liberia', 'libya', 'liechtenstein', 'lithuania', 'luxembourg', 'madagascar', 'malawi',
           'malaysia', 'maldives', 'mali', 'malta', 'marshall', 'mauritania', 'mauritius',
           'mexico', 'micronesia', 'moldova', 'monaco', 'mongolia', 'montenegro', 'morocco',
           'mozambique', 'myanmar', 'namibia', 'nauru', 'nepal', 'netherlands', 'new', 'nicaragua',
           'niger', 'nigeria', 'north', 'macedonia', 'norway', 'oman', 'pakistan', 'palau',
           'panama', 'papua', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar',
           'romania', 'russia', 'rwanda', 'saint', 'samoa', 'san', 'sao', 'senegal', 'serbia',
           'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'solomon', 'somalia',
           'south', 'spain', 'sri', 'sudan', 'suriname', 'sweden', 'switzerland',
           'syria', 'taiwan', 'tajikistan', 'tanzania', 'thailand', 'timor-leste', 'timorleste', 'togo',
           'tonga', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'tuvalu', 'uganda',
           'ukraine', 'united', 'uruguay', 'uzbekistan', 'vanuatu', 'vatican', 'venezuela',
           'vietnam', 'yemen', 'zambia', 'zimbabwe','digital','development']

In [15]:
from nltk.corpus import stopwords
# Base list: NLTK's English stop words
STOPWORDS = set(stopwords.words('english'))

# Project-specific stop words: whitespace-separated tokens read from Stoplist.txt
with open('/content/drive/MyDrive/Colab Notebooks/Research/Stoplist.txt', 'r') as stoplist_file:
    custom_stopwords = set(stoplist_file.read().split())

# Fold in the custom tokens and the `countries` list (place-name words plus a few domain terms)
STOPWORDS = STOPWORDS | custom_stopwords | set(countries)

def remove_stopwords(text, stopword_set=None):
    """Drop stop words from a whitespace-tokenised string.

    Each token is lowercased for the membership test only; tokens that are
    kept are returned with their original casing.

    Args:
        text: Value to filter; it is converted with ``str()`` before splitting.
        stopword_set: Optional collection of lowercase stop words. When omitted,
            the module-level ``STOPWORDS`` is used (looked up at call time).

    Returns:
        The surviving tokens joined by single spaces.
    """
    active = STOPWORDS if stopword_set is None else stopword_set
    return " ".join(word for word in str(text).split() if word.lower() not in active)

df["Title"] = df["Title"].apply(remove_stopwords)

In [None]:
# Fetch the NLTK resources requested for the lemmatisation step
# (same three downloads as in the preprocessing-imports cell above).
for resource in ('wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
# lemmatisation

# One shared lemmatiser, plus a lookup from the first letter of a POS tag to a WordNet POS constant
lemmatizer = WordNetLemmatizer()
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)
def lemmatize_words(text):
    """Lemmatise each whitespace-separated token of `text`.

    Tokens are tagged with nltk.pos_tag; the first letter of each tag selects
    the WordNet part of speech via wordnet_map, defaulting to noun when the
    letter is not in the map. The lemmas are returned joined by single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Lemmatise every title, then preview the result
df["Title"] = df["Title"].apply(lemmatize_words)
df.head()

In [19]:
# Second stop-word pass: lemmatisation can turn a token into a stop word
# (e.g. a plural reduced to its singular), so filter the titles again.
# This reuses STOPWORDS and remove_stopwords from the first pass, so both
# passes apply the same list (NLTK English + Stoplist.txt + `countries`) and
# the same case-insensitive comparison, rather than rebuilding a narrower set
# and shadowing the function with a second definition.
from nltk.corpus import stopwords

df["Title"] = df["Title"].apply(remove_stopwords)

In [20]:
# Persist the cleaned titles to Drive; index=True also writes the DataFrame's row index as the first CSV column.
df.to_csv("/content/drive/MyDrive/Colab Notebooks/Research/preprocessed_title_OVR.csv",index=True)