# Import dependencies

In [31]:
import os
import re
import unicodedata
import spacy
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load dataset

In [32]:
# load the environment variables from the .env file
load_dotenv()
# get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# get the articles path for the known publisher
articles_path = os.path.join(dataset_path, "articles")
# load the dataset into the notebook
df = pd.read_csv(f"{dataset_path}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Data cleaning

## Define cleaning functions

In [33]:
def clean_string(str):
    # make the string lowercase
    str = str.lower()
    # remove all non-alphanumeric characters
    str = re.sub(r"[^\w\s]", "", str)
    # remove the leading and trailing spaces
    return str.strip()

def parse_list_from_string(str):
    # split the string on each comma
    raw_list = str.split(",")
    # clean every string in the list
    return list(map(clean_string, raw_list))

## Drop unused columns

In [34]:
# drop the author, type and keywords columns
df.drop(columns=["Author", "Type", "Keywords"], inplace=True)

## Drop empty rows

In [35]:
# remove rows with empty date values
df.dropna(subset=["Date"], inplace=True)

## Merge datasets

In [36]:
# read the external dataset
external_df = pd.read_csv(f"{dataset_path}/other-articles.csv", index_col=0, parse_dates=[1], sep=";")
# remove the url column from the external dataset
external_df.drop(columns=["Url"], inplace=True)
# add the external dataset to the main dataset
df = pd.concat([df, external_df])
# reset the index of the main dataset
df.reset_index(drop=True, inplace=True)
# clear the external dataset variable
del external_df

## Remove duplicate articles

In [37]:
# count the number of entries before duplicate removal
entries_before_duplicate_removal = len(df)
# remove duplicate entries by title and date
df.drop_duplicates(subset=["Title", "Date"], keep="first", inplace=True)
# count the number of entries after duplicate removal
entries_after_duplicate_removal = len(df)
# print removed duplicate count
print(f"Removed {entries_before_duplicate_removal - entries_after_duplicate_removal} duplicate entries.")

Removed 3 duplicate entries.


## Parse the string lists

In [38]:
# show the tags column before parsing
print(f"The type of values in the Tags column is {type(df.loc[0, 'Tags'])}.")
# convert the raw string values of the Tags column to lists of strings
df["Tags"] = df["Tags"].apply(parse_list_from_string)
# show the tags column after parsing
print(f"The type of values in the Tags column is {type(df.loc[0, 'Tags'])}.")

The type of values in the Tags column is <class 'str'>.
The type of values in the Tags column is <class 'list'>.


## Define article loading function

In [39]:
# load article by title
def load_article(title, load_contents=True):
    # create file name from title
    file_name = f"{title}.txt"
    # get the path of the article
    file_path = os.path.join(articles_path, file_name)
    # read the article
    file = open(file_path, "r", encoding="utf-8")
    # return the contents of the article if requested
    if load_contents:
        # read the contents of the article
        contents = file.read()
        # close the file
        file.close()
        # return the contents of the article
        return contents
    # close the file
    file.close()
    # return the article path
    return file

## Remove articles which cannot be found

In [40]:
# initialize error count to 0 
err_count = 0

# iterate over dataset with index
for index, row in df.iterrows():
    # get the title of the article
    title = row["Title"]
    try:
        # attempt to load the article
        article = load_article(title)
    except:
        # if the article cannot be loaded, increment the error count
        err_count += 1
        # remove row from dataset
        df.drop(index, inplace=True)
        # continue to next row
        continue

# reset the index of the main dataset
df.reset_index(drop=True, inplace=True)

# print the number of errors
print(f"{err_count} files could not be loaded by title!")

30 files could not be loaded by title!


# Define article cleaning methods

## Load spacy model

In [41]:
# load the small english spacy model
nlp = spacy.load("en_core_web_sm")

## Replace accented characters

In [42]:
# replace characters like é, ë, ï, etc. with their corresponding characters
def remove_accented_characters(text):
    return unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8", "ignore")

## Removing newline characters

In [43]:
# remove all newline characters
def remove_newlines(text):
    text = text.replace("\n\n", " ")
    text = text.replace("\n", " ")
    text = text.replace("\r\r", " ")
    text = text.replace("\r", " ")
    return text

## Replace shortened grammar

In [44]:
# replace shortened grammar with full grammar
def replace_grammar(text):
    text = text.replace("it's", "it is")
    text = text.replace("he's", "he is")
    text = text.replace("she's", "she is")
    text = text.replace("'s", " its")
    text = text.replace("'t", " not")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")
    text = text.replace("'ve", " have")
    text = text.replace("'d", " would")
    text = text.replace("'m", " am")
    return text

## Remove double whitespace characters

In [45]:
# remove double whitespace characters
def remove_double_whitespace(text):
    return re.sub(r"\s\s+", " ", text)

## Remove noise from text

In [46]:
# remove noise from text
def remove_noise(text):
    # remove newline characters
    text = remove_newlines(text)
    # replace short grammar with full grammar
    text = replace_grammar(text)
    # remove accented characters
    text = remove_accented_characters(text)
    # remove punctuation
    text = re.sub(r"[^\w\s]", "", text)
    # remove digits
    text = re.sub(r"\d+", "", text)
    # remove double whitespace characters
    text = remove_double_whitespace(text)
    # lowercase the text
    text = text.lower()
    # return the cleaned text
    return text

## Tokenize text

In [47]:
# tokenize the text
def tokenize(text):
    return nlp(text)

## Removal of stop words

In [48]:
# remove stop words from the text
def remove_stop_words(tokens):
    return [token for token in tokens if not token.is_stop]

## Lemmatisation

In [49]:
# lemmatize the text
def lemmatize(tokens):
    return [token.lemma_ for token in tokens]

## Main preprocessing method

In [50]:
# the main preprocessing method which calls all cleaning methods
def preprocess_text(text):
    text = remove_noise(text)
    tokens = tokenize(text)
    tokens = remove_stop_words(tokens)
    tokens = lemmatize(tokens)
    return tokens

In [104]:
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

keyword = []
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN','VERB']
for token in doc:
    if(token.text in stopwords or token.text in punctuation):
        continue
    if(token.pos_ in pos_tag):
        keyword.append(token.text)

freq_word = Counter(keyword)
freq_word.most_common(10)

[('Moïse', 11),
 ('drug', 11),
 ('Times', 10),
 ('Haiti', 9),
 ('cocaine', 8),
 ('according', 7),
 ('DEA', 7),
 ('trafficking', 6),
 ('officials', 5),
 ('airstrips', 5)]

In [107]:
max_freq = Counter(keyword).most_common(1)[0][1]
for word in freq_word.keys():
    freq_word[word] = (freq_word[word]/max_freq)
freq_word.most_common(5)

[('Moïse', 1.0),
 ('drug', 1.0),
 ('Times', 0.9090909090909091),
 ('Haiti', 0.8181818181818182),
 ('cocaine', 0.7272727272727273)]

In [108]:
sent_strength = {}
for sent in doc.sents:
    for word in sent:
        if word.text in freq_word.keys():
            if sent in sent_strength.keys():
                sent_strength[sent]+=freq_word[word.text]
            else:
                sent_strength[sent]=freq_word[word.text]
print(sent_strength)

{An explosive new report suggests that the high-profile assassination of Jovenel Moïse may have been related to a crackdown on drug trafficking and a list he was compiling of Haitian business and political elites involved in the trade, adding yet another theory to the possible motives for the former president's killing.: 6.636363636363634, 

Before he was shot dead, President Moïse had planned to hand the names over to the US government, according to a New York Times report published December 12.: 4.2727272727272725, The Times spoke to four senior Haitian advisers and officials who had knowledge of the document.: 2.3636363636363633, Unnamed officials also told the Times that the hitmen had confessed to ransacking Moïse's house in search of the list.: 3.3636363636363633, 

"The president had ordered the officials to spare no one, not even the power brokers who had helped propel him into office," the Times reported.: 2.7272727272727266, 

A “central figure” included on the list, accordin

In [110]:
summarized_sentences = nlargest(3, sent_strength, key=sent_strength.get)
final_sentences = [ w.text for w in summarized_sentences ]
summary = ' '.join(final_sentences)
print(summary)



In 2015, Saint-Rémy allegedly met with senior Drug Enforcement Administration (DEA) officials, raising questions of corruption, according to Keith McNichols, a former DEA agent who was investigating the smuggling of hundreds of kilograms of cocaine and heroin from Colombia to Haiti. An explosive new report suggests that the high-profile assassination of Jovenel Moïse may have been related to a crackdown on drug trafficking and a list he was compiling of Haitian business and political elites involved in the trade, adding yet another theory to the possible motives for the former president's killing. The Times previously reported that US anti-drug officials who had worked in Haiti had suspected Saint-Rémy’s involvement in drug trafficking.


In [117]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Configure model
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

TypeError: 'NoneType' object is not callable

### commented for now - Preprocess all articles and append them to the dataset

In [55]:
# iterate over every row with index
for index, row in df.iterrows():
    # get the current title
    title = row["Title"]
    # get the current text
    article_contents = load_article(title)
    # preprocess the text
    article_contents = preprocess_text(article_contents)
    # join all tokens together
    article_contents = " ".join(article_contents)
    # make the text lowercase
    article_contents = article_contents.lower()
    # add the preprocessed text to the dataset
    df.loc[index, "Preprocessed_Text"] = article_contents
    # show the progress
    print(f"Preprocessed {index + 1} of {len(df)} articles.")

Preprocessed 1 of 10180 articles.
Preprocessed 2 of 10180 articles.
Preprocessed 3 of 10180 articles.
Preprocessed 4 of 10180 articles.
Preprocessed 5 of 10180 articles.
Preprocessed 6 of 10180 articles.
Preprocessed 7 of 10180 articles.
Preprocessed 8 of 10180 articles.
Preprocessed 9 of 10180 articles.
Preprocessed 10 of 10180 articles.
Preprocessed 11 of 10180 articles.
Preprocessed 12 of 10180 articles.
Preprocessed 13 of 10180 articles.
Preprocessed 14 of 10180 articles.
Preprocessed 15 of 10180 articles.
Preprocessed 16 of 10180 articles.
Preprocessed 17 of 10180 articles.
Preprocessed 18 of 10180 articles.
Preprocessed 19 of 10180 articles.
Preprocessed 20 of 10180 articles.
Preprocessed 21 of 10180 articles.
Preprocessed 22 of 10180 articles.
Preprocessed 23 of 10180 articles.
Preprocessed 24 of 10180 articles.
Preprocessed 25 of 10180 articles.
Preprocessed 26 of 10180 articles.
Preprocessed 27 of 10180 articles.
Preprocessed 28 of 10180 articles.
Preprocessed 29 of 10180 arti

KeyboardInterrupt: 

### temporary visualization of the preprocessed data of one article

In [88]:
# define the article index
article_index = 1234
# get the title of an article
title = df["Title"][article_index]
# load the body of the article
article_contents = load_article(title)
# preprocess the text
article_contents = preprocess_text(article_contents)
# show the progress
total_text = " ".join(article_contents)
print(total_text)

# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# wc=WordCloud(max_font_size=60).generate(total_text)
# plt.figure(figsize=(16, 12))
# plt.imshow(wc, interpolation="bilinear")

el salvador attorney general raul melara call evidence witness statement unseal corruption case information available public seemingly transparency melara demand come witness implicate deputy attorney general allan hernandez bribery scheme s land prosecutor jail spokesman attorney general office say measure melara anticorruption fight unseal record expose witness testify deputy alleged corruption salvadoran law allow sealing evidence protect victim witness alike addition withhold identity minor age sealing request prosecutor case judge officially impose regulation certain portion case file withhold view specific statement witness receive benefit exchange testimony people right know request seal lift corruptionrelate case attorney general melara write tweet el salvador news profile majority open corruption case recent year evidence seal identity witness withhold request attorney general office measure prove vital recent embezzling case president antonio saca mauricio fune melaras predec

# Data preparation

## Define one hot encoding function

In [None]:
# get unique values from a 2D array of strings
def get_unique_value_frequency(df_column):
    # create a dictionary to store the unique values
    unique_values = {}
    # iterate over the column
    for value_list in df_column:
        # iterate over the values in the list
        for value in value_list:
            if value not in unique_values:
                # if the value is not in the dictionary, add it
                unique_values[value] = 1
            else:
                # if the value is in the dictionary, increment the value
                unique_values[value] += 1
    # return the dictionary of unique values
    return unique_values

# check if a list contains a certain word and returns a binary boolean value
def list_has_word(l, word):
    return word in l and 1 or 0

# one hot encode a dataframe's column that contains lists of strings in each value
def custom_one_hot_encoding(df, column_name, prefix=None, prefix_sep="_"):
    # create a dictionary to store the one hot encoded columns
    one_hot_encoded_columns = {}
    # get the unique values from the column
    unique_values = get_unique_value_frequency(df[column_name])
    # iterate over the unique values
    for unique_value in unique_values:
        # create a clean string of the unique value
        clean_unique_value = unique_value.replace(" ", "_")
        # create a new column name
        new_column_name = prefix and f"{prefix}{prefix_sep}{clean_unique_value}" or f"{column_name}{prefix_sep}{clean_unique_value}"
        # one hot encode the column using the current unique value
        ohe_list = df[column_name].apply(lambda l: list_has_word(l, unique_value))
        # add the new list to the dictionary
        one_hot_encoded_columns[new_column_name] = ohe_list
    # return a new dataframe with the one hot encoded columns
    return pd.DataFrame(one_hot_encoded_columns)

## Execute one hot encoding function

In [None]:
# one hot encode the tags column of the dataframe
ohe_tags_df = custom_one_hot_encoding(df, "Tags", "tag")
# merge the one hot encoded tags dataframe with the main dataframe by index
df = df.join(ohe_tags_df)
# drop the tags column from the main dataframe
df.drop(columns=["Tags"], inplace=True)
# delete the one hot encoded dataframe variable
del ohe_tags_df