# Import dependencies

In [None]:
import os
import re
import unicodedata
import spacy
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load dataset

In [None]:
# load the environment variables from the .env file
load_dotenv()
# get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# fail early with a clear message: os.path.join(None, ...) would otherwise
# raise a TypeError that does not mention the missing variable
if dataset_path is None:
    raise RuntimeError("DATASET_PATH is not set; define it in the environment or in the .env file.")
# get the path of the folder that holds the article text files
articles_path = os.path.join(dataset_path, "articles")
# load the dataset into the notebook: the first CSV column becomes the index
# and the column at position 1 is parsed as dates
df = pd.read_csv(os.path.join(dataset_path, "article_info_V2.csv"), index_col=0, parse_dates=[1])

# Data cleaning

## Define cleaning functions

In [None]:
def clean_string(str):
    """Return a normalized copy of *str*.

    The value is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed, and
    leading/trailing whitespace is trimmed.
    """
    lowered = str.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

def parse_list_from_string(str):
    """Split a comma-separated string and clean every piece.

    Each piece between commas is passed through ``clean_string``; the
    cleaned pieces are returned as a list in their original order.
    """
    return [clean_string(piece) for piece in str.split(",")]

## Drop unused columns

In [None]:
# drop the Author, Type and Keywords columns in place (they are treated as unused in this notebook)
df.drop(columns=["Author", "Type", "Keywords"], inplace=True)

## Drop empty rows

In [None]:
# remove, in place, every row whose Date value is missing;
# Date is later used together with Title to detect duplicate articles
df.dropna(subset=["Date"], inplace=True)

## Merge datasets

In [None]:
# read the external dataset (semicolon separated; first column is the index,
# the column at position 1 is parsed as dates)
external_df = pd.read_csv(f"{dataset_path}/other-articles.csv", sep=";", index_col=0, parse_dates=[1])
# leave the Url column out of the merged dataset
external_df = external_df.drop(columns=["Url"])
# append the external rows to the main dataset and renumber all rows from 0
df = pd.concat([df, external_df], ignore_index=True)
# free the external dataset
del external_df

## Remove duplicate articles

In [None]:
# remember how many entries exist before removing duplicates
entries_before_duplicate_removal = len(df)
# rows sharing both title and date count as one article; only the first is kept
df.drop_duplicates(subset=["Title", "Date"], keep="first", inplace=True)
# measure again and report the difference
entries_after_duplicate_removal = len(df)
removed_duplicates = entries_before_duplicate_removal - entries_after_duplicate_removal
print(f"Removed {removed_duplicates} duplicate entries.")

## Parse the string lists

In [None]:
# show the type of a sample Tags value (the row labelled 0) before parsing
print(f"The type of values in the Tags column is {type(df.loc[0, 'Tags'])}.")
# convert every raw comma-separated Tags string to a list of cleaned strings
df["Tags"] = df["Tags"].apply(parse_list_from_string)
# show the type of the same sample value after parsing
print(f"The type of values in the Tags column is {type(df.loc[0, 'Tags'])}.")

## Define article loading function

In [None]:
# load article by title
def load_article(title, load_contents=True):
    """Open the article file named after *title* and return its text.

    The file is looked up as ``<articles_path>/<title>.txt`` and decoded as
    UTF-8.

    Args:
        title: Article title; used verbatim as the file name (without the
            ``.txt`` extension).
        load_contents: When true (the default) the file is read and its text
            is returned. When false the file is only opened, which still
            verifies that it exists.

    Returns:
        The article text when ``load_contents`` is true; otherwise the
        already closed file object (its ``name`` attribute holds the path).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # build the path of the article from its title
    file_path = os.path.join(articles_path, f"{title}.txt")
    # the context manager closes the file even when reading fails
    with open(file_path, "r", encoding="utf-8") as file:
        if load_contents:
            return file.read()
    # kept for backward compatibility: callers receive the closed file object
    return file

## Remove articles which cannot be found

In [None]:
# collect the index labels of the rows whose article cannot be loaded
missing_indexes = []

# iterate over the titles together with their index labels
for index, title in df["Title"].items():
    try:
        # attempt to load the article
        load_article(title)
    except (OSError, ValueError):
        # OSError: the file is missing or unreadable
        # ValueError: the path is invalid or the contents are not valid UTF-8
        # (a bare except would also swallow KeyboardInterrupt)
        missing_indexes.append(index)

# the error count is the number of rows that could not be loaded
err_count = len(missing_indexes)

# remove all unloadable rows in one operation instead of one drop per row
df.drop(index=missing_indexes, inplace=True)

# reset the index of the main dataset
df.reset_index(drop=True, inplace=True)

# print the number of errors
print(f"{err_count} files could not be loaded by title!")

# Define article cleaning methods

## Load spacy model

In [None]:
# load the small english spacy model; `nlp` is the shared pipeline object
# that the tokenize function and the summarization cell call
nlp = spacy.load("en_core_web_sm")

## Replace accented characters

In [None]:
# replace characters like é, ë, ï, etc. with their corresponding characters
def remove_accented_characters(text):
    """Return *text* reduced to plain ASCII.

    The text is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks; every character that still has no ASCII
    representation afterwards is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8", "ignore")

## Removing newline characters

In [None]:
# remove all newline characters
def remove_newlines(text):
    """Replace line breaks in *text* with spaces.

    Doubled breaks are replaced first so that a pair becomes a single space;
    the replacements run in the order listed below.
    """
    for line_break in ("\n\n", "\n", "\r\r", "\r"):
        text = text.replace(line_break, " ")
    return text

## Replace shortened grammar

In [None]:
# replace shortened grammar with full grammar
def replace_grammar(text):
    """Expand common English contractions in *text*.

    Both the ASCII apostrophe and the typographic apostrophe (U+2019) are
    recognized. A contraction is only expanded when it is attached to the
    end of a word, so quoted words such as ``'this'`` are left alone.

    Handled forms:
        * ``can't`` -> ``can not``, ``won't`` -> ``will not``
        * ``<word>n't`` -> ``<word> not`` (``don't`` -> ``do not``)
        * ``it's`` / ``he's`` / ``she's`` -> ``... is`` (any letter case)
        * ``'re``, ``'ll``, ``'ve``, ``'d``, ``'m`` -> ``are``, ``will``,
          ``have``, ``would``, ``am``

    Args:
        text: The text to expand.

    Returns:
        The text with the contractions above written out.
    """
    # accept the ASCII apostrophe as well as the typographic one
    apostrophe = r"['\u2019]"

    # negations whose stem changes when written out
    text = re.sub(rf"\b([Cc])an{apostrophe}t\b", r"\1an not", text)
    text = re.sub(rf"\b([Ww])on{apostrophe}t\b", r"\1ill not", text)
    # regular negations; handled before the generic "'t" rule so that the
    # "n" of "n't" is not left attached to the stem ("don't" -> "do not")
    text = re.sub(rf"(?<=\w)n{apostrophe}t\b", " not", text)
    # "it's", "he's" and "she's" regardless of capitalization
    text = re.sub(rf"\b(it|he|she){apostrophe}s\b", r"\1 is", text, flags=re.IGNORECASE)

    # NOTE(review): any other "'s" (mostly possessives) is still expanded to
    # " its", as before - confirm that this is the intended behaviour
    suffix_expansions = (
        ("s", " its"),
        ("t", " not"),
        ("re", " are"),
        ("ll", " will"),
        ("ve", " have"),
        ("d", " would"),
        ("m", " am"),
    )
    for suffix, expansion in suffix_expansions:
        # the suffix must follow a word character and end at a word boundary
        text = re.sub(rf"(?<=\w){apostrophe}{suffix}\b", expansion, text)
    return text

## Remove double whitespace characters

In [None]:
# remove double whitespace characters
def remove_double_whitespace(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    unchanged.
    """
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", text)

## Remove noise from text

In [None]:
# remove noise from text
def remove_noise(text):
    """Run *text* through all character-level cleaning steps.

    The steps are applied in this order: newlines to spaces, contraction
    expansion, reduction to ASCII, removal of punctuation, removal of
    digits, collapsing of whitespace runs, lowercasing.
    """
    cleaning_steps = (
        remove_newlines,
        replace_grammar,
        remove_accented_characters,
        # drop everything that is neither a word character nor whitespace
        lambda value: re.sub(r"[^\w\s]", "", value),
        # drop digits
        lambda value: re.sub(r"\d+", "", value),
        remove_double_whitespace,
    )
    for step in cleaning_steps:
        text = step(text)
    # lowercasing is the final step
    return text.lower()

## Tokenize text

In [None]:
# tokenize the text
def tokenize(text):
    """Process *text* with the shared spaCy pipeline ``nlp`` and return its result."""
    parsed = nlp(text)
    return parsed

## Removal of stop words

In [None]:
# remove stop words from the text
def remove_stop_words(tokens):
    """Return the tokens whose ``is_stop`` flag is false, in their original order."""
    kept_tokens = []
    for token in tokens:
        if token.is_stop:
            # stop word: leave it out
            continue
        kept_tokens.append(token)
    return kept_tokens

## Lemmatisation

In [None]:
# lemmatize the text
def lemmatize(tokens):
    """Return the ``lemma_`` string of every token, in the original order."""
    lemmas = []
    for token in tokens:
        lemmas.append(token.lemma_)
    return lemmas

## Main preprocessing method

In [None]:
# the main preprocessing method which calls all cleaning methods
def preprocess_text(text):
    """Turn raw article text into a list of lemma strings.

    The text is cleaned with ``remove_noise``, tokenized, stripped of stop
    words and finally lemmatized.
    """
    cleaned_text = remove_noise(text)
    content_tokens = remove_stop_words(tokenize(cleaned_text))
    return lemmatize(content_tokens)

### commented for now - Preprocess all articles and append them to the dataset

In [None]:
# walk through the dataset row by row
for index, row in df.iterrows():
    # the title identifies the article file
    title = row["Title"]
    # load the article, preprocess it and join the resulting tokens
    # into one lowercase string
    article_contents = " ".join(preprocess_text(load_article(title))).lower()
    # store the preprocessed text in the row of this article
    df.loc[index, "Preprocessed_Text"] = article_contents
    # report the progress
    print(f"Preprocessed {index + 1} of {len(df)} articles.")

### temporary visualization of the preprocessed data of one article

In [None]:
# index label of the article to inspect
article_index = 1234
# look up the title of that article
title = df.loc[article_index, "Title"]
# load the body of the article and preprocess it into a list of tokens
article_contents = preprocess_text(load_article(title))
# show the preprocessed tokens as one string
total_text = " ".join(article_contents)
print(total_text)

# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# wc=WordCloud(max_font_size=60).generate(total_text)
# plt.figure(figsize=(16, 12))
# plt.imshow(wc, interpolation="bilinear")

# Data preparation

## Define one hot encoding function

In [None]:
# count how often each value occurs in a column whose cells are lists of strings
def get_unique_value_frequency(df_column):
    """Count how often each value occurs across all lists in *df_column*.

    Args:
        df_column: An iterable of iterables (e.g. a column whose cells are
            lists of strings).

    Returns:
        A plain ``dict`` mapping every distinct value to its number of
        occurrences, keyed in order of first appearance.
    """
    from collections import Counter
    from itertools import chain

    # flatten the column and let Counter do the counting; convert back to a
    # plain dict so callers keep receiving the same type
    return dict(Counter(chain.from_iterable(df_column)))

# return 1 if the list contains the given word, otherwise 0
def list_has_word(l, word):
    """Return ``1`` when *word* is contained in *l*, otherwise ``0``."""
    # a conditional expression instead of the fragile "cond and a or b" form
    return 1 if word in l else 0

# one hot encode a dataframe's column that contains lists of strings in each value
def custom_one_hot_encoding(df, column_name, prefix=None, prefix_sep="_"):
    """One hot encode a column whose cells are lists of strings.

    For every distinct string found in ``df[column_name]`` one indicator
    column is created that holds ``1`` for the rows whose list contains the
    string and ``0`` otherwise.

    Args:
        df: The dataframe that holds the column.
        column_name: Name of the column to encode.
        prefix: Prefix of the new column names; when empty or ``None`` the
            original column name is used instead.
        prefix_sep: Separator placed between the prefix and the value.

    Returns:
        A new dataframe that contains only the indicator columns, named
        ``<prefix><prefix_sep><value>`` with spaces in the value replaced by
        underscores.
    """
    # the prefix does not depend on the value, so choose it once
    column_prefix = prefix if prefix else column_name
    list_column = df[column_name]
    one_hot_encoded_columns = {}
    for unique_value in get_unique_value_frequency(list_column):
        # column names use underscores instead of spaces
        clean_unique_value = unique_value.replace(" ", "_")
        new_column_name = f"{column_prefix}{prefix_sep}{clean_unique_value}"
        # mark every row whose list contains the current value
        one_hot_encoded_columns[new_column_name] = list_column.apply(
            lambda l, word=unique_value: list_has_word(l, word)
        )
    # return a new dataframe with the one hot encoded columns
    return pd.DataFrame(one_hot_encoded_columns)

## Execute one hot encoding function

In [None]:
# one hot encode the tags column, attach the indicator columns to the main
# dataframe by index and leave out the original tags column
df = df.join(custom_one_hot_encoding(df, "Tags", "tag")).drop(columns=["Tags"])

# Summarizing

### Choose index number

In [68]:
# Index label of the article to summarize; the cells below use it to look up the title
article_index = 7685

### Summarization using spaCy

In [69]:
# Import the necessary libraries
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

# Load the article
title = df["Title"][article_index]
article_contents = load_article(title)
preprocessed_text = preprocess_text(article_contents)
total_text = " ".join(preprocessed_text)

# Parse the raw article text with spaCy; the summary is built from the
# original sentences, not from the preprocessed tokens
doc = article_contents
doc_nlp = nlp(doc)

# Collect the keyword candidates: tokens that are neither stop words nor
# punctuation and whose part of speech is listed in pos_tag
stopwords = list(STOP_WORDS)
pos_tag = ['PROPN', 'ADJ', 'NOUN', 'VERB']
keyword = [
    token.text
    for token in doc_nlp
    # test against the STOP_WORDS set itself: a set lookup is O(1) per token,
    # scanning the list copy above is not
    if token.text not in STOP_WORDS
    and token.text not in punctuation
    and token.pos_ in pos_tag
]

# Count how often every keyword occurs
freq_word = Counter(keyword)

# Turn the counts into weights by dividing by the highest count;
# default=1 keeps an article without any keyword from raising
max_freq = max(freq_word.values(), default=1)
for word in freq_word:
    freq_word[word] = freq_word[word] / max_freq

# Score every sentence by the summed weight of the keywords it contains;
# sentences without any keyword get no entry
sent_strength = {}
for sent in doc_nlp.sents:
    weights = [freq_word[word.text] for word in sent if word.text in freq_word]
    if weights:
        sent_strength[sent] = sum(weights)

# Keep the three highest scoring sentences (ordered by score, not by their
# position in the article) and join them into the summary
summarized_sentences = nlargest(3, sent_strength, key=sent_strength.get)
final_sentences = [w.text for w in summarized_sentences]
summary = ' '.join(final_sentences)

# Print the summary
print(summary)



InSight Crime Analysis

The charges filed against the accused, the highest ranking military officials charged with links to organized crime in fifteen years, could deal a blow to President Calderon’s anti-organized crime legacy. 

On July 31, a federal judge in Toluca, Mexico State, charged six high-ranking members of the Mexican army, including the former second-highest ranking defense official and three retired generals, for ties to the Beltran Leyva Organization (BLO), reported Proceso. Six high-ranking members of the Mexican army, including four generals, have been formally charged with collaborating with drug traffickers in what is likely the biggest corruption case of Felipe Calderon’s presidency.


### Weighted word frequencies

In [70]:
# Show the ten keywords with the highest weight; the weights are the keyword
# counts divided by the highest count, as computed in the summarization cell
freq_word.most_common(10)

[('army', 1.0),
 ('crime', 0.875),
 ('Calderon', 0.75),
 ('organized', 0.75),
 ('retired', 0.625),
 ('ranking', 0.5),
 ('Mexican', 0.5),
 ('charged', 0.5),
 ('members', 0.375),
 ('including', 0.375)]

### Summarization using Pegasus

In [71]:
# Import the necessary libraries
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Configure model
model_name = 'google/pegasus-xsum'
# use the GPU when one is available, otherwise the CPU
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# the device choice only takes effect when the model is moved to it
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

# Set the article (the raw text defined in the spaCy summarization cell)
src_text = doc

# Create a summary
# the input tensors must be on the same device as the model
batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors='pt').to(torch_device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)

# Print the summary
print(tgt_text)

['All photographs courtesy of AFP, EPA, Getty Images and Reuters']


### Get title and tags from article

In [72]:
# Print the title of the article based on the index number
print(df["Title"][article_index])

print()

# Print the tags of the article based on the index number
if "Tags" in df.columns:
    print(df["Tags"][article_index])
else:
    # the one hot encoding cell drops "Tags"; rebuild the list from the
    # "tag_<name>" indicator columns it creates (spaces were stored as "_",
    # so a tag that itself contained "_" is shown with a space instead)
    tag_prefix = "tag_"
    print([
        column[len(tag_prefix):].replace("_", " ")
        for column in df.columns
        if column.startswith(tag_prefix) and df.loc[article_index, column] == 1
    ])

Mexican Generals Charged with Drug Trafficking

['mexico', 'los monos', 'el salvador', 'bolivia', 'colombia personalities', 'beltran leyva org']


### Print article

In [75]:
# Print the raw article text as loaded by the summarization cell (not the preprocessed tokens)
print(article_contents)

Six high-ranking members of the Mexican army, including four generals, have been formally charged with collaborating with drug traffickers in what is likely the biggest corruption case of Felipe Calderon’s presidency.

On July 31, a federal judge in Toluca, Mexico State, charged six high-ranking members of the Mexican army, including the former second-highest ranking defense official and three retired generals, for ties to the Beltran Leyva Organization (BLO), reported Proceso. The judge also charged four members of the BLO, including US-born Hector Valdez Villareal, alias “La Barbie,” whose testimony is thought to have implicated the army officials.

Four of the six accused have been detained since May under the pre-trial detention process known as “arraigo,” which allows prosecutors to hold individuals suspected of participating in organized crime for up to 80 days. The four are: former Assistant Secretary of Defense Tomas Angeles Dauahare (pictured), retired Divisional General Ricar