# Import dependencies

In [None]:
import os
import re
from collections import Counter
from itertools import chain

import pandas as pd
from dotenv import load_dotenv
from matplotlib import pyplot as plt

# Load dataset

In [None]:
# load the environment variables from the .env file
load_dotenv()
# get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# fail early with a clear message instead of a TypeError raised by os.path.join
if not dataset_path:
    raise RuntimeError("The DATASET_PATH environment variable is not set.")
# get the articles path for the known publisher
articles_path = os.path.join(dataset_path, "articles")
# load the dataset into the notebook: the first column becomes the index and
# the column at position 1 is parsed as dates
df = pd.read_csv(os.path.join(dataset_path, "article_info_V2.csv"), index_col=0, parse_dates=[1])

# Data cleaning

## Define cleaning functions

In [None]:
def clean_string(str):
    """Return a normalised copy of the given string.

    The string is lowercased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is removed, and the
    leading and trailing whitespace is trimmed.
    """
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so
    # that existing keyword callers continue to work
    without_punctuation = re.sub(r"[^\w\s]", "", str.lower())
    return without_punctuation.strip()

def parse_list_from_string(str):
    """Split a comma separated string into a list of cleaned strings.

    Every entry is passed through ``clean_string``. Missing values
    (``None``/``NaN``, as pandas uses for empty cells) yield an empty list,
    and entries that are empty after cleaning are dropped.
    """
    # a missing cell is not a string and has no ``split`` method
    if pd.isna(str):
        return []
    # clean every entry of the comma separated string
    cleaned_entries = map(clean_string, str.split(","))
    # skip entries that are empty after cleaning (e.g. from a trailing comma)
    return [entry for entry in cleaned_entries if entry]

## Drop unused columns

In [None]:
# remove the columns that are not used: author, type and keywords
unused_columns = ["Author", "Type", "Keywords"]
df.drop(columns=unused_columns, inplace=True)

## Drop empty rows

In [None]:
# discard every row that has no date value
df.dropna(axis="index", subset=["Date"], inplace=True)

## Merge datasets

In [None]:
# load the external dataset, which uses ";" as its separator
external_df = pd.read_csv(
    f"{dataset_path}/other-articles.csv",
    sep=";",
    index_col=0,
    parse_dates=[1],
)
# drop the url column of the external dataset
external_df = external_df.drop(columns=["Url"])
# append the external rows to the main dataset and renumber the index
df = pd.concat([df, external_df], ignore_index=True)
# release the external dataset variable
del external_df

## Remove duplicate articles

In [None]:
# remember how many entries exist before the duplicates are removed
entries_before_duplicate_removal = df.shape[0]
# keep only the first entry of every title and date combination
df.drop_duplicates(subset=["Title", "Date"], keep="first", inplace=True)
# determine how many entries are left
entries_after_duplicate_removal = df.shape[0]
# report how many duplicates were removed
removed_duplicate_count = entries_before_duplicate_removal - entries_after_duplicate_removal
print(f"Removed {removed_duplicate_count} duplicate entries.")

## Parse the string lists

In [None]:
# report the value type of the Tags column before the conversion
tags_value_type = type(df.loc[0, "Tags"])
print(f"The type of values in the Tags column is {tags_value_type}.")
# turn every raw tag string into a list of cleaned tag strings
df["Tags"] = df["Tags"].apply(parse_list_from_string)
# report the value type again to confirm the conversion
tags_value_type = type(df.loc[0, "Tags"])
print(f"The type of values in the Tags column is {tags_value_type}.")

## Define article loading function

In [None]:
# load article by title
def load_article(title, load_contents=False):
    """Open the article file that belongs to the given title.

    The file is looked up as ``<title>.txt`` inside ``articles_path`` and
    opened for reading as UTF-8 text.

    Returns the full text of the article when ``load_contents`` is true;
    otherwise the open file object, which the caller has to close.

    Raises ``OSError`` (e.g. ``FileNotFoundError``) when the file cannot be
    opened.
    """
    # build the path of the article from its title
    file_path = os.path.join(articles_path, f"{title}.txt")
    if load_contents:
        # the context manager closes the file once its contents are read
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    # hand the open file to the caller, who becomes responsible for closing it
    return open(file_path, "r", encoding="utf-8")

## Check for articles which cannot be found

In [None]:
# check that the article file of every title can be opened
titles = df["Title"]
err_count = 0
for title in titles:
    try:
        article_file = load_article(title)
    except (OSError, ValueError):
        # OSError: the file is missing or inaccessible
        # ValueError: the title cannot be used as a path (e.g. embedded null byte)
        print(f"Failed: {title}")
        err_count += 1
    else:
        # only the check matters here, so release the file handle right away
        article_file.close()

print(f"{err_count} files could not be loaded by title!")

# Data preparation

## Define one hot encoding function

In [None]:
# count how often each unique value occurs in a column of lists
def get_unique_value_frequency(df_column):
    """Count the occurrences of every value in a column of lists.

    ``df_column`` is an iterable whose items are themselves iterables of
    hashable values (e.g. a pandas Series of string lists).

    Returns a dictionary that maps each value to its number of occurrences,
    with the keys in order of first occurrence.
    """
    # flatten the lists and let Counter do the counting; convert back to a
    # plain dictionary so the return type stays the same for callers
    return dict(Counter(chain.from_iterable(df_column)))

# check whether a list contains a certain word and return the result as 1 or 0
def list_has_word(l, word):
    """Return 1 when ``word`` is an element of ``l`` and 0 otherwise."""
    return 1 if word in l else 0

# one hot encode a dataframe's column that contains lists of strings in each value
def custom_one_hot_encoding(df, column_name, prefix=None, prefix_sep="_"):
    """One hot encode a column whose values are lists of strings.

    For every distinct string in the lists a column named
    ``<prefix><prefix_sep><value>`` is created (spaces in the value become
    underscores). It holds 1 for the rows whose list contains the value and 0
    for all other rows. When ``prefix`` is ``None`` or empty, ``column_name``
    is used as the prefix.

    Returns a new dataframe that contains only the encoded columns, ordered
    by the first occurrence of their values; ``df`` itself is not modified.
    """
    column = df[column_name]
    # the prefix is the same for every new column, so resolve it once
    name_prefix = prefix if prefix else column_name
    # collect the distinct values in order of first occurrence
    unique_values = dict.fromkeys(
        value for value_list in column for value in value_list
    )
    # create a dictionary to store the one hot encoded columns
    one_hot_encoded_columns = {}
    for unique_value in unique_values:
        # create the new column name from a clean string of the unique value
        clean_unique_value = unique_value.replace(" ", "_")
        new_column_name = f"{name_prefix}{prefix_sep}{clean_unique_value}"
        # mark the rows whose list contains the current unique value
        ohe_list = column.apply(
            lambda value_list, word=unique_value: 1 if word in value_list else 0
        )
        if new_column_name in one_hot_encoded_columns:
            # values such as "a b" and "a_b" share one column name: merge them
            # instead of letting the later value overwrite the earlier one
            ohe_list = one_hot_encoded_columns[new_column_name] | ohe_list
        one_hot_encoded_columns[new_column_name] = ohe_list
    # return a new dataframe with the one hot encoded columns
    return pd.DataFrame(one_hot_encoded_columns)

## Execute one hot encoding function

In [None]:
# one hot encode the tags column and attach the resulting columns to the
# main dataframe by index, without keeping the intermediate dataframe around
df = df.join(custom_one_hot_encoding(df, "Tags", prefix="tag"))