# Import packages

In [None]:
import os
import re
import spacy
import pandas as pd
from dotenv import load_dotenv

# Load data

In [None]:
# Load the spaCy pipeline registered under the name "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the environment variables
load_dotenv()

# Get the paths to the data
data_dir = os.getenv("DATA_DIR")
if not data_dir:
    # Fail early with a clear message; otherwise an unset variable surfaces as
    # an obscure TypeError from os.path.join(None, ...) and an empty one makes
    # the dataset paths below point at the filesystem root
    raise RuntimeError(
        "The DATA_DIR environment variable is not set; "
        "define it in the environment or in the .env file."
    )
articles_dir = os.path.join(data_dir, "articles")

# Load the main dataset (column 0 becomes the index, the column at position 1
# is parsed as dates)
df = pd.read_csv(f"{data_dir}/article_info_V2.csv", index_col=0, parse_dates=[1])

# Load the external dataset (same layout, but semicolon-separated)
external_df = pd.read_csv(f"{data_dir}/other-articles.csv", index_col=0, parse_dates=[1], sep=";")

# Cleaning

In [None]:
# Main dataset: discard the columns that are not used, remove the rows that
# have no date value, and renumber the remaining rows from zero
df = (
    df.drop(columns=["Author", "Type", "Keywords"])
    .dropna(subset=["Date"])
    .reset_index(drop=True)
)

# External dataset: the url column is not needed
external_df = external_df.drop(columns=["Url"])

# Merging datasets

In [None]:
# Append the external rows to the main dataset; ignore_index renumbers the
# combined rows from zero in the same step
df = pd.concat([df, external_df], ignore_index=True)

# The external dataset is no longer needed on its own
del external_df

# Remove duplicates

In [None]:
# Remember the size of the dataset so the number of removed rows can be reported
entries_count = len(df)

# Keep only the first entry for every (title, date) combination
df = df.drop_duplicates(subset=["Title", "Date"], keep="first")

# Report how many entries were dropped
entries_count -= len(df)
print(f"Removed {entries_count} duplicate entries.")

# The temporary counter is not needed any more
del entries_count

# Parse target

In [None]:
# Define functions to parse target
def clean_string(str):
    """Normalise a single raw tag string.

    The text is lower-cased, every character that is neither a word character
    (letter, digit or underscore) nor whitespace is removed, and the leading
    and trailing whitespace is stripped.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept because
    # it is part of the function's signature
    without_punctuation = re.sub(r"[^\w\s]", "", str.lower())
    return without_punctuation.strip()

def parse_tags(str):
    """Turn a comma-separated tag string into a list of cleaned tags.

    Each comma-separated segment is normalised with ``clean_string``; segments
    that are empty after cleaning (e.g. from a trailing comma) are dropped.

    Returns a list of tag strings. A value that is not a string (such as the
    NaN pandas uses for a missing cell, or None) yields an empty list instead
    of raising AttributeError.
    """
    # The parameter shadows the builtin ``str``, so the string type is obtained
    # from a literal for the isinstance check
    if not isinstance(str, type("")):
        return []
    # Clean every segment first: a segment may only become empty after cleaning
    cleaned_tags = (clean_string(segment) for segment in str.split(","))
    return [tag for tag in cleaned_tags if tag]

In [None]:
# Replace every raw tag string in the Tags column by its parsed list of tags
df["Tags"] = df["Tags"].map(parse_tags)

In [None]:
# Remove the cleaning and parsing helper functions from the namespace
del clean_string, parse_tags

# Load articles

In [None]:
# Article loading function
def load_article(title):
    """Return the text of the article stored as ``<title>.txt``.

    The file is looked up in ``articles_dir`` and decoded as UTF-8.

    Returns the contents as a string, or None when the article could not be
    read (missing or unreadable file, invalid path, or contents that are not
    valid UTF-8).
    """
    # Build the path of the article from its title
    file_path = os.path.join(articles_dir, f"{title}.txt")
    try:
        # The context manager closes the file even when reading fails
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (OSError, ValueError):
        # OSError: the file is missing or cannot be opened/read.
        # ValueError: the path is invalid (e.g. an embedded null byte) or the
        # contents cannot be decoded (UnicodeDecodeError is a ValueError).
        return None

In [None]:
# Look up the text of every article by its title and store it in a new column
df["Contents"] = df["Title"].map(load_article)

# The loader function is not needed beyond this point
del load_article

In [None]:
# Remember the size of the dataframe so the number of removed rows can be reported
entries_count = len(df)

# Discard the rows whose article contents could not be loaded and renumber
# the remaining rows from zero
df = df.dropna(subset=["Contents"]).reset_index(drop=True)

# Report how many entries were dropped
entries_count -= len(df)
print(f"Removed {entries_count} entries without article contents.")

# The temporary counter is not needed any more
del entries_count

# Create categories from tags

In [None]:
# Collect every distinct tag in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, so every tag costs
# one hash lookup instead of a scan over the whole list of tags seen so far.
unique_tags = list(
    dict.fromkeys(tag for tag_list in df["Tags"] for tag in tag_list)
)

In [None]:
# Load the custom category definitions from a semicolon-separated file
# (each column is read as one category, see load_category below)
custom_categories = pd.read_csv(f"{data_dir}/custom-categories.csv", sep=";")

def load_category(name):
    """Return the values listed in the ``name`` column of ``custom_categories``.

    Empty cells (NaN) in the column are skipped.

    Returns a list with the remaining column values, or None when
    ``custom_categories`` has no column called ``name``.
    """
    try:
        column = custom_categories[name]
    except KeyError:
        # Unknown category: keep the contract of returning None, but without a
        # bare except that would also swallow KeyboardInterrupt and SystemExit
        return None
    return column.dropna().tolist()

In [None]:
# Build the category dictionary: one entry per column of custom_categories,
# mapping the column name to the values loaded for that category
categories = {
    column_name: load_category(column_name)
    for column_name in custom_categories.columns
}

In [None]:
# Assign to every article the category that matches most of its tags.
#
# The result is collected in a list and written to the dataframe in one go:
# iterrows() yields a copy of each row, so assigning to row["Category"] inside
# such a loop never reaches the dataframe itself.

# Tags per category as sets, built once, so every membership test is a hash
# lookup; a category that failed to load (None) is treated as empty
category_tags = {name: set(tags or ()) for name, tags in categories.items()}

assigned_categories = []
for tag_list in df["Tags"]:
    # Number of tags of the current article that belong to each category
    counters = {}
    for tag in tag_list:
        for category, tags in category_tags.items():
            if tag in tags:
                counters[category] = counters.get(category, 0) + 1

    if counters:
        # Category with the highest counter; on a tie the category that was
        # matched first wins (dictionaries keep insertion order)
        assigned_categories.append(max(counters, key=counters.get))
    else:
        # No tag matched any category: max() would raise ValueError on the
        # empty dictionary, so fall back to "Other" (the fallback used by the
        # earlier, commented-out implementation)
        assigned_categories.append("Other")

# Create the category column
df["Category"] = assigned_categories



# def get_most_frequent_category(environmental_count, violence_count, weapons_count, money_count, human_trafficking_count, drugs_count):
#     highest_count = max(environmental_count, violence_count, weapons_count, human_trafficking_count, money_count, drugs_count)
#     if highest_count == 0:
#         return "Other"
#     if environmental_count == highest_count:
#         return "Environmental"
#     elif violence_count == highest_count:
#         return "Violence"
#     elif weapons_count == highest_count:
#         return "Weapons"
#     elif human_trafficking_count == highest_count:
#         return "Human trafficking"
#     elif money_count == highest_count:
#         return "Money"
#     elif drugs_count == highest_count:
#         return "Drugs"

# for current_article_index in range(len(df)):
#     tag_list = df["Tags"][current_article_index]

#     environmental_count = 0
#     violence_count = 0
#     weapons_count = 0
#     human_trafficking_count = 0
#     money_count = 0
#     drugs_count = 0

#     for tag in tag_list:
#         if tag in environmental_tags:
#             environmental_count += 1
#         elif tag in violence_tags:
#             violence_count += 1
#         elif tag in weapons_tags:
#             weapons_count += 1
#         elif tag in human_trafficking_tags:
#             human_trafficking_count += 1
#         elif tag in drugs_tags:
#             drugs_count += 1
#         elif tag in money_tags:
#             money_count += 1

#     df["Category"][current_article_index] = get_most_frequent_category(environmental_count, violence_count, weapons_count, money_count, human_trafficking_count, drugs_count)

In [None]:
# Show how many rows were assigned to each category
df["Category"].value_counts()

In [None]:
# # remove location, environmental, violence, weapons, money, human trafficking and drugs tags from the list of unique tags
# remaining_tags = [tag for tag in unique_tags if tag not in environmental_tags and tag not in violence_tags and tag not in weapons_tags and tag not in money_tags and tag not in human_trafficking_tags and tag not in drugs_tags]
# remaining_tags

# print(f"{len(remaining_tags)} / {len(unique_tags)} unique tags have not been categorized yet.")

# print(remaining_tags)