# Import dependencies

In [None]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Load dataset

In [None]:
# Make the variables defined in the .env file available to this process
load_dotenv()
# Directory that contains the dataset files (None when DATASET_PATH is unset)
dataset_path = os.getenv("DATASET_PATH")
# Read version 3 of the article data; column 0 is used as the index and
# column 1 is parsed as dates
df = pd.read_csv(
    f"{dataset_path}/article_info_V3.csv",
    index_col=0,
    parse_dates=[1],
)

In [None]:
def convert_str_lists():
    """Turn the stringified list columns of ``df`` into lists of strings.

    The "Tags", "Keywords" and "Type" columns of the global ``df`` are
    overwritten: each cell becomes the list of all substrings found between
    single quotes in the cell's original text.
    """
    # Non-greedy match of whatever sits between a pair of single quotes
    quoted_item = re.compile(r"'(.*?)'")

    # The same extraction is applied to each of the list-like columns
    for column in ("Tags", "Keywords", "Type"):
        df[column] = df[column].apply(quoted_item.findall)

convert_str_lists()

# Define functions to get unique values and their frequencies

In [None]:
def get_frequencies_from_lists(dataframe_column):
    """Count how often each value occurs across a column of lists.

    Args:
        dataframe_column: An iterable whose elements are themselves iterables
            of hashable values (for example a pandas Series of string lists).

    Returns:
        A dict mapping each distinct value to the number of times it occurs,
        as a built-in ``int``. Keys appear in order of first occurrence.
    """
    # Counter does the get/increment bookkeeping and keeps every count a
    # built-in int; incrementing through numpy would instead produce numpy
    # integer scalars for any value seen more than once.
    frequencies = Counter(
        value for value_list in dataframe_column for value in value_list
    )
    # Callers receive a plain dict rather than a Counter instance
    return dict(frequencies)

def get_sorted_frequencies_in_dataframe(dataframe_column):
    """Build a frequency table for the values in a column of lists.

    Args:
        dataframe_column: A column of lists that exposes a ``name``
            attribute; that name becomes the header of the value column.

    Returns:
        A DataFrame with one row per distinct value, holding the value and
        its "Frequency", ordered from most to least frequent.
    """
    frequency_pairs = list(get_frequencies_from_lists(dataframe_column).items())
    # Order the (value, frequency) pairs by descending frequency
    frequency_pairs.sort(key=lambda pair: pair[1], reverse=True)
    header = [dataframe_column.name, "Frequency"]
    return pd.DataFrame(frequency_pairs, columns=header)

# Get the unique tags and keywords

In [None]:
# Frequency table of every distinct tag, most frequent first
unique_tags = get_sorted_frequencies_in_dataframe(df["Tags"])
# Frequency table of every distinct keyword, most frequent first
unique_keywords = get_sorted_frequencies_in_dataframe(df["Keywords"])

# Perform custom one-hot encoding

In [None]:
# Reports whether a word occurs in a list of words, as a 0/1 flag
def list_has_word(l, word):
    """Return 1 when ``word`` is contained in ``l``, otherwise 0."""
    return 1 if word in l else 0

def custom_keywords_one_hot_encoding(number_of_keywords):
    """One-hot encode the first ``number_of_keywords`` unique keywords.

    For each selected keyword a column named ``Keyword_<keyword>`` is built
    that holds 1 for every row of the global ``df`` whose "Keywords" list
    contains that keyword, and 0 otherwise.

    Args:
        number_of_keywords: Maximum number of keywords to encode, taken from
            the first rows of the global ``unique_keywords``. When fewer
            keywords are available, all of them are encoded instead of
            failing on a missing row.

    Returns:
        A DataFrame with one 0/1 column per encoded keyword, in the same
        order as the keywords appear in ``unique_keywords``.
    """
    # Slice by position so that asking for more keywords than exist is safe;
    # max() keeps a negative request meaning "no keywords" rather than
    # slicing from the end.
    keyword_limit = max(number_of_keywords, 0)
    selected_keywords = unique_keywords["Keywords"].iloc[:keyword_limit]

    # A dictionary which stores the new columns
    one_hot_keyword_columns = {}
    # The names of the new keyword columns, in encoding order
    keyword_column_names = []
    for word in selected_keywords:
        column_name = f"Keyword_{word}"
        keyword_column_names.append(column_name)
        # Bind the keyword as a default argument so the lambda cannot pick up
        # a later value of the loop variable
        one_hot_keyword_columns[column_name] = df["Keywords"].apply(
            lambda l, word=word: list_has_word(l, word)
        )
    # Passing the names explicitly fixes the column order of the result
    return pd.DataFrame(one_hot_keyword_columns, columns=keyword_column_names)

one_hot_keyword_df = custom_keywords_one_hot_encoding(number_of_keywords=5000)

# Add the one-hot encoded values to the dataframe

In [None]:
# Flag each article with 1 when its tag list contains "cocaine", else 0
df["tags_contain_cocaine"] = df["Tags"].apply(lambda tags: list_has_word(tags, "cocaine"))
# Attach the one-hot keyword columns; every row of df is kept (left join)
df = df.join(one_hot_keyword_df, how="left")
# Drop the now-redundant name so the separate frame's memory can be reclaimed
del one_hot_keyword_df

# Write the dataframe to a new CSV file

In [None]:
# Persist the enriched dataframe as version 4 of the dataset
df.to_csv(path_or_buf=f"{dataset_path}/article_info_V4.csv")