In [None]:
import os
import re
import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np

In [None]:
# Load the environment variables
load_dotenv()

str_list_regex_pattern = r"'(.*?)'"

# Get the paths to the data
data_dir = os.getenv("DATA_DIR")
if data_dir is None:
    # Fail early with an actionable message instead of the TypeError that
    # os.path.join would raise further down when handed None.
    raise RuntimeError("DATA_DIR is not set; define it in the environment or in the .env file")
articles_dir = os.path.join(data_dir, "articles")

# Load the main dataset
df = pd.read_csv(os.path.join(data_dir, "keywords_df.csv"), index_col=0, parse_dates=[1])

# Rows whose keyword cell is the literal "[]" carry no keywords: mark them as
# missing and drop them. The results are assigned back instead of calling
# replace(..., inplace=True) on df["Keywords"], because that chained in-place
# call does not update df once pandas Copy-on-Write is active.
df["Keywords"] = df["Keywords"].replace("[]", np.nan)
df = df.dropna(subset=["Keywords"])

In [None]:
# Define functions to parse the raw keyword strings
def clean_string(str):
    """Normalise a single text fragment.

    The fragment is lower-cased, every character that is neither a word
    character nor whitespace is removed, and surrounding whitespace is
    trimmed from the result.
    """
    lowered = str.lower()
    # \w keeps letters, digits and underscores; \s keeps inner whitespace
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.strip()

def parse_tags(str):
    """Turn a comma-separated string into a list of cleaned fragments.

    The string is split on every comma and each piece is passed through
    ``clean_string``; the pieces keep their original order.
    """
    return [clean_string(fragment) for fragment in str.split(",")]

In [None]:
# Replace every raw keyword string with its list of cleaned keywords
df["Keywords"] = df["Keywords"].map(parse_tags)

In [None]:
def get_frequencies_from_lists(dataframe_column):
    """Count how often each value occurs across a collection of lists.

    Parameters
    ----------
    dataframe_column : iterable of iterables
        For example a pandas Series whose cells are lists of hashable values.

    Returns
    -------
    dict
        Maps every distinct value to its number of occurrences. All counts
        are built-in ``int`` objects and the keys appear in order of first
        occurrence.
    """
    unique_values = {}

    for value_list in dataframe_column:
        for value in value_list:
            # Plain integer arithmetic: unseen values start at 0, and no numpy
            # scalar types leak into the result.
            unique_values[value] = unique_values.get(value, 0) + 1

    return unique_values

def get_sorted_frequencies_in_dataframe(dataframe_column):
    """Build a frequency table for the values stored in a column of lists.

    The result has two columns: one named after ``dataframe_column`` holding
    the distinct values, and ``"Frequency"`` holding their counts. Rows are
    ordered from the most to the least frequent value.
    """
    frequency_by_value = get_frequencies_from_lists(dataframe_column)
    ranked_pairs = sorted(
        frequency_by_value.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked_pairs, columns=[dataframe_column.name, "Frequency"])

In [None]:
# Frequency table of all keywords, most frequent first
unique_keywords = get_sorted_frequencies_in_dataframe(df["Keywords"])

In [None]:
# Checks if a word occurs in a list of words
def list_has_word(l, word):
    """Return 1 when ``word`` is contained in ``l`` and 0 otherwise."""
    return int(word in l)

def custom_keywords_one_hot_encoding(number_of_keywords, keyword_lists=None, keyword_frequencies=None):
    """One-hot encode the highest-ranked keywords.

    Parameters
    ----------
    number_of_keywords : int
        How many keywords, taken from the top of ``keyword_frequencies``,
        receive a column. Requests larger than the number of available
        keywords are clamped instead of raising ``KeyError``.
    keyword_lists : pandas.Series, optional
        One collection of keywords per row. Defaults to the notebook-level
        ``df["Keywords"]``.
    keyword_frequencies : pandas.DataFrame, optional
        Frame with a ``"Keywords"`` column ordered by priority. Defaults to
        the notebook-level ``unique_keywords``. The keywords are assumed to
        be unique, as they are when produced by
        ``get_sorted_frequencies_in_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``keyword_lists``, with one int64 column
        ``"Keyword_<word>"`` per selected keyword holding 1 when the row
        contains the keyword and 0 otherwise.
    """
    if keyword_lists is None:
        keyword_lists = df["Keywords"]
    if keyword_frequencies is None:
        keyword_frequencies = unique_keywords

    # Slicing (rather than indexing position by position) clamps the request
    # to the available keywords; max() keeps a negative request empty.
    requested = max(number_of_keywords, 0)
    selected_words = keyword_frequencies["Keywords"].iloc[:requested].tolist()
    keyword_column_names = [f"Keyword_{word}" for word in selected_words]
    column_position = {word: position for position, word in enumerate(selected_words)}

    # Single pass over the rows: each keyword of a row flips at most one cell,
    # instead of scanning every row once per selected keyword.
    encoded = np.zeros((len(keyword_lists), len(selected_words)), dtype=np.int64)
    for row_position, row_keywords in enumerate(keyword_lists):
        for keyword in row_keywords:
            position = column_position.get(keyword)
            if position is not None:
                encoded[row_position, position] = 1

    return pd.DataFrame(encoded, index=keyword_lists.index, columns=keyword_column_names)

# Number of most frequent keywords that become one-hot feature columns
NUMBER_OF_KEYWORDS = 4000

one_hot_keyword_df = custom_keywords_one_hot_encoding(number_of_keywords=NUMBER_OF_KEYWORDS)

In [None]:
def specificity_score(target, predictions):
    """Support-weighted specificity (true-negative rate).

    For every label the specificity ``TN / (TN + FP)`` is computed
    one-vs-rest, and the per-label values are averaged using each label's
    number of true instances as weight, mirroring the ``average="weighted"``
    setting used for the other metrics in this notebook. The previous
    ``1 - recall`` formula returned the miss rate (false-negative rate), not
    the specificity.

    Parameters
    ----------
    target : array-like
        Either a 1-D sequence of class labels or a 2-D 0/1 indicator matrix
        with one column per label.
    predictions : array-like
        Predicted values in the same layout as ``target``.

    Returns
    -------
    float
        Weighted specificity in ``[0, 1]``. Labels without any negative
        sample contribute 0; the result is 0.0 when no label has support.
    """
    y_true = np.asarray(target)
    y_pred = np.asarray(predictions)

    if y_true.ndim == 1:
        # Class labels: expand to one boolean indicator column per label
        labels = np.unique(np.concatenate([y_true, y_pred]))
        y_true = y_true[:, None] == labels[None, :]
        y_pred = y_pred[:, None] == labels[None, :]
    else:
        y_true = y_true.astype(bool)
        y_pred = y_pred.astype(bool)

    true_negatives = (~y_true & ~y_pred).sum(axis=0)
    false_positives = (~y_true & y_pred).sum(axis=0)
    support = y_true.sum(axis=0)

    negatives = true_negatives + false_positives
    # Leave the specificity at 0 for labels that have no negative samples
    per_label = np.divide(
        true_negatives,
        negatives,
        out=np.zeros(negatives.shape, dtype=float),
        where=negatives > 0,
    )

    if support.sum() == 0:
        return 0.0
    return float(np.average(per_label, weights=support))

def get_metrics(target, predictions):
    """Collect the evaluation scores for a set of predictions.

    Returns a dictionary with the keys ``"recall"``, ``"precision"``,
    ``"accuracy"``, ``"specificity"`` and ``"f1"``. Recall, precision and F1
    use weighted averaging; specificity is delegated to
    ``specificity_score``.
    """
    averaging = {"average": "weighted"}

    recall = recall_score(target, predictions, **averaging)
    precision = precision_score(target, predictions, **averaging)
    accuracy = accuracy_score(target, predictions)
    specificity = specificity_score(target, predictions)
    f1 = f1_score(target, predictions, **averaging)

    return {
        "recall": recall,
        "precision": precision,
        "accuracy": accuracy,
        "specificity": specificity,
        "f1": f1,
    }

def print_metrics(target, predictions):
    """Print every score from ``get_metrics`` as a percentage with two decimals."""
    scores = get_metrics(target=target, predictions=predictions)
    for metric, score in scores.items():
        str_metric = "{:.2f}".format(score * 100) + "%"
        print(f"{metric}: {str_metric}")

In [None]:
X = one_hot_keyword_df
# NOTE(review): a one-hot encoded target is a 2-D indicator matrix, which
# scikit-learn handles as a multilabel problem; confirm this is intended
# rather than passing df["Category"] directly.
y = pd.get_dummies(df["Category"])

# Fixed seeds make the 60/20/20 train/validation/test partition reproducible
# across kernel restarts (0.25 of the remaining 80% is 20% of the data).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

# Decision Tree

In [None]:
# Seed the tree: scikit-learn permutes the features at every split, so an
# unseeded tree can differ between runs on the same data.
dtc = DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)

# Print the metrics of the model performance on the test set
print_metrics(y_test, y_pred_dtc)

In [None]:
# Score the fitted tree on the data it was trained on
y_pred_dtc_train = dtc.predict(X_train)
print_metrics(target=y_train, predictions=y_pred_dtc_train)

In [None]:
# Score the fitted tree on the validation split
y_pred_dtc_val = dtc.predict(X_val)
print_metrics(target=y_val, predictions=y_pred_dtc_val)

# K-Nearest Neighbors

In [None]:
# Fit a k-nearest-neighbours classifier with scikit-learn's default settings
knn = KNeighborsClassifier().fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Report the scores on the test split
print_metrics(target=y_test, predictions=y_pred_knn)

# Random Forest

In [None]:
# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# an unseeded forest yields different scores on every run.
rfc = RandomForestClassifier(random_state=42)
rfc = rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

# Print the metrics of the model performance on the test set
print_metrics(y_test, y_pred_rfc)