# Import dependencies

In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Load the dataset

In [None]:
# Read the cleaned article data: CSV column 0 becomes the index and
# column 1 is parsed as dates
df = pd.read_csv(
    "~/Documents/skc_data/article_info_V4.csv",
    index_col=0,
    parse_dates=[1],
)

In [None]:
def convert_str_lists(dataframe=None, columns=("Tags", "Keywords", "Type")):
    """Turn stringified list columns into real lists of strings, in place.

    A cell such as ``"['a', 'b']"`` is replaced by ``['a', 'b']``: every
    substring enclosed in single quotes becomes one list element.

    Args:
        dataframe: The dataframe to modify. When omitted, the notebook-level
            ``df`` is used, which is what a zero-argument call relies on.
        columns: Names of the columns to convert.

    Returns:
        None. The columns are overwritten on the dataframe.
    """
    # Compile once; the same pattern is applied to every cell of every column
    str_list_pattern = re.compile(r"'(.*?)'")

    def parse_cell(cell):
        # Only strings are parsed. Anything else (for instance a list left
        # behind by an earlier run of this cell) is returned untouched, so
        # running the conversion twice does not raise a TypeError.
        if isinstance(cell, str):
            return str_list_pattern.findall(cell)
        return cell

    target = df if dataframe is None else dataframe
    for column in columns:
        target[column] = target[column].apply(parse_cell)

# Run the conversion now; it overwrites the Tags, Keywords and Type columns of df
convert_str_lists()

# Get top 5000 keyword column names

In [None]:
def get_frequencies_from_lists(dataframe_column):
    """Count how often each value occurs across all lists in a column.

    Args:
        dataframe_column: An iterable of lists, for example a dataframe
            column whose cells are lists of strings.

    Returns:
        A dict mapping each distinct value to its number of occurrences as a
        plain ``int``, with keys in order of first appearance.
    """
    # Counter does the tallying; the generator flattens the lists lazily
    frequencies = Counter(
        value for value_list in dataframe_column for value in value_list
    )
    # Hand back a plain dict so callers keep receiving the same type
    return dict(frequencies)

def get_sorted_frequencies_in_dataframe(dataframe_column):
    """Build a frequency table for the values found in a column of lists.

    Args:
        dataframe_column: A named column whose cells are lists; its ``name``
            becomes the header of the value column in the result.

    Returns:
        A dataframe with one row per distinct value, holding the value and
        its "Frequency", ordered from most to least frequent.
    """
    def by_frequency(entry):
        # Each entry is a (value, frequency) pair
        return entry[1]

    frequency_pairs = list(get_frequencies_from_lists(dataframe_column).items())
    # Stable in-place sort, highest frequency first
    frequency_pairs.sort(key=by_frequency, reverse=True)
    return pd.DataFrame(frequency_pairs, columns=[dataframe_column.name, "Frequency"])

In [None]:
# Frequency table of every keyword in the dataset, most frequent first
unique_keywords = get_sorted_frequencies_in_dataframe(df["Keywords"])
# Prefix the 5000 most frequent keywords to obtain their feature column names
keyword_column_names = unique_keywords["Keywords"].iloc[:5000].map(
    lambda keyword: f"Keyword_{keyword}"
)
# The frequency table is no longer needed; free its memory
del unique_keywords

# Get the features and target

In [None]:
# Features: the per-keyword columns selected above
X = df.loc[:, keyword_column_names]

# Target: the tags_contain_cocaine column
y = df.loc[:, "tags_contain_cocaine"]

# Get training and test data from dataset

In [None]:
# Split the data into training (80%) and test (20%) sets.
# A fixed random_state makes the shuffle reproducible, so the metrics of the
# models trained on this split do not change from one notebook run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Set the max iterations

In [None]:
# Iteration cap passed as max_iter to both the LinearSVC and the SGD classifier
MAX_ITERATIONS = 2_000_000

# Linear SVC Model

In [None]:
# Build the linear support-vector classifier and train it on the training split
# (fit returns the estimator itself, so the calls can be chained)
clf = LinearSVC(max_iter=MAX_ITERATIONS).fit(X_train, y_train)

# Predict the labels of the held-out test split
lsvc_predictions = clf.predict(X_test)

# Performance methods

In [None]:
def specificity_score(target, predictions):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is TN / (TN + FP): the share of actual negatives that were
    also predicted negative. Note that this is not ``1 - recall``; that
    expression is the false-negative rate.

    Args:
        target: The true labels. The label ``1`` (or ``True``) is the
            positive class; every other label counts as negative.
        predictions: The predicted labels, aligned with ``target``.

    Returns:
        The specificity as a float between 0 and 1, or 0.0 when ``target``
        contains no negative samples.
    """
    actual_negative = np.asarray(target) != 1
    predicted_negative = np.asarray(predictions) != 1

    negative_count = np.count_nonzero(actual_negative)
    if negative_count == 0:
        # No negatives to judge: avoid dividing by zero
        return 0.0

    true_negatives = np.count_nonzero(actual_negative & predicted_negative)
    return true_negatives / negative_count

def get_metrics(target, predictions):
    """Evaluate predictions against the true labels with several metrics.

    Args:
        target: The true labels.
        predictions: The predicted labels.

    Returns:
        A dict with the keys "recall", "precision", "accuracy",
        "specificity" and "f1", in that order, each mapped to its score.
    """
    # Metric name -> scoring function; dict order fixes the reporting order
    scorers = {
        "recall": recall_score,
        "precision": precision_score,
        "accuracy": accuracy_score,
        "specificity": specificity_score,
        "f1": f1_score,
    }
    return {name: scorer(target, predictions) for name, scorer in scorers.items()}

def print_metrics(target, predictions):
    """Print every metric from get_metrics as a percentage with two decimals.

    Args:
        target: The true labels.
        predictions: The predicted labels.
    """
    scores = get_metrics(target=target, predictions=predictions)
    for metric, score in scores.items():
        # Scores are fractions; scale to percent before formatting
        str_metric = format(score * 100, ".2f") + "%"
        print(f"{metric}: {str_metric}")

# Performance of the Linear SVC Model

In [None]:
# Plot the confusion matrix of the LinearSVC predictions on the test split
confusion_matrix = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=lsvc_predictions,
)
# Report the LinearSVC scores as percentages
print_metrics(target=y_test, predictions=lsvc_predictions)

# SGD Model (Stochastic Gradient Descent)

In [None]:
# Build the SGD classifier and train it on the training split
# (fit returns the estimator itself, so the calls can be chained)
sgd_clf = SGDClassifier(max_iter=MAX_ITERATIONS).fit(X_train, y_train)
# Predict the labels of the held-out test split
sgd_predictions = sgd_clf.predict(X_test)

# Performance of SGD Model

In [None]:
# Plot the confusion matrix of the SGD predictions on the test split
confusion_matrix = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=sgd_predictions,
)
# Report the SGD scores as percentages
print_metrics(target=y_test, predictions=sgd_predictions)