In [None]:
import pandas as pd
from model_tuner import loadObjects
from sklearn.model_selection import train_test_split
from NaiveSVC import NaivelyCalibratedLinearSVC
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# Input locations, named so they are easy to find and change.
COMBINED_DATA_PATH = "../../data/outcomes_squashed/combined_data.pkl"
# NOTE(review): absolute, user-specific path -- this load fails on any other machine.
TFIDF_DATA_PATH = "/home/afunnell/Code/Rapid_overdose/Data/tfidf.pkl"

# Main frame used by the analysis cells in this notebook.
drug_df = pd.read_pickle(COMBINED_DATA_PATH)

# Second pickled object; no other cell visible in this section references it.
drug_df_old = pd.read_pickle(TFIDF_DATA_PATH)

In [None]:
# Named substance indicator columns that the "Others" overlap counts are checked against.
# Note that "Others" itself is deliberately not in this list.
cols = [
    "Methamphetamine", "Heroin", "Cocaine", "Fentanyl",
    "Alcohol", "Prescription.opioids", "Any Opioids", "Benzodiazepines",
]

In [None]:
# Number of rows flagged "Others" where none of the columns in `cols` is set.
others_flag = drug_df["Others"] == 1
no_named_substance = drug_df[cols].sum(axis=1) == 0
only_others_count = int((others_flag & no_named_substance).sum())

In [None]:
# Show the number of rows where "Others" is the only flag set.
only_others_count

In [None]:
# Number of rows flagged "Others" where at least one column in `cols` is also set.
others_rows = drug_df["Others"] == 1
has_named_substance = drug_df[cols].sum(axis=1) > 0
others_with_substances_count = int((others_rows & has_named_substance).sum())

In [None]:
# Show the number of "Others" rows that also have a named substance flagged.
others_with_substances_count

In [None]:
# Count rows where both "Others" and "Heroin" are flagged.
int(((drug_df["Others"] == 1) & (drug_df["Heroin"] == 1)).sum())

In [None]:
# Print every row whose free text mentions "ethanolism" (case-insensitive).
search_term = "ethanolism"
for _, record in drug_df.iterrows():
    if search_term in record["text"].lower():
        print(record)

In [None]:
# Emit one "Hi" per row whose free text mentions MDMA by either name (case-insensitive).
mdma_terms = ("methylenedioxymethamphetamine", "mdma")
for _, record in drug_df.iterrows():
    lowered_text = record["text"].lower()
    if any(term in lowered_text for term in mdma_terms):
        print("Hi")

In [None]:
# Serialized single-label Methamphetamine SVM model.
METH_SVM_MODEL_PATH = (
    "../../models/classic_ml_models/single_label/bioclinicalbert/Methamphetamine_SVM.pkl"
)

# NOTE(review): loadObjects is a project helper from model_tuner; given the .pkl
# extension it presumably unpickles the file -- only load model files from a trusted source.
naive_svc = loadObjects(METH_SVM_MODEL_PATH)

In [None]:
# Stack the per-row embeddings into one array; the unpacking below requires it to be 3-D.
embeddings = np.stack(drug_df["clinBERTEmbed"].values, axis=0)
n_samples, sequence_length, n_features = embeddings.shape

# Collapse the first two axes into rows of n_features values.
# NOTE(review): this only lines up with `y` when sequence_length == 1; otherwise X has
# n_samples * sequence_length rows and train_test_split will reject the length mismatch.
X = embeddings.reshape(-1, n_features)
y = drug_df["Methamphetamine"].values

# NOTE(review): confirm this split reproduces the one used when the loaded model was
# trained, otherwise the "test" rows may overlap its training data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Predict labels for the held-out split with the loaded model.
y_pred = naive_svc.predict(X_test)

In [None]:
# Class balance of the held-out labels, useful context for reading the score below.
pd.DataFrame(y_test).value_counts()

In [None]:
# NOTE(review): if y_test is a binary 0/1 label, micro-averaged F1 over both classes is
# the same number as plain accuracy -- confirm this is the intended metric
# (average="binary" would score only the positive class).
f1_score(y_test, y_pred, average="micro")

In [None]:
from itertools import combinations


def top_cooccurrence_drug_overdoses(df, drug_cols, top_n=20, max_combination_length=5):
    """
    Find the most frequent co-occurring drug combinations in the dataset.

    Every combination of 2 up to ``max_combination_length`` columns from
    ``drug_cols`` is checked, and a row counts towards a combination when all of
    that combination's columns are truthy in the row (e.g. all equal to 1).
    Combinations that never occur are dropped.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the drug overdose data.
    drug_cols (list): List of drug indicator columns to analyze for co-occurrence.
    top_n (int): The number of top co-occurrences to return.
    max_combination_length (int): The maximum number of drugs in a combination.

    Returns:
    pd.DataFrame: Up to ``top_n`` rows with columns "Drug Combination" (a tuple of
    column names) and "Count" (int64), sorted by descending count. The frame is
    empty, with the same two columns, when no combination co-occurs.
    """
    co_occurrence_counts = {}

    # The number of combinations grows quickly with len(drug_cols) and the maximum
    # length, so keep both modest; every combination costs one pass over df.
    for combination_length in range(2, max_combination_length + 1):
        for drug_combo in combinations(drug_cols, combination_length):
            # Rows where every drug in the combination is present.
            count = df[list(drug_combo)].all(axis=1).sum()
            if count > 0:
                co_occurrence_counts[drug_combo] = count

    co_occurrence_df = pd.DataFrame(
        list(co_occurrence_counts.items()),
        columns=["Drug Combination", "Count"],
    )

    # With no rows the "Count" column is created with object dtype, which nlargest
    # rejects with a TypeError. Forcing an integer dtype makes the empty case return
    # an empty frame and is a no-op when counts are present.
    co_occurrence_df = co_occurrence_df.astype({"Count": "int64"})

    return co_occurrence_df.nlargest(top_n, "Count")

In [None]:
# Inspect the loaded data.
drug_df

In [None]:
# Rows where the "Any Drugs" indicator is set.
drug_df.loc[drug_df["Any Drugs"].eq(1)]

In [None]:
# Indicator columns examined for co-occurrence. Compared with `cols` defined earlier,
# this list includes "Others" and leaves out "Any Opioids".
drug_cols = [
    "Methamphetamine",
    "Heroin",
    "Cocaine",
    "Fentanyl",
    "Alcohol",
    "Prescription.opioids",
    "Benzodiazepines",
    "Others",
]

# Rank the most frequent drug combinations in drug_df (default top_n and combination length).
top_co_occurrences = top_cooccurrence_drug_overdoses(drug_df, drug_cols)
print(top_co_occurrences)

In [None]:
def plot_top_drug_combinations_with_labels(
    top_co_occurrences, total_cases=8738, label_offset=40
):
    """
    Plot a horizontal bar chart of drug combinations with count and percentage labels.

    Each bar gets its count in white just inside the bar end and, in black just
    past the bar end, the count as a whole-number percentage of ``total_cases``
    (truncated, not rounded).

    Relies on ``plt`` (matplotlib.pyplot) and ``sns`` (seaborn) being available in
    the notebook namespace; no import for them is visible in this section.

    Parameters:
    top_co_occurrences (pd.DataFrame): DataFrame containing 'Drug Combination'
        (an iterable of drug names per row) and 'Count' columns. It is not modified.
    total_cases (int): Denominator for the percentage labels. Defaults to 8738, the
        value previously hard-coded here -- presumably the number of cases in the
        dataset; TODO confirm and pass the real total explicitly.
    label_offset (float): Horizontal distance, in data (count) units, between the
        bar end and each label. The default of 40 suits bars several hundred counts
        long; use a smaller value for small counts so labels stay readable.

    Raises:
    ValueError: If ``total_cases`` is not positive.
    """
    if total_cases <= 0:
        raise ValueError("total_cases must be a positive number")

    # Work on a copy so the caller's frame keeps its tuple-valued combinations.
    plot_data = top_co_occurrences.copy()
    plot_data["Drug Combination"] = plot_data["Drug Combination"].apply(
        lambda combo: ", ".join(combo)
    )

    sns.set_style("whitegrid")

    # Explicit Figure/Axes instead of the pyplot state machine.
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(
        data=plot_data, x="Count", y="Drug Combination", palette="viridis", ax=ax
    )

    for bar in ax.patches:
        count = bar.get_width()  # Bar length corresponds to the count
        label_y = bar.get_y() + bar.get_height() / 2  # Vertical centre of the bar

        # Count label, placed inside the bar just left of its end.
        ax.text(
            count - label_offset,
            label_y,
            f"{int(count)}",
            ha="center",
            va="center",
            color="white",
        )

        # Percentage label, placed just right of the bar end.
        percentage = count / total_cases * 100
        ax.text(
            count + label_offset,
            label_y,
            f"{int(percentage)}%",
            ha="center",
            va="center",
            color="black",
        )

    ax.set_xlabel("Number of Overdoses")
    ax.set_ylabel("Drug Combination")
    # Report the number of combinations actually plotted rather than assuming 20.
    ax.set_title(f"Top {len(plot_data)} Drug Combinations in Overdoses")

    # Adjust layout so the long combination labels fit.
    fig.tight_layout()

    plt.show()

In [None]:
# Draw the bar chart for the combinations computed above.
plot_top_drug_combinations_with_labels(top_co_occurrences)

In [None]:
# Distribution of the "Others" indicator across all rows.
drug_df["Others"].value_counts()