In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Make sure the plots directory exists
plots_dir = "./plots/eda"
os.makedirs(plots_dir, exist_ok=True)

# Load the CSV created during the renaming process
df = pd.read_csv("label_summary.csv")
# Columns: ["original_filename","new_filename","medication","dosage","version","augmentation","packaging"]

# Fail fast with a clear message if a column used below is absent, instead of
# raising part-way through after some plots have already been written.
required_columns = ["medication", "dosage", "packaging"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"label_summary.csv is missing required column(s): {missing_columns}"
    )

# Set Seaborn style
sns.set(style="whitegrid")


def _save_count_barplot(counts, title, xlabel, filename, out_dir,
                        figsize=(6, 4), rotate_ticks=False, tick_fontsize=None):
    """Draw a bar chart of category counts, label each bar, and save it as a PNG.

    Parameters
    ----------
    counts : pandas.Series
        Counts indexed by category (e.g. the result of ``value_counts()``).
        The index supplies the x positions and the values the bar heights.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis (the y axis is always labelled "Count").
    filename : str
        File name of the image, created inside ``out_dir``.
    out_dir : str
        Directory the image is written to; it must already exist.
    figsize : tuple of float, optional
        Figure size in inches, passed to ``plt.subplots``.
    rotate_ticks : bool, optional
        When true, x tick labels are rotated 45 degrees and right-aligned.
    tick_fontsize : int or None, optional
        Font size applied to the x tick labels; only used together with
        ``rotate_ticks``. ``None`` keeps the default size.

    Returns
    -------
    str
        Path of the saved image.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")

    if rotate_ticks:
        tick_style = {"rotation": 45, "ha": "right"}
        if tick_fontsize is not None:
            tick_style["fontsize"] = tick_fontsize
        plt.setp(ax.get_xticklabels(), **tick_style)

    # Display y values (count) on the bars: text is centred on the bar and
    # shifted 10 points upwards from the bar top.
    for bar in ax.patches:
        ax.annotate(f"{int(bar.get_height())}",
                    (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center', va='center',
                    xytext=(0, 10),
                    textcoords='offset points')

    fig.tight_layout()
    out_path = os.path.join(out_dir, filename)
    fig.savefig(out_path)
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    return out_path


# 1. Distribution of Packaging Types
packaging_counts = df["packaging"].value_counts()
_save_count_barplot(
    packaging_counts,
    title="Distribution of Packaging Types",
    xlabel="Packaging",
    filename="packaging_distribution.png",
    out_dir=plots_dir,
)

# 2. Distribution of Dosages (Top 10)
dosage_counts = df["dosage"].value_counts().head(10)
_save_count_barplot(
    dosage_counts,
    title="Distribution of Dosage (Top 10)",
    xlabel="Dosage",
    filename="dosage_distribution.png",
    out_dir=plots_dir,
    rotate_ticks=True,
)

# 3. Top 15 Medications
top_med = df["medication"].value_counts().head(15)
_save_count_barplot(
    top_med,
    title="Top 15 Medications",
    xlabel="Medication Name",
    filename="top_medications.png",
    out_dir=plots_dir,
    rotate_ticks=True,
)

# 4. Combined label combos (medication + dosage + packaging)
# Rows with any missing part are dropped before casting to str, so they stay
# NaN in `label_combo` (the assignment aligns on the index) and are ignored by
# value_counts(). The str cast keeps the concatenation from raising TypeError
# if read_csv inferred a numeric dtype for a column (e.g. a purely numeric
# dosage).
combo_parts = df[required_columns].dropna().astype(str)
df["label_combo"] = (combo_parts["medication"] + "_" +
                     combo_parts["dosage"] + "_" +
                     combo_parts["packaging"])
labelcombo_counts = df["label_combo"].value_counts().head(15)
_save_count_barplot(
    labelcombo_counts,
    title="Top 15 Label Combos",
    xlabel="Medication_Dosage_Packaging",
    filename="top_label_combos.png",
    out_dir=plots_dir,
    figsize=(8, 4),
    rotate_ticks=True,
    tick_fontsize=6,  # long combined labels need a smaller font to stay legible
)

print("EDA plots saved to:", plots_dir)

EDA plots saved to: ./plots/eda
