# EDA Explorations

Notebook for exploratory analysis of the ADR dataset.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the sample CSV into `df` and preview its first rows.
# The path is relative, so it is resolved against the current working directory.
DATA_PATH = 'src/data_sample/data_sample_full_named.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Sum of each ADR column, plotted from largest to smallest.
# The ADR columns are taken positionally: columns 1 through 24 of `df`.
adr_cols = df.columns[1:25]
adr_totals = df[adr_cols].sum()
adrs = adr_totals.sort_values(ascending=False)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x=adrs.index, y=adrs.values)
ax.set_title('ADR Frequency')
# Column names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# ADR correlation heatmap.
# Pairwise correlation between the ADR columns (DataFrame.corr defaults).
# Correlation coefficients lie in [-1, 1], so the colour scale is pinned to
# that range and centred on 0. Without fixed limits the scale follows the
# data's own min/max, and the neutral colour of the diverging 'coolwarm' map
# no longer corresponds to "no correlation".
adr_corr = df[adr_cols].corr()
sns.heatmap(adr_corr, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.title('ADR Correlation')
plt.show()

## Top 10 Most Common ADR Combinations

In [None]:

from collections import Counter

# Represent each row's ADR values as one hashable tuple so that identical
# patterns can be counted.
# itertuples(index=False, name=None) is used instead of a row-wise apply:
#   * it avoids a Python-level function call per row, and
#   * it yields plain Python scalars, so str(pattern) reads "(0, 1, ...)"
#     rather than "(np.int64(0), np.int64(1), ...)" under NumPy 2 or later.
# `combos` stays a Series aligned with df's index, as before.
combos = pd.Series(
    list(df[adr_cols].itertuples(index=False, name=None)),
    index=df.index,
    dtype=object,
)
combo_counts = Counter(combos)

# most_common returns (pattern, count) pairs, highest count first.
top_combos = combo_counts.most_common(10)
combo_labels = [str(pattern) for pattern, _ in top_combos]
combo_values = [count for _, count in top_combos]

plt.figure(figsize=(10, 6))
sns.barplot(y=combo_labels, x=combo_values, orient='h')
plt.title("Top 10 Most Common ADR Combinations")
plt.xlabel("Number of Compounds")
plt.ylabel("ADR Binary Pattern")
plt.tight_layout()
plt.show()


## Top 10 Compounds with Most ADRs

In [None]:

# Row-wise sum over the ADR columns, stored on `df` as "total_adr".
df["total_adr"] = df[adr_cols].sum(axis=1)

# Rank all rows by that total (largest first) and keep the first ten.
ranked_by_adr = df.sort_values(by="total_adr", ascending=False)
top_adr_df = ranked_by_adr.head(10)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=top_adr_df["Compound_Name"], y=top_adr_df["total_adr"])
ax.set_title("Top 10 Compounds with Most ADRs")
# Slant the compound names and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
ax.set_ylabel("Number of ADRs")
plt.tight_layout()
plt.show()


## Distribution of ADR Counts per Compound (Boxplot)

In [None]:

# Boxplot of the "total_adr" column of `df` (one value per row).
# The column must already exist on `df` before this cell runs.
plt.figure(figsize=(6, 4))
ax = sns.boxplot(y=df["total_adr"])
ax.set_title("Distribution of ADR Counts per Compound")
ax.set_ylabel("ADR Count")
plt.tight_layout()
plt.show()
