# Medicine Data Analysis: Cleaning, Exploration, and Visualization

This notebook performs data cleaning, analysis, and visualization on the medicine dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load dataset
# The CSV path is relative, so it is resolved against the kernel's working directory.
medicine_path = "medicine.csv"
df_medicine = pd.read_csv(filepath_or_buffer=medicine_path, encoding="utf-8")

# Preview the first rows as the cell output.
df_medicine.head(n=5)

In [None]:
# Data Cleaning
# Impute missing values column by column, then drop exact duplicate rows.

# Fill missing numerical values with the mean of their own column.
num_cols = df_medicine.select_dtypes(include=[np.number]).columns
df_medicine[num_cols] = df_medicine[num_cols].fillna(df_medicine[num_cols].mean())

# Fill missing categorical values with the most frequent value (mode) of their own column.
cat_cols = df_medicine.select_dtypes(include=[object]).columns
cat_modes = df_medicine[cat_cols].mode()
# mode() has zero rows when there are no object columns or no non-missing values
# at all; indexing .iloc[0] would then raise IndexError, so skip the fill instead.
if not cat_modes.empty:
    # When several values tie, mode() returns one row per tied value; row 0 is used.
    df_medicine[cat_cols] = df_medicine[cat_cols].fillna(cat_modes.iloc[0])

# Remove rows that are exact duplicates after imputation.
df_medicine = df_medicine.drop_duplicates()
df_medicine.info()

In [None]:
# Function to plot top and bottom categories
def _draw_count_bars(counts, ax, palette):
    """Draw one horizontal bar per entry of ``counts`` (a value_counts Series) on ``ax``."""
    categories = counts.index
    sns.barplot(
        x=counts.values,
        y=categories,
        # seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the
        # category itself keeps the one-colour-per-bar look.
        hue=categories,
        # Keep full-width bars instead of splitting each row by hue level.
        dodge=False,
        palette=palette,
        ax=ax,
    )
    # Some seaborn versions add a legend once hue is set; it only repeats the
    # y-axis labels, so drop it if present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


def plot_top_bottom(data, column, title, xlabel, top_n=10):
    """Plot the most and least frequent values of a column side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose values are counted with ``value_counts()``.
    title : str
        Text appended to "Top {top_n}" / "Bottom {top_n}" in the panel titles.
    xlabel : str
        Label of the count axis on both panels.
    top_n : int, default 10
        Number of categories shown in each panel.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``data``.
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError(f"top_n must be a positive integer, got {top_n!r}")

    # Count once and slice both ends from the same (descending) result.
    counts = data[column].value_counts()
    if counts.empty:
        # Nothing to draw; report it instead of failing inside the plotting call.
        print(f"No values to plot for column {column!r}.")
        return

    top_values = counts.head(top_n)
    bottom_values = counts.tail(top_n)

    _, (top_ax, bottom_ax) = plt.subplots(1, 2, figsize=(14, 5))

    _draw_count_bars(top_values, top_ax, "viridis")
    top_ax.set_title(f"Top {top_n} {title}")
    top_ax.set_xlabel(xlabel)

    _draw_count_bars(bottom_values, bottom_ax, "coolwarm")
    bottom_ax.set_title(f"Bottom {top_n} {title}")
    bottom_ax.set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot distribution of Package Sizes
# Take the first number in each "Package Size" entry, keeping any decimal part.
# Entries without a number become NaN instead of raising on float(""), and
# separate numbers in one entry are no longer fused into a single value.
package_size = pd.to_numeric(
    df_medicine["Package Size"].astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False),
    errors="coerce",
)

plt.figure(figsize=(10, 5))
sns.histplot(package_size, bins=30, kde=True, color="blue")
plt.title("Distribution of Package Sizes")
plt.xlabel("Package Size")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Top and Bottom Dosage Forms
# Most and least frequent values of the "dosage form" column.
plot_top_bottom(
    data=df_medicine,
    column="dosage form",
    title="Dosage Forms",
    xlabel="Count",
)

In [None]:
# Top and Bottom Manufacturers
# Most and least frequent values of the "manufacturer" column.
plot_top_bottom(
    data=df_medicine,
    column="manufacturer",
    title="Manufacturers",
    xlabel="Brand Count",
)

In [None]:
# Top and Bottom Generic Medicines
# Most and least frequent values of the "generic" column.
plot_top_bottom(
    data=df_medicine,
    column="generic",
    title="Generic Medicines",
    xlabel="Count",
)

# Medicine Data Analysis
### Name: Ranni Rey I. Guadalupe
### Section: BSCPE 2-A

Building on the data cleaning above, the following cells extend the analysis of the medicine dataset with comparisons and further visualizations: a scatter plot, a pie chart, a correlation heatmap, a count plot, and a box plot.

In [None]:
# Scatter Plot: Package Size vs. Strength
# Both columns are parsed the same way: take the first number in each entry,
# keeping any decimal part. Entries without a number become NaN instead of
# raising on float(""), and separate numbers are no longer fused into one value.
first_number = r"(\d+(?:\.\d+)?)"
package_size_num = pd.to_numeric(
    df_medicine["Package Size"].astype(str).str.extract(first_number, expand=False),
    errors="coerce",
)
strength_num = pd.to_numeric(
    df_medicine["strength"].astype(str).str.extract(first_number, expand=False),
    errors="coerce",
)

plt.figure(figsize=(10, 5))
sns.scatterplot(x=package_size_num, y=strength_num, alpha=0.5, color='red')
plt.title("Scatter Plot: Package Size vs. Strength")
plt.xlabel("Package Size")
plt.ylabel("Strength")
plt.show()

In [None]:
# Pie Chart: Distribution of Medicine Types
# Each slice is the share of rows carrying one value of the "type" column.
plt.figure(figsize=(8, 8))
type_counts = df_medicine["type"].value_counts()
type_counts.plot(kind="pie", autopct="%1.1f%%", startangle=90, cmap='coolwarm')
plt.title("Distribution of Medicine Types")
# Blank out the y-label that pandas would otherwise put beside the pie.
plt.ylabel("")
plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations between the numeric columns only, annotated to two decimals.
plt.figure(figsize=(10, 6))
numeric_corr = df_medicine.corr(numeric_only=True)
sns.heatmap(numeric_corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

In [None]:
# Countplot: Most Common Manufacturers
manufacturers = df_medicine["manufacturer"]
# value_counts() sorts by frequency, so the first ten index entries are the top ten.
top_manufacturers = manufacturers.value_counts().index[:10].tolist()

plt.figure(figsize=(12, 5))
manufacturer_ax = sns.countplot(
    y=manufacturers,
    # seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the
    # manufacturer itself keeps one colour per bar.
    hue=manufacturers,
    order=top_manufacturers,
    hue_order=top_manufacturers,
    # Keep full-width bars instead of splitting each row by hue level.
    dodge=False,
    palette="viridis",
)
# Some seaborn versions add a legend once hue is set; it only repeats the
# y-axis labels, so drop it if present.
manufacturer_legend = manufacturer_ax.get_legend()
if manufacturer_legend is not None:
    manufacturer_legend.remove()
plt.title("Top 10 Most Common Manufacturers")
plt.xlabel("Count")
plt.ylabel("Manufacturer")
plt.show()

In [None]:
# Boxplot: Strength Distribution by Dosage Form
dosage_form = df_medicine["dosage form"]
# Take the first number in each "strength" entry, keeping any decimal part.
# Entries without a number become NaN instead of raising on float(""), and
# separate numbers in one entry are no longer fused into a single value.
strength_value = pd.to_numeric(
    df_medicine["strength"].astype(str).str.extract(r"(\d+(?:\.\d+)?)", expand=False),
    errors="coerce",
)

plt.figure(figsize=(12, 6))
strength_ax = sns.boxplot(
    x=dosage_form,
    y=strength_value,
    # seaborn >= 0.13 deprecates `palette` without `hue`; colouring by the
    # dosage form itself keeps one colour per box.
    hue=dosage_form,
    # Keep each box centred on its tick instead of offsetting it by hue level.
    dodge=False,
    palette="Set2",
)
# Some seaborn versions add a legend once hue is set; it only repeats the
# x-axis labels, so drop it if present.
strength_legend = strength_ax.get_legend()
if strength_legend is not None:
    strength_legend.remove()
plt.xticks(rotation=90)
plt.title("Strength Distribution by Dosage Form")
plt.xlabel("Dosage Form")
plt.ylabel("Strength")
plt.show()