In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

import os
# Kaggle starter boilerplate: print the full path of every file found under
# the input mount. Prints nothing when the directory does not exist.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Load the processed dataset; the path is relative to the notebook's working directory.
csv_path = "./data/processed/collected_data.csv"
data = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows to sanity-check the load.
data.head(n=5)

### Check null values and data types

In [None]:
# Print a structural summary of the frame: column names, non-null counts,
# dtypes and memory usage. A non-null count lower than the number of entries
# indicates a column with missing values.
data.info()

### Display summary statistics of this dataset

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments only numeric columns are summarised when any are present.
data.describe()

### Display 10 random memes

In [None]:
import random
from PIL import Image

# Preview a handful of memes together with their labels.
base_dir = "./data/raw/HM Dataset/img"
N_SAMPLES = 10    # how many memes to preview
N_COLS = 5        # images per row in the preview grid
SAMPLE_SEED = 42  # fixed so Restart & Run All shows the same memes; set to None for a fresh draw each run

# Cap the request at the frame size: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
sample_data = data.sample(min(N_SAMPLES, len(data)), random_state=SAMPLE_SEED)

# Rows needed to hold every sampled image (ceiling division), never fewer than one.
n_rows = max(1, -(-len(sample_data) // N_COLS))

# Plot the sampled images
plt.figure(figsize=(15, 8))
for i, (_, row) in enumerate(sample_data.iterrows()):
    # Only the file name from the 'img' column is used; the directory comes from base_dir.
    img_path = os.path.join(base_dir, os.path.basename(row['img']))
    if os.path.exists(img_path):
        # Image.open is lazy and keeps the file handle open; the context manager
        # releases it once imshow has converted the image to an array.
        with Image.open(img_path) as img:
            plt.subplot(n_rows, N_COLS, i + 1)
            plt.imshow(img)
        plt.axis('off')
        plt.title(f"Label: {row['label']}")
    else:
        # Missing files leave their grid slot empty instead of aborting the preview.
        print(f"⚠️ Image not found: {img_path}")

plt.tight_layout()
plt.show()

### Label Distribution

In [None]:
# Absolute frequency of each label value (value_counts skips missing labels by default).
label_counts = data['label'].value_counts()
# The same frequencies expressed as a percentage of all counted labels.
label_percentages = data['label'].value_counts(normalize=True).mul(100)

# Combine both views into one table; the percentage column is rendered as
# text rounded to two decimals with a trailing '%'.
label_distribution = pd.DataFrame({'Count': label_counts})
label_distribution['Percentage'] = label_percentages.round(2).astype(str) + '%'

print("--- Label Distribution (Counts and Percentages) ---", label_distribution, sep="\n")

In [None]:
# Bar chart of the label counts computed in the previous cell, drawn through
# the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
label_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)

ax.set_title('Distribution of Labels in the Dataset')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=0)  # keep the label names horizontal
ax.grid(axis='y', linestyle='--')

# Write each bar's count just above its top edge.
for bar_pos, bar_height in enumerate(label_counts):
    ax.text(bar_pos, bar_height + 0.1, str(bar_height), ha='center', va='bottom')

fig.tight_layout()
plt.show()