# Practical 2: Data Preprocessing and Visualization
## Diabetic Retinopathy Image Dataset

### 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
# Hide every library warning so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Plot defaults: white grid background and a 12x6 inch default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

### 2. Load Dataset

In [None]:
# Build the image index `df` (one row per image). A cached 'dataset_info.csv'
# is used when present; otherwise the index is rebuilt by walking the category
# sub-folders of the image directory.
if os.path.exists('dataset_info.csv'):
    df = pd.read_csv('dataset_info.csv')
    print(f"Dataset loaded: {len(df)} images")
else:
    dataset_path = 'colored_images/colored_images/'
    # os.listdir() returns entries in arbitrary order. Sorting keeps the row
    # order stable, so the seeded df.sample(...) used for feature extraction
    # selects the same images on every machine.
    categories = sorted(
        d for d in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, d))
    )

    data = []
    for category in categories:
        category_path = os.path.join(dataset_path, category)
        # Compare the extension case-insensitively so '.PNG' files are kept.
        images = sorted(
            f for f in os.listdir(category_path)
            if f.lower().endswith('.png')
        )

        for img_name in images:
            img_path = os.path.join(category_path, img_name)
            # Any file name that does not contain 'left' is labelled 'right'.
            eye_side = 'left' if 'left' in img_name else 'right'
            # The patient id is the part of the file name before the first '_'.
            patient_id = img_name.split('_')[0]

            data.append({
                'filename': img_name,
                'filepath': img_path,
                'category': category,
                'patient_id': patient_id,
                'eye_side': eye_side,
            })

    # Explicit columns keep the frame well-formed even when no image is found,
    # so later column look-ups do not fail with a KeyError.
    df = pd.DataFrame(
        data,
        columns=['filename', 'filepath', 'category', 'patient_id', 'eye_side'],
    )
    print(f"Dataset created: {len(df)} images")

df.head()

### 3. Data Cleaning - Check for Missing Values

In [None]:
# Show the number of missing entries per column, then the overall total.
print("Missing values in dataset:")
missing_per_column = df.isnull().sum()
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")

### 4. Extract Image Features

In [None]:
# Compute simple size/colour statistics for a seeded random sample of at most
# 200 images and append them as new columns of `sample_df`.
sample_size = min(200, len(df))
sample_df = df.sample(n=sample_size, random_state=42).copy()

# Feature columns produced per image; also used to build the all-NaN
# placeholder row for images that cannot be read.
feature_names = ['width', 'height', 'mean_red', 'mean_green', 'mean_blue',
                 'std_red', 'std_green', 'std_blue', 'brightness']

features = []
unreadable = []  # (filepath, exception) for every image that failed to load
for idx, row in sample_df.iterrows():
    filepath = row.get('filepath')
    try:
        # The context manager closes the file handle once the pixels are copied.
        with Image.open(filepath) as img:
            width, height = img.size
            # Force three 8-bit channels so grayscale, palette and RGBA files
            # are measured the same way as plain RGB ones.
            img_array = np.array(img.convert('RGB'))
    except Exception as exc:
        # Best effort: keep the row with NaN features (imputed in the next
        # step) but remember the failure. Unlike a bare `except:`, this still
        # lets KeyboardInterrupt stop a long-running cell.
        unreadable.append((filepath, exc))
        features.append(dict.fromkeys(feature_names, np.nan))
        continue

    features.append({
        'width': width,
        'height': height,
        'mean_red': img_array[:, :, 0].mean(),
        'mean_green': img_array[:, :, 1].mean(),
        'mean_blue': img_array[:, :, 2].mean(),
        'std_red': img_array[:, :, 0].std(),
        'std_green': img_array[:, :, 1].std(),
        'std_blue': img_array[:, :, 2].std(),
        'brightness': img_array.mean(),
    })

if unreadable:
    first_path, first_error = unreadable[0]
    print(f"Warning: {len(unreadable)} image(s) could not be read and were given NaN features "
          f"(first failure: {first_path}: {first_error!r})")

# Explicit columns keep the feature columns present even for an empty sample.
features_df = pd.DataFrame(features, columns=feature_names)
# Both frames are in the same row order, so a positional concat lines them up.
sample_df = pd.concat([sample_df.reset_index(drop=True), features_df], axis=1)

print(f"Features extracted for {len(sample_df)} images")
sample_df.head()

### 5. Handle Missing Values and Outliers

In [None]:
# Impute missing feature values with the column median and report the
# missing-value counts before and after.
print("Missing values after feature extraction:")
print(sample_df.isnull().sum())

numeric_cols = ['width', 'height', 'mean_red', 'mean_green', 'mean_blue', 'std_red', 'std_green', 'std_blue', 'brightness']
for col in numeric_cols:
    if sample_df[col].isnull().any():
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on sample_df[col]: that is chained assignment,
        # and with pandas Copy-on-Write it only modifies a temporary object,
        # leaving sample_df unchanged.
        sample_df[col] = sample_df[col].fillna(sample_df[col].median())

print("\nMissing values after filling:")
print(sample_df.isnull().sum())

In [None]:
def detect_outliers(df, column):
    """Count the values of ``df[column]`` that fall outside the 1.5 * IQR fences.

    Args:
        df: DataFrame holding the data.
        column: Name of the numeric column to inspect.

    Returns:
        Number of rows whose value is below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - fence
    too_high = values > third_quartile + fence
    return len(df[too_low | too_high])

# Print the IQR-outlier count for brightness and for each channel mean.
print("Outliers detected:")
for col in ('brightness', 'mean_red', 'mean_green', 'mean_blue'):
    print(f"{col}: {detect_outliers(sample_df, col)} outliers")

### 6. Data Visualization - Distribution Plots

In [None]:
# Bar chart of the number of images per category, with the count written
# above each bar.
category_counts = df['category'].value_counts()

plt.figure(figsize=(10, 6))
# NOTE(review): seaborn 0.13+ deprecates `palette` without `hue` - confirm the
# installed version before changing this call.
sns.barplot(
    x=category_counts.index,
    y=category_counts.values,
    palette='viridis',
)
plt.title('Distribution of Images Across Categories', fontsize=16, fontweight='bold')
plt.xlabel('Category', fontsize=12)
plt.ylabel('Number of Images', fontsize=12)
plt.xticks(rotation=45)

# Each label sits 10 data units above the top of its bar.
for position, count in enumerate(category_counts.values):
    plt.text(position, count + 10, str(count), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Pie chart of the share of images per eye side.
eye_counts = df['eye_side'].value_counts()

plt.figure(figsize=(8, 6))
plt.pie(
    eye_counts.values,
    labels=eye_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff'],
)
plt.title('Distribution of Left vs Right Eye Images', fontsize=14, fontweight='bold')
plt.show()

### 7. Feature Visualization

In [None]:
# Box plot comparing the brightness of the sampled images across categories.
plt.figure(figsize=(12, 6))
# NOTE(review): seaborn 0.13+ deprecates `palette` without `hue` - confirm the
# installed version before changing this call.
sns.boxplot(
    x='category',
    y='brightness',
    data=sample_df,
    palette='Set2',
)
plt.title('Brightness Distribution by Category', fontsize=14, fontweight='bold')
plt.xlabel('Category', fontsize=12)
plt.ylabel('Brightness', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One histogram with a KDE overlay per colour channel, drawn left to right.
# The channel name doubles as the column suffix and as the plot colour.
for ax, channel in zip(axes, ('red', 'green', 'blue')):
    channel_title = channel.capitalize()
    sns.histplot(sample_df[f'mean_{channel}'], kde=True, color=channel, ax=ax)
    ax.set_title(f'{channel_title} Channel Distribution', fontweight='bold')
    ax.set_xlabel(f'Mean {channel_title} Value')

plt.tight_layout()
plt.show()

### 8. Correlation Analysis

In [None]:
# Pairwise correlation of the colour statistics, drawn as an annotated heatmap.
correlation_cols = ['mean_red', 'mean_green', 'mean_blue', 'std_red', 'std_green', 'std_blue', 'brightness']
correlation_matrix = sample_df[correlation_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 9. Feature Scaling

In [None]:
# Standardise the colour statistics in place; width and height are not scaled.
features_to_scale = ['mean_red', 'mean_green', 'mean_blue', 'std_red', 'std_green', 'std_blue', 'brightness']

scaler = StandardScaler()
scaler.fit(sample_df[features_to_scale])
sample_df[features_to_scale] = scaler.transform(sample_df[features_to_scale])

print("Features scaled successfully!")
print("\nScaled feature statistics:")
scaled_statistics = sample_df[features_to_scale].describe()
print(scaled_statistics)

### 10. Label Encoding

In [None]:
# Encode the two categorical columns as integers, then print each mapping.
le_category = LabelEncoder()
le_eye = LabelEncoder()

sample_df['category_encoded'] = le_category.fit_transform(sample_df['category'])
sample_df['eye_side_encoded'] = le_eye.fit_transform(sample_df['eye_side'])

# The position of a label in `classes_` is the integer code assigned to it.
for heading, encoder in (
    ("Category encoding:", le_category),
    ("\nEye side encoding:", le_eye),
):
    print(heading)
    for code, label in enumerate(encoder.classes_):
        print(f"{label}: {code}")

### 11. Save Preprocessed Data

In [None]:
# Write the preprocessed sample to CSV (without the index) and describe it.
output_file = 'preprocessed_data.csv'
sample_df.to_csv(output_file, index=False)

print(f"Preprocessed data saved to '{output_file}'")
print(f"\nFinal dataset shape: {sample_df.shape}")
print(f"Columns: {sample_df.columns.tolist()}")

### 12. Summary Report

In [None]:
# Closing report. Only the two counts are computed; the other lines are fixed text.
banner = "=" * 60
summary_lines = [
    banner,
    "DATA PREPROCESSING SUMMARY",
    banner,
    f"Total images processed: {len(sample_df)}",
    f"Features extracted: {len(features_to_scale)}",
    "Missing values handled: Yes",
    "Outliers detected: Yes",
    "Feature scaling applied: StandardScaler",
    "Label encoding applied: Yes",
    banner,
]
print("\n".join(summary_lines))