# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing',
           'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
           'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color',
           'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Column names are supplied explicitly via `names=`, so the first line of the
# file is read as data rather than as a header.
data = pd.read_csv(dataset_url, names=columns)

# Display basic info (`display` is provided by the IPython/Jupyter runtime)
display(data.head())
print("\nDataset Shape:", data.shape)
print("\nMissing Values:\n", data.isnull().sum())

# isnull() only detects real NaN values. Entries written as a literal '?'
# placeholder are invisible to it, so count them separately. The data itself
# is left untouched: '?' stays a category of its own in the encoding below.
placeholder_counts = (data == '?').sum()
print("\nMissing Values encoded as '?':\n", placeholder_counts[placeholder_counts > 0])

# Check for duplicate entries
duplicates = data.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates)

# Encoding categorical variables
# Every column (including the target 'class') is replaced in place by integer
# codes. The fitted encoders are kept per column so the codes can be mapped
# back to the original labels with `inverse_transform`.
label_encoders = {}
for col in data.columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Summary statistics
print("\nSummary Statistics:\n", data.describe())

# Visualizations
plt.figure(figsize=(12, 6))
sns.countplot(x='class', data=data)
plt.title('Edibility Distribution')
plt.show()

# Correlation matrix
# A column with a single distinct value has zero variance, so its correlation
# with everything else is NaN and would show up as a blank stripe in the
# heatmap. Leave such columns out of the matrix and report which were skipped.
constant_columns = [col for col in data.columns if data[col].nunique() <= 1]
if constant_columns:
    print("\nConstant columns excluded from correlation matrix:", constant_columns)
correlation_matrix = data.drop(columns=constant_columns).corr()

# NOTE: the integer codes come from label encoding of categorical values, so
# these correlations describe the codes, not an inherent ordering of categories.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
plt.title('Feature Correlation Matrix')
plt.show()

# Outlier detection with boxplots
# NOTE: the boxes are drawn over label-encoded codes; points flagged as
# outliers are categories whose code is far from the bulk, so interpret with care.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data.drop(columns=['class']))
plt.xticks(rotation=90)
plt.title('Feature Distribution with Outliers')
plt.show()

print("Data Preprocessing and EDA Completed.")
