# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the red and white wine-quality tables (semicolon-delimited CSVs).
red_wine = pd.read_csv('winequality-red.csv', sep=';')
white_wine = pd.read_csv('winequality-white.csv', sep=';')

# Tag every row with its source table so the type survives the merge.
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'

# Stack both tables into one frame with a fresh 0..n-1 index.
wine_df = pd.concat([red_wine, white_wine], ignore_index=True)

# Normalise column names: trim whitespace, lower-case, spaces -> underscores.
wine_df.columns = wine_df.columns.str.strip().str.lower().str.replace(' ', '_')

# Preview data
print("Dataset shape:", wine_df.shape)
print("Columns:", wine_df.columns.tolist())
print(wine_df.head())

# NOTE: the multi-line pandas reports below are printed with sep=''.
# With print's default separator a space is inserted after the heading's
# trailing newline, which indents only the first row of the report and
# pushes it out of alignment with the rows that follow.

# Check for missing values
print("\nMissing values per column:\n", wine_df.isnull().sum(), sep='')

# Count fully identical rows, report the count, then drop them.
# All columns take part in the comparison, including wine_type, so a red
# and a white row with identical measurements are both kept.
num_duplicates = wine_df.duplicated().sum()
print(f"\nNumber of duplicate rows: {num_duplicates}")
wine_df = wine_df.drop_duplicates()

# Summary statistics (computed after duplicates were removed)
print("\nSummary statistics:\n", wine_df.describe(), sep='')

# Value counts of quality, ordered by score rather than by frequency
print(
    "\nWine quality distribution:\n",
    wine_df['quality'].value_counts().sort_index(),
    sep='',
)

# Create a quality label (optional - for classification task)
def quality_label(q, *, low_max=4, medium_max=6):
    """Map a numeric wine quality score to 'low', 'medium' or 'high'.

    Args:
        q: Quality score to classify.
        low_max: Highest score still labelled 'low' (inclusive).
        medium_max: Highest score still labelled 'medium' (inclusive).

    Returns:
        'low' if ``q <= low_max``, 'medium' if ``low_max < q <= medium_max``,
        'high' otherwise. None when ``q`` is NaN.
    """
    # NaN is the only value that is not equal to itself. Without this guard
    # both comparisons below are False for NaN, so a missing score would
    # silently be labelled 'high'.
    if q != q:
        return None
    if q <= low_max:
        return 'low'
    if q <= medium_max:
        return 'medium'
    return 'high'

# Derive the categorical label for each row from its numeric quality score.
wine_df['quality_label'] = wine_df['quality'].map(quality_label)

# ======================
# 📊 EDA Visualizations
# ======================

# Apply one seaborn theme to every figure below. set_theme is the current
# name for this call; sns.set is kept by seaborn only as an alias for it.
sns.set_theme(style='whitegrid', palette='muted')

# Quality distribution: number of wines per quality score, split by type.
plt.figure(figsize=(8, 5))
sns.countplot(data=wine_df, x='quality', hue='wine_type')
plt.title('Quality Distribution by Wine Type')
plt.show()

# Boxplot: alcohol vs quality, one box per (quality score, wine type) pair.
plt.figure(figsize=(10, 6))
sns.boxplot(data=wine_df, x='quality', y='alcohol', hue='wine_type')
plt.title('Alcohol Content by Wine Quality')
plt.show()

# Correlation heatmap over the numeric columns only (numeric_only=True
# leaves out text columns such as wine_type).
plt.figure(figsize=(12, 8))
corr = wine_df.corr(numeric_only=True)
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Feature Correlation Heatmap')
plt.show()

# Compare alcohol content between red and white.
# Each wine type is normalised to its own density (common_norm=False):
# with raw counts, whichever type has more rows would dominate the figure
# and the shapes of the two distributions could not be compared directly.
plt.figure(figsize=(8, 5))
sns.histplot(
    data=wine_df,
    x='alcohol',
    hue='wine_type',
    bins=30,
    kde=True,
    stat='density',
    common_norm=False,
)
plt.title('Alcohol Content Distribution')
plt.show()
