In [1]:
# ============ COMPLETE ERROR-FREE SOLUTION ============
# Import all required libraries at the top
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Seaborn is an optional dependency: record whether it could be imported so
# the plotting code can fall back to plain matplotlib when it is missing.
try:
    import seaborn as sns
except ImportError:
    SEABORN_AVAILABLE = False
    print("Note: Seaborn not available - using matplotlib for visualizations")
else:
    SEABORN_AVAILABLE = True

def preprocess_iris_data():
    """Load, explore, scale and split the Iris dataset.

    Prints a missing-value report and summary statistics, saves three
    exploratory figures (``iris_boxplot.png``, ``iris_heatmap.png`` and
    ``iris_scatter_matrix.png``) to the current working directory, and
    returns a stratified 80/20 train/test split.

    The exploratory output is computed on a copy of the full dataset whose
    features are min-max scaled. The *returned* features are scaled with a
    separate ``MinMaxScaler`` fitted on the training rows only, so that no
    statistics of the held-out rows leak into the training data. As a
    consequence, scaled test values may fall slightly outside ``[0, 1]``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the ``X_*``
        objects are DataFrames of the scaled feature columns (original row
        index preserved) and the ``y_*`` objects are Series of species
        names.
    """
    print("\n=== IRIS DATA PREPROCESSING ===")

    # Load data
    iris = load_iris()
    feature_names = list(iris.feature_names)
    raw = pd.DataFrame(iris.data, columns=feature_names)
    raw['species'] = iris.target_names[iris.target]

    # Check for missing values
    print("\nMissing values check:")
    print(raw.isnull().sum())

    # Scaled copy used only for the exploratory statistics and figures below.
    df = raw.copy()
    df[feature_names] = MinMaxScaler().fit_transform(raw[feature_names])

    # Summary statistics
    print("\nSummary statistics:")
    print(df.describe())

    # Visualizations
    print("\nGenerating visualizations...")

    # Boxplot (works with or without seaborn)
    plt.figure(figsize=(12, 6))
    df.boxplot()
    plt.title('Feature Distributions with Outliers')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('iris_boxplot.png')
    plt.close()
    print("- Saved boxplot as iris_boxplot.png")

    # Correlation heatmap (seaborn when available, matplotlib otherwise)
    plt.figure(figsize=(8, 6))
    corr = df.corr(numeric_only=True)
    if SEABORN_AVAILABLE:
        sns.heatmap(corr, annot=True, cmap='coolwarm')
    else:
        plt.imshow(corr, cmap='coolwarm')
        plt.colorbar()
        plt.xticks(range(len(corr.columns)), corr.columns, rotation=45)
        plt.yticks(range(len(corr.columns)), corr.columns)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.savefig('iris_heatmap.png')
    plt.close()
    print("- Saved correlation heatmap as iris_heatmap.png")

    # Pairplot alternative; scatter_matrix opens its own figure, which then
    # becomes the current figure for suptitle/savefig/close.
    pd.plotting.scatter_matrix(df[feature_names], figsize=(12, 8), diagonal='kde')
    plt.suptitle('Feature Relationships')
    plt.tight_layout()
    plt.savefig('iris_scatter_matrix.png')
    plt.close()
    print("- Saved scatter matrix as iris_scatter_matrix.png")

    # Train-test split on the RAW features, so the scaler can be fitted on
    # the training rows alone.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        raw[feature_names],
        raw['species'],
        test_size=0.2,
        random_state=42,
        stratify=raw['species'],
    )

    # Fit on train only, then apply the same transform to both partitions.
    scaler = MinMaxScaler().fit(X_train_raw)
    X_train = pd.DataFrame(
        scaler.transform(X_train_raw),
        columns=feature_names,
        index=X_train_raw.index,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        columns=feature_names,
        index=X_test_raw.index,
    )

    print("\nData split complete:")
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")

    return X_train, X_test, y_train, y_test

def perform_clustering(X_train, y_train):
    """Run K-Means on the training features and compare with the species.

    Fits K-Means with k=3 and reports the Adjusted Rand Index (ARI) against
    ``y_train``, saves an elbow plot of inertia for k = 1..9
    (``elbow_plot.png``), reports the ARI for k=2 and k=4, and saves a
    side-by-side scatter plot of the actual species and the k=3 cluster
    assignment (``clusters_vs_actual.png``). Both scatter panels use the
    third and fourth feature columns of ``X_train``.

    Args:
        X_train: DataFrame of feature columns; must have at least four
            columns and share its index with ``y_train``.
        y_train: Series of species names aligned with ``X_train``.

    Returns:
        None. Results are printed and figures are written to the current
        working directory.
    """
    print("\n=== CLUSTERING ANALYSIS ===")

    # K-Means with k=3. These labels get their own name because the k=2/k=4
    # comparison below must not overwrite what the final plot displays.
    kmeans_k3 = KMeans(n_clusters=3, random_state=42)
    kmeans_k3.fit(X_train)
    labels_k3 = kmeans_k3.predict(X_train)

    # Compare with actual classes
    ari_k3 = adjusted_rand_score(y_train, labels_k3)
    print(f"\nAdjusted Rand Index (k=3): {ari_k3:.3f}")

    # Elbow method
    print("\nDetermining optimal k with elbow method...")
    k_range = range(1, 10)
    distortions = [
        KMeans(n_clusters=k, random_state=42).fit(X_train).inertia_
        for k in k_range
    ]

    plt.figure(figsize=(8, 5))
    plt.plot(k_range, distortions, 'bx-')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')
    plt.title('Elbow Method for Optimal k')
    plt.savefig('elbow_plot.png')
    plt.close()
    print("- Saved elbow plot as elbow_plot.png")

    # Try k=2 and k=4 (reported only; not used for plotting)
    for k in [2, 4]:
        alt_labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_train)
        alt_ari = adjusted_rand_score(y_train, alt_labels)
        print(f"Adjusted Rand Index (k={k}): {alt_ari:.3f}")

    # Visualization of clusters on the third and fourth feature columns
    x_values = X_train.iloc[:, 2]
    y_values = X_train.iloc[:, 3]
    x_name, y_name = X_train.columns[2], X_train.columns[3]

    plt.figure(figsize=(12, 6))

    # Actual species plot
    plt.subplot(1, 2, 1)
    colors = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
    for species, color in colors.items():
        mask = y_train == species
        plt.scatter(x_values[mask], y_values[mask], c=color, label=species)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('Actual Species')
    plt.legend()

    # Clusters plot: coloured by the k=3 assignment, matching the title
    plt.subplot(1, 2, 2)
    plt.scatter(x_values, y_values, c=labels_k3, cmap='viridis')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title('K-Means Clusters (k=3)')

    plt.tight_layout()
    plt.savefig('clusters_vs_actual.png')
    plt.close()
    print("- Saved cluster comparison as clusters_vs_actual.png")

# ============ MAIN EXECUTION ============
if __name__ == "__main__":
    # Banner describing what the run is about to do.
    print(
        "=== IRIS DATASET ANALYSIS ===",
        "This script will:",
        "1. Preprocess the Iris dataset",
        "2. Perform clustering analysis",
        "3. Generate visualizations\n",
        sep="\n",
    )

    try:
        # Preprocess first; the split stays at module level so it remains
        # available after the run.
        X_train, X_test, y_train, y_test = preprocess_iris_data()

        # Cluster the training partition only.
        perform_clustering(X_train, y_train)

        print(
            "\n=== ANALYSIS COMPLETE ===",
            "Generated the following files:",
            "- iris_boxplot.png (feature distributions)",
            "- iris_heatmap.png (correlations)",
            "- iris_scatter_matrix.png (feature relationships)",
            "- elbow_plot.png (optimal cluster count)",
            "- clusters_vs_actual.png (cluster comparison)",
            sep="\n",
        )

    except Exception as exc:
        # Top-level boundary: report the failure and print setup hints
        # instead of letting the traceback propagate.
        print(f"\nError occurred: {exc}")
        print(
            "\nTroubleshooting tips:",
            "1. Ensure you have the required packages:",
            "   pip install pandas scikit-learn matplotlib",
            "2. If you want enhanced visualizations:",
            "   pip install seaborn",
            "3. Restart your Python kernel after installation",
            sep="\n",
        )

Note: Seaborn not available - using matplotlib for visualizations
=== IRIS DATASET ANALYSIS ===
This script will:
1. Preprocess the Iris dataset
2. Perform clustering analysis
3. Generate visualizations


=== IRIS DATA PREPROCESSING ===

Missing values check:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
species              0
dtype: int64

Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            0.428704          0.440556           0.467458   
std             0.230018          0.181611           0.299203   
min             0.000000          0.000000           0.000000   
25%             0.222222          0.333333           0.101695   
50%             0.416667          0.416667           0.567797   
75%             0.583333          0.541667           0.694915   
max             1.000000          1.000000           1.000000   

     