# Task 1: Data Preprocessing and Exploration

## 1. Load the dataset in Python using pandas or scikit-learn

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Plot defaults for the notebook: seaborn's "whitegrid" theme first, then the
# default figure size (set afterwards so the theme call cannot override it).
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

def load_and_preprocess():
    """Load the Iris dataset into a DataFrame with a readable species column.

    Despite its name, this variant only loads the data; it does no missing
    value check and no scaling.

    Returns:
        tuple: ``(df, features)`` where ``df`` contains the raw feature
        columns, a ``target`` column and a ``species`` column, and
        ``features`` is the dataset's list of feature column names.
    """
    # 1. Load dataset
    iris = load_iris()
    features = iris['feature_names']
    # Stacking data and target into one array makes `target` share the
    # floating-point dtype of the measurements (0.0, 1.0, 2.0).
    df = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                      columns=features + ['target'])

    # Map target to species names
    df['species'] = df['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

    # The original built the frame and then discarded it (implicit None).
    return df, features
    

## 2. Preprocess

In [29]:
def complete_load_and_preprocess():
    """Load Iris, report missing values and min-max scale the feature columns.

    Prints the per-column count of null values as a side effect.

    Returns:
        tuple: ``(df, features)`` where ``df`` holds the scaled feature
        columns plus ``target`` and ``species``, and ``features`` is the
        dataset's list of feature column names.
    """
    species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

    # Step 1: assemble measurements and class label into a single frame.
    iris = load_iris()
    features = iris['feature_names']
    table = np.column_stack((iris['data'], iris['target']))
    df = pd.DataFrame(data=table, columns=features + ['target'])
    df['species'] = df['target'].map(species_by_code)

    # Step 2: report how many nulls each column contains.
    print("\nMissing values check:")
    print(df.isnull().sum())

    # Step 3: rescale the feature columns only; target/species stay as-is.
    scaled_values = MinMaxScaler().fit_transform(df[features])
    df[features] = scaled_values

    return df, features

## 3. Explore

In [41]:
# Complete Data Processing Pipeline
def explore_data_clean(df, features):
    """Print summary statistics and save exploratory plots as PNG files.

    Writes ``iris_pairplot.png``, ``iris_correlation.png`` and
    ``iris_boxplots.png`` (relative paths) and closes each figure after
    saving it.

    Args:
        df: DataFrame containing the columns named in ``features`` plus a
            ``species`` column used for colouring and grouping.
        features: List of feature column names to summarise and plot.

    Returns:
        None.
    """
    # 1. Summary statistics
    print("\nSummary statistics:")
    print(df[features].describe())

    # 2. Pairplot
    print("\nGenerating pairplot...")
    # Restrict the grid to the feature columns; without `vars`, every numeric
    # column is plotted, so a numeric `target` label would show up as an
    # extra "feature".
    sns.pairplot(df, vars=features, hue='species', palette='viridis')
    plt.savefig('iris_pairplot.png', dpi=300)
    plt.close()

    # 3. Correlation heatmap
    print("Generating correlation heatmap...")
    plt.figure(figsize=(8, 6))
    sns.heatmap(df[features].corr(), annot=True, cmap='coolwarm', center=0)
    plt.title('Feature Correlation Heatmap')
    # Same layout pass as the boxplot figure, so tick labels fit in the image.
    plt.tight_layout()
    plt.savefig('iris_correlation.png', dpi=300)
    plt.close()

    # 4. Outlier detection with boxplots
    print("Generating boxplots...")
    plt.figure(figsize=(12, 6))
    # Long format: one row per (species, feature, value) for grouped boxes.
    df_melted = df.melt(id_vars='species', value_vars=features)
    sns.boxplot(x='variable', y='value', hue='species', data=df_melted, palette='Set2')
    plt.title('Feature Distribution by Species')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('iris_boxplots.png', dpi=300)
    plt.close()

def split_data_clean(df, features, test_size=0.2, random_state=42):
    """Partition the feature columns and the ``target`` column into train/test.

    Args:
        df: DataFrame holding the ``features`` columns and a ``target`` column.
        features: Names of the columns used as model inputs.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        The result of ``train_test_split`` for the feature frame and the
        target series.
    """
    inputs, labels = df[features], df['target']
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(inputs, labels, **split_options)

# Run the whole pipeline: load -> explore -> split -> save.
print("=== IRIS DATA PROCESSING PIPELINE ===")

# Step 1: build the frame only when `df` and `features` are not both already
# present in this namespace (e.g. left over from an earlier notebook cell).
if not ('df' in locals() and 'features' in locals()):
    df, features = complete_load_and_preprocess()

print("\nFirst 5 rows:", df.head(), sep="\n")

# Step 2: summary statistics and plots (writes the three PNG files).
explore_data_clean(df, features)

# Step 3: train/test split with the function's default size and seed.
X_train, X_test, y_train, y_test = split_data_clean(df, features)
print(
    "\nTrain/test split results:",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Feature names: {features}",
    sep="\n",
)

# Step 4: persist the processed frame and list everything that was written.
df.to_csv('processed_iris.csv', index=False)
print(
    "\nProcessing complete. Files created:",
    "- iris_pairplot.png",
    "- iris_correlation.png",
    "- iris_boxplots.png",
    "- processed_iris.csv",
    sep="\n",
)

print("\n=== PIPELINE COMPLETED SUCCESSFULLY ===")

=== IRIS DATA PROCESSING PIPELINE ===

First 5 rows:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0           0.222222          0.625000           0.067797          0.041667   
1           0.166667          0.416667           0.067797          0.041667   
2           0.111111          0.500000           0.050847          0.041667   
3           0.083333          0.458333           0.084746          0.041667   
4           0.194444          0.666667           0.067797          0.041667   

   target species  
0     0.0  setosa  
1     0.0  setosa  
2     0.0  setosa  
3     0.0  setosa  
4     0.0  setosa  

Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            0.428704          0.440556           0.467458   
std             0.230018          0.181611           0.299203   
min             0.000000          0.000000           0.000000   
25%   

In [48]:
def generate_synthetic_data(n_samples=150, random_state=42):
    """Generate Iris-like synthetic data from three Gaussian clusters.

    Args:
        n_samples: Total number of rows to generate. Rows are divided as
            evenly as possible between the three clusters; any remainder
            goes to the earliest clusters.
        random_state: Seed passed to ``np.random.seed``. Note that this
            reseeds NumPy's global random state, as the original code did.

    Returns:
        tuple: ``(df, features)`` where ``df`` has the four feature columns,
        an integer ``target`` column (0, 1, 2) and a ``species`` column, and
        ``features`` is the list of feature column names.

    Raises:
        ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    np.random.seed(random_state)
    features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    species_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

    # (mean, standard deviation) per feature for each of the three clusters.
    cluster_params = [
        ([5.0, 3.4, 1.5, 0.2], [0.4, 0.3, 0.2, 0.1]),
        ([6.0, 2.8, 4.5, 1.3], [0.5, 0.3, 0.3, 0.2]),
        ([6.7, 3.0, 5.7, 2.0], [0.4, 0.3, 0.4, 0.3]),
    ]

    # Cluster sizes always sum to n_samples; for the default 150 each is 50.
    base, extra = divmod(n_samples, len(cluster_params))
    sizes = [base + (1 if i < extra else 0) for i in range(len(cluster_params))]

    # Draw the clusters in order so the default call reproduces the same
    # values as three consecutive np.random.normal calls after seeding.
    clusters = [
        np.random.normal(loc=loc, scale=scale, size=(size, len(features)))
        for (loc, scale), size in zip(cluster_params, sizes)
    ]
    data = np.vstack(clusters)

    # Labels are derived from the actual cluster sizes instead of a
    # hard-coded 50 per class, so they always line up with `data`.
    target = np.repeat(np.arange(len(cluster_params)), sizes)

    df = pd.DataFrame(data, columns=features)
    df['target'] = target
    df['species'] = df['target'].map(species_names)

    return df, features