# Task 1: *Section 2 - Data Mining*

In [None]:
# Iris Data Preprocessing and Exploration
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# ----------------------------
# Step 1: Load the Iris dataset
# ----------------------------
iris = load_iris()

# Create a pandas DataFrame
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = iris.target  # Add species as numeric label initially

# Map numeric labels to actual species names for clarity
iris_df['species'] = iris_df['species'].map({0:'setosa', 1:'versicolor', 2:'virginica'})

# Display first 5 rows
print("First 5 rows of the dataset:")
print(iris_df.head())

In [None]:
# ---------------------------------
# Step 2: Preprocessing
# ---------------------------------

# 2a: Check for missing values
print("\nMissing values in each column:")
print(iris_df.isnull().sum())

# 2b: Normalize features using Min-Max scaling
scaler = MinMaxScaler()
feature_cols = iris.feature_names
iris_df[feature_cols] = scaler.fit_transform(iris_df[feature_cols])

# 2c: Encode class labels if needed (label encoding shown here)
label_encoder = LabelEncoder()
iris_df['species_encoded'] = label_encoder.fit_transform(iris_df['species'])

print("\nDataset after preprocessing (first 5 rows):")
print(iris_df.head())

In [None]:
# ---------------------------------
# Step 3: Data Exploration
# ---------------------------------

# 3a: Summary statistics
print("\nSummary statistics:")
print(iris_df.describe())

# 3b: Visualizations
# Pairplot
sns.pairplot(iris_df, hue='species')
plt.show()

# Correlation heatmap
plt.figure(figsize=(8,6))
sns.heatmap(iris_df[feature_cols].corr(), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

# 3c: Identify potential outliers using boxplots
plt.figure(figsize=(10,6))
iris_df.boxplot(column=feature_cols)
plt.title('Boxplot of Iris Features')
plt.show()


In [None]:
# ---------------------------------
# Step 4: Train/Test Split Function
# ---------------------------------

def split_train_test(dataframe, test_size=0.2, random_state=42):
    """Splits the dataset into train and test sets."""
    X = dataframe[feature_cols]  # Features
    y = dataframe['species_encoded']  # Target
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=test_size, 
                                                        random_state=random_state,
                                                        stratify=y)  # Stratify to preserve class balance
    return X_train, X_test, y_train, y_test

# Example usage
X_train, X_test, y_train, y_test = split_train_test(iris_df)
print("\nTrain features shape:", X_train.shape)
print("Test features shape:", X_test.shape)


# *Task 2: Clustering with K-Means*

In [None]:
# 2a: K-Means with k=3
kmeans3 = KMeans(n_clusters=3, random_state=42)
iris_df['cluster_k3'] = kmeans3.fit_predict(iris_df[feature_cols])

# Compare with actual classes using ARI
ari_k3 = adjusted_rand_score(iris_df['species_encoded'], iris_df['cluster_k3'])
print(f"Adjusted Rand Index for k=3: {ari_k3:.4f}")
# ----------------------------


In [None]:
# 2b: Experiment with k=2 and k=4; Elbow curve
inertia = []
k_values = range(1, 6)
for k in k_values:
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(iris_df[feature_cols])
    inertia.append(km.inertia_)

plt.figure(figsize=(8,5))
plt.plot(k_values, inertia, 'bo-')
plt.xlabel('Number of Clusters k')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
# 2c: Visualize clusters (petal length vs width) for k=3
plt.figure(figsize=(8,6))
sns.scatterplot(x='petal length (cm)', y='petal width (cm)', hue='cluster_k3',
                palette='Set1', data=iris_df)
plt.title('K-Means Clusters (k=3)')
plt.show()

In [None]:
# 2d: Cluster Analysis (Sample 150-200 words)
# ----------------------------
cluster_analysis = """
The K-Means clustering with k=3 successfully identified the three natural species clusters within the Iris dataset. 
The Adjusted Rand Index (ARI) indicates a strong alignment with actual species, suggesting high-quality clustering. 
Some misclassifications occur mainly between 'versicolor' and 'virginica', likely due to overlapping feature values in petal length and width. 
The elbow curve confirms k=3 as an optimal number of clusters, with diminishing inertia reduction for higher k. 
Visualizing clusters using petal length and width shows clear separation of 'setosa', while the other two species slightly overlap. 
In real-world applications, such clustering techniques are useful for customer segmentation, product categorization, and market analysis. 
If synthetic data were used, results might differ due to noise and variations in generated features, potentially affecting cluster purity and ARI scores.
"""
print(cluster_analysis)

# Task 3: Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Classification Report:\n")
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))

In [None]:
plt.figure(figsize=(12,8))
plot_tree(dt_model, feature_names=X_train.columns, class_names=label_encoder.classes_, filled=True)
plt.title("Decision Tree for Iris Dataset")
plt.show()

In [None]:
# KNN (k=5)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

print("KNN Classification Report:\n")
print(classification_report(y_test, y_pred_knn, target_names=label_encoder.classes_))

# Compare accuracies
dt_accuracy = accuracy_score(y_test, y_pred_dt)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"KNN Accuracy: {knn_accuracy:.4f}")

In [None]:
# =========================================
# Task 3: Association Rule Mining
# =========================================

import random
from mlxtend.frequent_patterns import apriori, association_rules

# Generate synthetic transactions
items_pool = ['milk','bread','beer','diapers','eggs','cheese','butter','coffee','tea','sugar',
              'apples','bananas','chocolate','cereal','yogurt','ham','juice','water','chips','cookies']

transactions = [random.sample(items_pool, k=random.randint(3,8)) for _ in range(30)]

# One-hot encode
all_items = sorted(items_pool)
encoded_vals = []
for t in transactions:
    row = {item: (item in t) for item in all_items}
    encoded_vals.append(row)
df_transactions = pd.DataFrame(encoded_vals)

# Apriori
frequent_itemsets = apriori(df_transactions, min_support=0.2, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

# Top 5 rules by lift
rules_sorted = rules.sort_values(by='lift', ascending=False)
print("Top 5 Association Rules:\n", rules_sorted.head())

# Analyze one rule
sample_rule = rules_sorted.iloc[0]
print("\nSample Rule Analysis:")
print(f"Rule: {sample_rule['antecedents']} -> {sample_rule['consequents']}")
print(f"Support: {sample_rule['support']:.2f}, Confidence: {sample_rule['confidence']:.2f}, Lift: {sample_rule['lift']:.2f}")