In [2]:
# ============ COMPLETE SOLUTION WITH ALL IMPORTS ============
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# ============ TASK 3A: CLASSIFICATION ============
def perform_classification(X_train, X_test, y_train, y_test):
    """Train a decision tree and a k-NN classifier and compare them.

    Both models are fitted on the training split. The fitted tree is
    rendered to ``decision_tree.png`` in the current working directory, a
    classification report is printed for each model on the test split, and
    the model with the higher test accuracy is announced.

    Args:
        X_train: Training features. When it exposes ``columns`` (e.g. a
            DataFrame), those names label the nodes of the tree plot.
        X_test: Test features, laid out like ``X_train``.
        y_train: Training labels.
        y_test: Test labels.
    """
    print("\n=== CLASSIFICATION ANALYSIS ===")

    # 1. Decision Tree Classifier
    print("\n1. Decision Tree Classifier:")
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_train, y_train)
    y_pred_dt = dt.predict(X_test)

    # plot_tree expects plain sequences of strings. Class names must follow
    # the estimator's own (sorted) class order, which ``y_train.unique()``
    # does not guarantee because it returns labels in order of appearance.
    columns = getattr(X_train, "columns", None)
    feature_names = None if columns is None else [str(c) for c in columns]
    class_names = [str(label) for label in dt.classes_]

    # Visualization
    plt.figure(figsize=(20, 10))
    plot_tree(dt, feature_names=feature_names,
              class_names=class_names, filled=True)
    plt.savefig('decision_tree.png', bbox_inches='tight')
    plt.close()
    print("- Saved decision tree visualization as decision_tree.png")

    # 2. KNN Classifier
    print("\n2. K-Nearest Neighbors (k=5):")
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)

    # 3. Performance Comparison
    print("\nDecision Tree Performance:")
    print(classification_report(y_test, y_pred_dt))

    print("\nKNN Performance:")
    print(classification_report(y_test, y_pred_knn))

    # Determine which performs better; compute each accuracy once and
    # report a tie explicitly instead of silently crediting KNN.
    acc_dt = accuracy_score(y_test, y_pred_dt)
    acc_knn = accuracy_score(y_test, y_pred_knn)
    if acc_dt > acc_knn:
        print("\nBest model: Decision Tree")
    elif acc_knn > acc_dt:
        print("\nBest model: KNN")
    else:
        print("\nBest model: Tie (equal accuracy)")

# ============ TASK 3B: ASSOCIATION RULE MINING ============ 
def generate_transactions(n_transactions=100, seed=None):
    """Generate synthetic market basket data.

    Each transaction starts as a random sample of 2-5 distinct items. With
    probability ~0.3 the pair milk/bread is added, and with probability
    ~0.2 the pair beer/diapers is added, so those pairs co-occur more often
    than chance.

    Args:
        n_transactions: Number of transactions to generate (default 100).
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the result is reproducible; when ``None`` the
            module-level ``random`` state is used, as before.

    Returns:
        A list of transactions, each a list of unique item names.
    """
    items = ['milk', 'bread', 'eggs', 'diapers', 'beer',
             'cheese', 'wine', 'fruit', 'vegetables']

    # ``random`` (the module) and ``random.Random`` expose the same
    # sample/randint/random functions, so either can serve as the source.
    rng = random if seed is None else random.Random(seed)

    transactions = []
    for _ in range(n_transactions):
        # Base transaction
        basket = rng.sample(items, k=rng.randint(2, 5))

        # Add some common patterns
        if rng.random() > 0.7:
            basket.extend(['milk', 'bread'])
        if rng.random() > 0.8:
            basket.extend(['beer', 'diapers'])

        # Remove duplicates while keeping first-seen order; a plain set()
        # would order items by string hash, which varies between runs.
        transactions.append(list(dict.fromkeys(basket)))

    return transactions

def perform_association_mining():
    """Mine association rules from synthetic market basket data.

    Generates transactions, one-hot encodes them, finds frequent itemsets
    with Apriori (min support 0.2), derives rules with lift >= 1.2, prints
    the five strongest by lift, and writes them to
    ``association_rules.csv`` in the current working directory. If no rule
    qualifies, the CSV contains only a header row and the strongest-rule
    summary is skipped.
    """
    print("\n=== ASSOCIATION RULE MINING ===")

    # Generate transaction data
    transactions = generate_transactions()
    print(f"Generated {len(transactions)} transactions")

    # Transform to one-hot encoded format
    te = TransactionEncoder()
    te_ary = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Find frequent itemsets
    frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)

    report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

    # Generate association rules. The data is random, so there may be no
    # frequent itemsets at all; association_rules rejects an empty frame,
    # so fall back to an empty rule table in that case.
    if frequent_itemsets.empty:
        rules = pd.DataFrame(columns=report_columns)
    else:
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
        rules = rules.sort_values('lift', ascending=False)

    # Save top 5 rules
    top_rules = rules.head(5)
    print("\nTop 5 Association Rules:")
    print(top_rules[report_columns])

    # Save to CSV
    top_rules.to_csv('association_rules.csv', index=False)
    print("- Saved rules as association_rules.csv")

    # Nothing to analyze when no rule met the threshold; indexing the
    # first row of an empty frame would raise IndexError.
    if rules.empty:
        print("\nNo association rules met the lift threshold; nothing to analyze")
        return

    # Analyze strongest rule
    best_rule = rules.iloc[0]
    print(f"\nStrongest rule: {list(best_rule['antecedents'])} → {list(best_rule['consequents'])}")
    print(f"Lift: {best_rule['lift']:.2f} (Confidence: {best_rule['confidence']:.2f})")
    print("Business implication: These items should be placed together in store")

# ============ MAIN EXECUTION ============
if __name__ == "__main__":
    # Load Iris data into a DataFrame with a human-readable species column
    iris = load_iris()
    feature_cols = list(iris.feature_names)
    df = pd.DataFrame(iris.data, columns=feature_cols)
    df['species'] = iris.target_names[iris.target]

    # Train-test split first, so the test rows stay unseen during scaling
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df['species'],
        test_size=0.2, random_state=42, stratify=df['species'],
    )

    # Normalize features: fit the scaler on the training split only and
    # reuse it for the test split. Fitting on the full dataset would leak
    # test-set min/max values into training. Results are re-wrapped as
    # DataFrames because perform_classification reads the column names.
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=feature_cols, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=feature_cols, index=X_test.index
    )

    # Run classification
    perform_classification(X_train, X_test, y_train, y_test)

    # Run association mining
    perform_association_mining()

    print("\n=== ALL TASKS COMPLETED ===")
    print("Generated files:")
    print("- decision_tree.png (classification)")
    print("- association_rules.csv (market basket analysis)")
    print("\nTo install all required packages:")
    print("pip install pandas scikit-learn matplotlib mlxtend")


=== CLASSIFICATION ANALYSIS ===

1. Decision Tree Classifier:
- Saved decision tree visualization as decision_tree.png

2. K-Nearest Neighbors (k=5):

Decision Tree Performance:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30


KNN Performance:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.91      1.00      0.95        10
   virginica       1.00      0.90      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30


Best model: KNN

=== ASSOCIATION RULE MI