## Part 1: Data Loading and Preprocessing

In [None]:
# Import required libraries for Exercise 1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data.
# nltk.download() returns False when a resource cannot be fetched (for example
# when there is no network access), so record the failures instead of
# announcing success unconditionally.
nltk_resources = ('stopwords', 'punkt', 'punkt_tab')
nltk_failed = [name for name in nltk_resources if not nltk.download(name, quiet=True)]

print("="*70)
print("EXERCISE 1: IMDB SENTIMENT ANALYSIS")
print("="*70)
print("✓ All libraries imported successfully!")
if nltk_failed:
    # Previously installed copies may still be usable; this only reports the download step.
    print(f"⚠ NLTK data could not be downloaded: {', '.join(nltk_failed)}")
else:
    print("✓ NLTK data downloaded successfully!")

In [None]:
# Read the IMDB reviews CSV (path is relative to the notebook's working directory)
df_imdb = pd.read_csv('IMDB_Dataset.csv')

# Status line, dimensions and column names in one report, then a rich preview
print(
    "Dataset loaded successfully!",
    f"\nDataset Shape: {df_imdb.shape}",
    f"\nColumn Names: {df_imdb.columns.tolist()}",
    f"\nFirst 5 rows:",
    sep="\n",
)
display(df_imdb.head())

In [None]:
# Basic dataset information
print("Dataset Information:")
print("=" * 60)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_imdb.info()

# Count of null entries per column
print("\nMissing Values:")
print("=" * 60)
print(df_imdb.isnull().sum())

# Class balance, first as absolute counts and then as percentages
print("\nSentiment Distribution:")
print("=" * 60)
print(df_imdb['sentiment'].value_counts())
print(f"\nPercentage Distribution:")
print(df_imdb['sentiment'].value_counts(normalize=True) * 100)

In [None]:
# Class balance shown two ways: absolute counts (left) and percentage share (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Right panel: pie chart; wedge labels come from the value_counts index itself,
# so labels and values always stay paired.
sentiment_counts = df_imdb['sentiment'].value_counts()
axes[1].pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=['#ff9999', '#66b3ff'],
)
axes[1].set_title('Sentiment Distribution (Percentage)', fontsize=14, fontweight='bold')

# Left panel: count plot with the bar height written on top of each bar
sns.countplot(x='sentiment', data=df_imdb, palette='Set2', ax=axes[0])
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_xlabel('Sentiment', fontsize=12)
axes[0].set_title('Sentiment Distribution (Count)', fontsize=14, fontweight='bold')
for container in axes[0].containers:
    axes[0].bar_label(container)

plt.tight_layout()
plt.show()

In [None]:
# Initialize preprocessing tools
# NLTK's English stopword list, held as a set; preprocess_text tests each token
# for membership in it.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance reused by every preprocess_text call.
stemmer = PorterStemmer()

def preprocess_text(text):
    """
    Clean one review and return it as a space-separated string of stemmed tokens.

    Steps:
    1. Convert to lowercase
    2. Replace HTML tags and non-alphabetic characters with a space
    3. Tokenize on whitespace; drop stopwords and tokens of 2 characters or fewer
    4. Apply stemming

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as an empty review.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' when nothing remains).

    Notes
    -----
    Relies on the module-level ``stop_words`` set and ``stemmer`` object.
    """
    # A missing review arrives from pandas as NaN (a float); there is nothing to clean.
    if not isinstance(text, str):
        return ''

    # Step 1: Convert to lowercase
    text = text.lower()

    # Step 2a: Replace HTML tags (e.g. "<br />") with a space. The tag must start
    # with a letter so that plain comparisons such as "a < b" are left alone.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)

    # Step 2b: Replace every remaining non-letter with a SPACE rather than
    # deleting it. Deleting would glue neighbouring words together
    # ("great.it" -> "greatit") and turn "don't" into "dont", which is not a stopword.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Step 3: Tokenize; str.split() with no argument also collapses runs of whitespace
    tokens = text.split()

    # Remove stopwords and very short tokens
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    # Step 4: Apply stemming
    return ' '.join(stemmer.stem(word) for word in tokens)

# Smoke-test the preprocessing function on one short, punctuation-heavy review
sample_text = "This movie was AMAZING! I loved it so much. Best film ever!!!"
print(
    "Preprocessing function created successfully!",
    "\nExample preprocessing:",
    f"Original: {sample_text}",
    f"Processed: {preprocess_text(sample_text)}",
    sep="\n",
)

In [None]:
# Clean every review and store the result in a new column next to the raw text
print("Preprocessing all reviews...")
df_imdb['processed_review'] = df_imdb['review'].apply(preprocess_text)

print(
    "\n✓ Preprocessing completed!",
    "\nComparison of Original vs Processed Reviews:",
    "=" * 80,
    sep="\n",
)

# Show the first two reviews before and after cleaning (first 200 characters each)
for i in range(2):
    review_before = df_imdb['review'].iloc[i]
    review_after = df_imdb['processed_review'].iloc[i]
    print(
        f"\nExample {i+1}:",
        f"Original ({len(review_before)} chars):",
        review_before[:200] + "...",
        f"\nProcessed ({len(review_after)} chars):",
        review_after[:200] + "...",
        "-" * 80,
        sep="\n",
    )

### Step 2: Split Dataset (80% Training, 20% Testing)

In [None]:
# Prepare features and labels
X_imdb = df_imdb['processed_review']
y_imdb = df_imdb['sentiment'].map({'negative': 0, 'positive': 1})

# Series.map() silently produces NaN for any value missing from the mapping
# (different casing, stray whitespace, missing label). Fail here with a clear
# message instead of letting the NaN surface later as an obscure error.
if y_imdb.isna().any():
    unexpected = sorted(df_imdb.loc[y_imdb.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(
        f"Unexpected sentiment labels (expected 'negative' or 'positive'): {unexpected}"
    )

# Split dataset: 80% training, 20% testing.
# stratify keeps the class proportions equal in both parts; random_state fixes the shuffle.
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    X_imdb, y_imdb, test_size=0.2, random_state=42, stratify=y_imdb
)

print("Dataset Split Summary:")
print("=" * 60)
print(f"Total samples: {len(df_imdb)}")
print(f"Training samples: {len(X_train_imdb)} ({len(X_train_imdb)/len(df_imdb)*100:.1f}%)")
print(f"Testing samples: {len(X_test_imdb)} ({len(X_test_imdb)/len(df_imdb)*100:.1f}%)")

print("\nTraining Set Sentiment Distribution:")
print(y_train_imdb.value_counts())
print(f"\nTesting Set Sentiment Distribution:")
print(y_test_imdb.value_counts())

### Step 3: Implement Bag-of-Words Model and Train Naive Bayes Classifier

In [None]:
# Step 3a: Bag-of-Words features via CountVectorizer
print("Creating Bag-of-Words model...")

# The vocabulary is capped at 5000 terms and learned from the training split only;
# the test split is encoded with that same fitted vocabulary.
vectorizer = CountVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train_imdb)
X_test_bow = vectorizer.transform(X_test_imdb)

print(
    "\n✓ Bag-of-Words model created successfully!",
    "\nBag-of-Words Statistics:",
    "=" * 60,
    sep="\n",
)
# Density = stored non-zero entries divided by the total number of matrix cells
print(
    f"Vocabulary size: {len(vectorizer.vocabulary_)}",
    f"Training matrix shape: {X_train_bow.shape}",
    f"Testing matrix shape: {X_test_bow.shape}",
    f"Matrix density: {X_train_bow.nnz / (X_train_bow.shape[0] * X_train_bow.shape[1]) * 100:.2f}%",
    sep="\n",
)

In [None]:
# Step 3b: Fit a Multinomial Naive Bayes model on the bag-of-words counts
print("Training Naive Bayes classifier...")

# fit() returns the estimator itself, so construction and training can be chained
nb_classifier = MultinomialNB().fit(X_train_bow, y_train_imdb)

print(
    "\n✓ Naive Bayes classifier trained successfully!",
    "\nModel Parameters:",
    "=" * 60,
    sep="\n",
)
print(
    f"Number of classes: {len(nb_classifier.classes_)}",
    f"Classes: {nb_classifier.classes_}",
    f"Number of features: {nb_classifier.n_features_in_}",
    sep="\n",
)

## Part 2: Model Evaluation

In [None]:
# Score the held-out test set
print("Making predictions...")
# Column 1 of predict_proba is the probability of the second entry in classes_;
# these scores feed the ROC/AUC cells further down.
y_pred_proba_nb = nb_classifier.predict_proba(X_test_bow)[:, 1]
# Hard 0/1 label predictions used by the accuracy, precision/recall and confusion-matrix cells
y_pred_nb = nb_classifier.predict(X_test_bow)

print("✓ Predictions completed!")

### Evaluation Metric 1: Accuracy

In [None]:
# Fraction of test reviews whose predicted label equals the true label
accuracy_nb = accuracy_score(y_true=y_test_imdb, y_pred=y_pred_nb)

print("=" * 60, "ACCURACY", "=" * 60, sep="\n")
print(
    f"Accuracy Score: {accuracy_nb:.4f} ({accuracy_nb*100:.2f}%)",
    f"\nCorrectly classified: {(y_test_imdb == y_pred_nb).sum()} out of {len(y_test_imdb)}",
    f"Misclassified: {(y_test_imdb != y_pred_nb).sum()} out of {len(y_test_imdb)}",
    sep="\n",
)

### Evaluation Metric 2: Precision, Recall, and F1-Score

In [None]:
# Precision, recall and F1 computed with scikit-learn's binary defaults
precision_nb = precision_score(y_true=y_test_imdb, y_pred=y_pred_nb)
recall_nb = recall_score(y_true=y_test_imdb, y_pred=y_pred_nb)
f1_nb = f1_score(y_true=y_test_imdb, y_pred=y_pred_nb)

print("=" * 60, "PRECISION, RECALL, AND F1-SCORE", "=" * 60, sep="\n")
print(
    f"Precision: {precision_nb:.4f}",
    f"Recall:    {recall_nb:.4f}",
    f"F1-Score:  {f1_nb:.4f}",
    sep="\n",
)

# Per-class breakdown of the same metrics
print("\n" + "=" * 60, "DETAILED CLASSIFICATION REPORT", "=" * 60, sep="\n")
print(
    classification_report(
        y_test_imdb,
        y_pred_nb,
        digits=4,
        target_names=['Negative (0)', 'Positive (1)'],
    )
)

### Evaluation Metric 3: Confusion Matrix

In [None]:
# Confusion matrix: rows are actual classes, columns are predicted classes
cm_nb = confusion_matrix(y_true=y_test_imdb, y_pred=y_pred_nb)

print("=" * 60, "CONFUSION MATRIX", "=" * 60, sep="\n")
print(cm_nb)

# Name each of the four cells explicitly
print(
    "\nConfusion Matrix Breakdown:",
    f"True Negatives (TN):  {cm_nb[0, 0]}",
    f"False Positives (FP): {cm_nb[0, 1]}",
    f"False Negatives (FN): {cm_nb[1, 0]}",
    f"True Positives (TP):  {cm_nb[1, 1]}",
    sep="\n",
)

In [None]:
# Heatmap of the Naive Bayes confusion matrix with the raw counts annotated in each cell
plt.figure(figsize=(10, 8))

sns.heatmap(
    cm_nb,
    annot=True,
    fmt='d',
    cmap='Blues',
    annot_kws={'size': 16, 'weight': 'bold'},
    cbar_kws={'label': 'Count'},
    xticklabels=['Negative (0)', 'Positive (1)'],
    yticklabels=['Negative (0)', 'Positive (1)'],
)

# Columns are predictions, rows are ground truth
plt.xlabel('Predicted Sentiment', fontsize=13, fontweight='bold')
plt.ylabel('Actual Sentiment', fontsize=13, fontweight='bold')
plt.title('Confusion Matrix - Naive Bayes Classifier',
          fontsize=16, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()

### Evaluation Metric 4: ROC-AUC Score

In [None]:
# ROC-AUC is computed from the class-1 probabilities, not from the hard labels
roc_auc_nb = roc_auc_score(y_true=y_test_imdb, y_score=y_pred_proba_nb)

print("=" * 60, "ROC-AUC SCORE", "=" * 60, sep="\n")
print(f"ROC-AUC Score: {roc_auc_nb:.4f}")

In [None]:
# ROC curve: true-positive rate against false-positive rate across all thresholds
fpr, tpr, thresholds = roc_curve(y_test_imdb, y_pred_proba_nb)

plt.figure(figsize=(10, 8))

# Dashed diagonal is the reference line labelled as a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
         label='Random Classifier (AUC = 0.5)')
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc_nb:.4f})')

plt.title('ROC Curve - Naive Bayes Classifier', fontsize=16, fontweight='bold')
plt.xlabel('False Positive Rate', fontsize=13, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=13, fontweight='bold')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.grid(True, alpha=0.3)

# Handles are passed explicitly so the legend keeps the original entry order
# (model curve first, reference line second).
curve_handles = plt.gca().get_lines()
plt.legend(handles=[curve_handles[1], curve_handles[0]], loc="lower right", fontsize=12)
plt.tight_layout()
plt.show()

## Summary of Exercise 1

In [None]:
# One row per metric, built from (name, value) pairs
summary_ex1 = pd.DataFrame(
    [
        ('Accuracy', accuracy_nb),
        ('Precision', precision_nb),
        ('Recall', recall_nb),
        ('F1-Score', f1_nb),
        ('ROC-AUC', roc_auc_nb),
    ],
    columns=['Metric', 'Score'],
)

print("=" * 70, "EXERCISE 1 - FINAL PERFORMANCE SUMMARY", "=" * 70, sep="\n")
print(summary_ex1.to_string(index=False))

# Plain-language reading of the headline numbers
print("\n" + "=" * 70, "KEY INSIGHTS", "=" * 70, sep="\n")
print(
    f"• The Naive Bayes model achieved {accuracy_nb*100:.2f}% accuracy",
    f"• Precision: {precision_nb*100:.2f}% of positive predictions were correct",
    f"• Recall: {recall_nb*100:.2f}% of positive reviews were identified",
    f"• ROC-AUC: {roc_auc_nb:.4f} indicates strong discrimination ability",
    "=" * 70,
    sep="\n",
)

---
---

# Exercise 3: Feature Selection using Wrapper Methods

## Objective
Identify the most important features for predicting breast cancer diagnosis using the Breast Cancer Dataset. Apply the Recursive Feature Elimination (RFE) wrapper method to select the best features, then evaluate model performance.

---

## Part 1: Data Loading and Preprocessing

In [None]:
# Import additional libraries for Exercise 3
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Seed NumPy's global random generator so repeated runs give the same results
np.random.seed(42)

print(
    "="*70,
    "EXERCISE 3: FEATURE SELECTION WITH RFE",
    "="*70,
    "✓ Additional libraries imported successfully!",
    sep="\n",
)

### Step 1: Load the Dataset

In [None]:
# Read the breast cancer CSV (path is relative to the notebook's working directory)
df_cancer = pd.read_csv('Breast_Cancer_Dataset.csv')

# The feature count treats exactly one column as the target
print(
    "Dataset loaded successfully!",
    f"\nDataset Shape: {df_cancer.shape}",
    f"Number of samples: {df_cancer.shape[0]}",
    f"Number of features: {df_cancer.shape[1] - 1}",
    f"\nFirst 5 rows:",
    sep="\n",
)
display(df_cancer.head())

### Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Dataset Information
print("Dataset Information:")
print("=" * 70)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_cancer.info()

# Row/column counts; the feature count treats exactly one column as the target
print("\n" + "=" * 70)
print("Dataset Shape:")
print("=" * 70)
print(f"Rows: {df_cancer.shape[0]}")
print(f"Columns: {df_cancer.shape[1]}")
print(f"Features (excluding target): {df_cancer.shape[1] - 1}")

In [None]:
# Null count per column; report only the columns that actually contain gaps
print("Missing Values Analysis:", "=" * 70, sep="\n")
missing_values = df_cancer.isnull().sum()
# Counts are non-negative, so "any count is non-zero" is the same test as "total is non-zero"
if missing_values.any():
    print("Missing values per column:")
    print(missing_values[missing_values > 0])
else:
    print("✓ No missing values found in the dataset!")

In [None]:
# Target variable analysis
print("Target Variable Analysis (Diagnosis):")
print("=" * 70)
print(df_cancer['diagnosis'].value_counts())
print(f"\nClass Distribution:")
print(f"Malignant (0): {(df_cancer['diagnosis'] == 0).sum()} ({(df_cancer['diagnosis'] == 0).sum()/len(df_cancer)*100:.2f}%)")
print(f"Benign (1):    {(df_cancer['diagnosis'] == 1).sum()} ({(df_cancer['diagnosis'] == 1).sum()/len(df_cancer)*100:.2f}%)")

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders its result by frequency, not by label. The bar and pie
# labels below are hard-coded as [class 0, class 1], so pin the counts to that
# same order; otherwise the chart is mislabelled whenever class 1 is more frequent.
target_counts = df_cancer['diagnosis'].value_counts().reindex([0, 1], fill_value=0)

# Left panel: bar chart with the count written above each bar
axes[0].bar(['Malignant (0)', 'Benign (1)'], target_counts.values,
            color=['#ff6b6b', '#4ecdc4'], alpha=0.7, edgecolor='black')
axes[0].set_ylabel('Count', fontsize=12, fontweight='bold')
axes[0].set_title('Diagnosis Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3, axis='y')
for i, v in enumerate(target_counts.values):
    axes[0].text(i, v, str(v), ha='center', va='bottom', fontweight='bold')

# Right panel: the same counts as percentage shares
axes[1].pie(target_counts.values, labels=['Malignant (0)', 'Benign (1)'],
            autopct='%1.1f%%', startangle=90, colors=['#ff6b6b', '#4ecdc4'])
axes[1].set_title('Diagnosis Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics for the numeric columns, as produced by DataFrame.describe()
print("Statistical Summary of Features:", "=" * 70, sep="\n")
display(df_cancer.describe())

### Step 3: Split Dataset (80% Training, 20% Testing)

In [None]:
# Target is the 'diagnosis' column; every other column is used as a feature
y_cancer = df_cancer['diagnosis']
X_cancer = df_cancer.drop(columns='diagnosis')

# Stratified 80/20 split with a fixed seed so class proportions match in both parts
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.2,
    random_state=42,
    stratify=y_cancer,
)

print("Dataset Split Summary:", "=" * 70, sep="\n")
print(
    f"Total samples: {len(df_cancer)}",
    f"Number of features: {X_cancer.shape[1]}",
    f"\nTraining samples: {len(X_train_cancer)} ({len(X_train_cancer)/len(df_cancer)*100:.1f}%)",
    f"Testing samples:  {len(X_test_cancer)} ({len(X_test_cancer)/len(df_cancer)*100:.1f}%)",
    sep="\n",
)

# Class balance inside the training part
print(
    "\nTraining Set Class Distribution:",
    f"Malignant (0): {(y_train_cancer == 0).sum()} ({(y_train_cancer == 0).sum()/len(y_train_cancer)*100:.1f}%)",
    f"Benign (1):    {(y_train_cancer == 1).sum()} ({(y_train_cancer == 1).sum()/len(y_train_cancer)*100:.1f}%)",
    sep="\n",
)

In [None]:
# Standardise the features before fitting Logistic Regression
print("Applying Feature Scaling...", "=" * 70, sep="\n")

# Scaling statistics are learned from the training split only and then applied
# unchanged to both splits, so nothing from the test set leaks into the scaler.
scaler = StandardScaler().fit(X_train_cancer)
X_train_cancer_scaled = scaler.transform(X_train_cancer)
X_test_cancer_scaled = scaler.transform(X_test_cancer)

print(
    "✓ Features scaled successfully using StandardScaler!",
    f"\nScaled training data shape: {X_train_cancer_scaled.shape}",
    f"Scaled testing data shape:  {X_test_cancer_scaled.shape}",
    sep="\n",
)

## Part 2: Apply Wrapper Method (RFE)

### Step 1: Recursive Feature Elimination (RFE) - Select Top 5 Features

In [None]:
# Recursive Feature Elimination wrapped around a Logistic Regression estimator
print("Applying Recursive Feature Elimination (RFE)...", "=" * 70, sep="\n")

# Number of features RFE should keep
n_features_to_select = 5

# Estimator that RFE refits while it eliminates features
base_estimator = LogisticRegression(max_iter=10000, random_state=42)

# fit() returns the selector itself, so construction and fitting can be chained
rfe = RFE(
    estimator=base_estimator,
    n_features_to_select=n_features_to_select,
).fit(X_train_cancer_scaled, y_train_cancer)

print(
    f"✓ RFE completed successfully!",
    f"\nNumber of features selected: {n_features_to_select}",
    f"Total features: {X_train_cancer.shape[1]}",
    sep="\n",
)

In [None]:
# Pair each column name with its RFE rank and its selected/not-selected flag
feature_names = list(X_cancer.columns)
feature_ranking = pd.DataFrame(
    {
        'Feature': feature_names,
        'Ranking': rfe.ranking_,
        'Selected': rfe.support_,
    }
).sort_values(by='Ranking')

print("Feature Ranking by RFE:", "=" * 70, sep="\n")
print(feature_ranking.to_string(index=False))

# 'Selected' is already a boolean column, so it can be used directly as a row mask
selected_features = feature_ranking.loc[feature_ranking['Selected'], 'Feature'].tolist()
print(f"\n✓ Top {n_features_to_select} Selected Features:", "=" * 70, sep="\n")
for i, feat in enumerate(selected_features, start=1):
    print(f"{i}. {feat}")

### Step 2: Visualize Feature Rankings

In [None]:
# Horizontal bar chart of RFE ranks: green = kept by RFE, red = eliminated
from matplotlib.patches import Patch

plt.figure(figsize=(12, 10))

# One colour per row, in the same (rank-sorted) order as the bars
colors = ['green' if selected else 'red' for selected in feature_ranking['Selected']]
plt.barh(feature_ranking['Feature'], feature_ranking['Ranking'], color=colors, alpha=0.7)

plt.title('RFE Feature Ranking', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Ranking (1 = Most Important)', fontsize=12, fontweight='bold')
plt.ylabel('Features', fontsize=12, fontweight='bold')
# Flip the y-axis so the first rows (best ranks) appear at the top
plt.gca().invert_yaxis()

# Legend built from proxy patches because the bars themselves carry no labels
legend_elements = [
    Patch(facecolor='green', alpha=0.7, label='Selected'),
    Patch(facecolor='red', alpha=0.7, label='Not Selected'),
]
plt.legend(handles=legend_elements, loc='lower right', fontsize=11)

plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

### Step 3: Train Model with Selected Features

In [None]:
# Keep only the columns flagged by the RFE support mask
X_train_selected = X_train_cancer_scaled[:, rfe.support_]
X_test_selected = X_test_cancer_scaled[:, rfe.support_]

print(f"Training model with {n_features_to_select} selected features...", "=" * 70, sep="\n")
print(
    f"Selected features shape (train): {X_train_selected.shape}",
    f"Selected features shape (test):  {X_test_selected.shape}",
    sep="\n",
)

# Fresh Logistic Regression fitted on the reduced feature set
lr_selected = LogisticRegression(max_iter=10000, random_state=42).fit(
    X_train_selected, y_train_cancer
)

print("\n✓ Model trained successfully with selected features!")

## Part 3: Model Evaluation

In [None]:
# Test-set predictions from the reduced-feature model: hard labels and class-1 probabilities
y_pred_selected = lr_selected.predict(X_test_selected)
y_pred_proba_selected = lr_selected.predict_proba(X_test_selected)[:, 1]

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities
acc_selected = accuracy_score(y_true=y_test_cancer, y_pred=y_pred_selected)
prec_selected = precision_score(y_true=y_test_cancer, y_pred=y_pred_selected)
rec_selected = recall_score(y_true=y_test_cancer, y_pred=y_pred_selected)
f1_selected = f1_score(y_true=y_test_cancer, y_pred=y_pred_selected)
roc_selected = roc_auc_score(y_true=y_test_cancer, y_score=y_pred_proba_selected)

print(
    "=" * 70,
    f"MODEL EVALUATION - SELECTED FEATURES ({n_features_to_select})",
    "=" * 70,
    sep="\n",
)
print(
    f"Accuracy:  {acc_selected:.4f}",
    f"Precision: {prec_selected:.4f}",
    f"Recall:    {rec_selected:.4f}",
    f"F1-Score:  {f1_selected:.4f}",
    f"ROC-AUC:   {roc_selected:.4f}",
    sep="\n",
)

# Per-class breakdown
print("\n" + "=" * 70, "DETAILED CLASSIFICATION REPORT - SELECTED FEATURES", "=" * 70, sep="\n")
print(
    classification_report(
        y_test_cancer,
        y_pred_selected,
        digits=4,
        target_names=['Malignant (0)', 'Benign (1)'],
    )
)

In [None]:
# Confusion matrix of the reduced-feature model (rows = actual, columns = predicted)
cm_selected = confusion_matrix(y_true=y_test_cancer, y_pred=y_pred_selected)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_selected,
    annot=True,
    fmt='d',
    cmap='Blues',
    annot_kws={'size': 14, 'weight': 'bold'},
    xticklabels=['Malignant (0)', 'Benign (1)'],
    yticklabels=['Malignant (0)', 'Benign (1)'],
)
plt.xlabel('Predicted', fontsize=12, fontweight='bold')
plt.ylabel('Actual', fontsize=12, fontweight='bold')
plt.title(f'Confusion Matrix - Selected Features ({n_features_to_select})',
          fontsize=14, fontweight='bold', pad=15)
plt.tight_layout()
plt.show()

### Compare with Model Trained on All Features

In [None]:
# Baseline for comparison: the same model configuration fitted on every scaled feature
print("Training model with ALL features...", "=" * 70, sep="\n")

lr_all = LogisticRegression(max_iter=10000, random_state=42).fit(
    X_train_cancer_scaled, y_train_cancer
)

# Hard labels and class-1 probabilities on the test split
y_pred_all = lr_all.predict(X_test_cancer_scaled)
y_pred_proba_all = lr_all.predict_proba(X_test_cancer_scaled)[:, 1]

# Same five metrics as computed for the reduced-feature model
acc_all = accuracy_score(y_true=y_test_cancer, y_pred=y_pred_all)
prec_all = precision_score(y_true=y_test_cancer, y_pred=y_pred_all)
rec_all = recall_score(y_true=y_test_cancer, y_pred=y_pred_all)
f1_all = f1_score(y_true=y_test_cancer, y_pred=y_pred_all)
roc_all = roc_auc_score(y_true=y_test_cancer, y_score=y_pred_proba_all)

print(
    "✓ Model trained successfully with all features!",
    f"\n{'='*70}",
    f"MODEL EVALUATION - ALL FEATURES ({X_train_cancer.shape[1]})",
    "=" * 70,
    sep="\n",
)
print(
    f"Accuracy:  {acc_all:.4f}",
    f"Precision: {prec_all:.4f}",
    f"Recall:    {rec_all:.4f}",
    f"F1-Score:  {f1_all:.4f}",
    f"ROC-AUC:   {roc_all:.4f}",
    sep="\n",
)

### Performance Comparison

In [None]:
# Side-by-side metric table; the column headers embed the two feature counts
selected_col = f'Selected ({n_features_to_select})'
all_col = f'All ({X_train_cancer.shape[1]})'

comparison = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
    selected_col: [acc_selected, prec_selected, rec_selected, f1_selected, roc_selected],
    all_col: [acc_all, prec_all, rec_all, f1_all, roc_all],
})
# Positive difference means the reduced-feature model scored higher on that metric
comparison['Difference'] = comparison[selected_col] - comparison[all_col]

print("=" * 80, "PERFORMANCE COMPARISON: SELECTED vs ALL FEATURES", "=" * 80, sep="\n")
print(comparison.to_string(index=False))

# Verdict is based on the mean of the five per-metric differences
print("\n" + "=" * 80, "INTERPRETATION:", "=" * 80, sep="\n")
print(
    "✓ Selected features perform BETTER or EQUAL to all features on average"
    if comparison['Difference'].mean() >= 0
    else "⚠ All features perform slightly better than selected features"
)

print(
    f"\nFeature Reduction: {X_train_cancer.shape[1]} → {n_features_to_select} features",
    f"Reduction Rate: {(1 - n_features_to_select/X_train_cancer.shape[1])*100:.1f}%",
    sep="\n",
)

In [None]:
# Left: grouped bars per metric for both models. Right: per-metric difference (selected - all).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# One group per metric; each group holds two bars offset by half a bar width
metrics = comparison['Metric'].tolist()
x = np.arange(len(metrics))
width = 0.35

bars1 = axes[0].bar(
    x - width/2,
    comparison[f'Selected ({n_features_to_select})'],
    width,
    label=f'Selected ({n_features_to_select})',
    color='#3498db', alpha=0.8, edgecolor='black',
)
bars2 = axes[0].bar(
    x + width/2,
    comparison[f'All ({X_train_cancer.shape[1]})'],
    width,
    label=f'All ({X_train_cancer.shape[1]})',
    color='#e74c3c', alpha=0.8, edgecolor='black',
)

axes[0].set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Score', fontsize=12, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(metrics)
axes[0].set_ylim([0, 1.1])
axes[0].grid(True, alpha=0.3, axis='y')
axes[0].legend(fontsize=11)

# Green where the reduced-feature model is at least as good, red where it is worse
colors_diff = ['green' if diff >= 0 else 'red' for diff in comparison['Difference']]
bars = axes[1].bar(metrics, comparison['Difference'], color=colors_diff, alpha=0.7, edgecolor='black')
axes[1].set_title('Performance Difference', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Difference (Selected - All)', fontsize=12, fontweight='bold')
# Zero line separates improvements from regressions
axes[1].axhline(y=0, color='black', linestyle='-', linewidth=1)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## Part 4: Experiment with Different Numbers of Features

In [None]:
# Repeat the select -> train -> evaluate pipeline for several feature budgets
print("Experimenting with different numbers of selected features...", "=" * 70, sep="\n")

feature_counts = [3, 5, 7, 10, 15, 20]
results = []

for n_features in feature_counts:
    print(f"Testing with {n_features} features...")

    # Fit RFE for this budget; fit() returns the selector itself
    rfe_temp = RFE(
        LogisticRegression(max_iter=10000, random_state=42),
        n_features_to_select=n_features,
    ).fit(X_train_cancer_scaled, y_train_cancer)

    # Keep only the columns RFE retained
    X_train_temp = X_train_cancer_scaled[:, rfe_temp.support_]
    X_test_temp = X_test_cancer_scaled[:, rfe_temp.support_]

    # Fresh model on the reduced feature set
    lr_temp = LogisticRegression(max_iter=10000, random_state=42).fit(
        X_train_temp, y_train_cancer
    )

    # Hard labels for the label-based metrics, class-1 probabilities for ROC-AUC
    y_pred_temp = lr_temp.predict(X_test_temp)
    y_pred_proba_temp = lr_temp.predict_proba(X_test_temp)[:, 1]

    # One record per feature budget
    results.append(dict([
        ('Features', n_features),
        ('Accuracy', accuracy_score(y_test_cancer, y_pred_temp)),
        ('Precision', precision_score(y_test_cancer, y_pred_temp)),
        ('Recall', recall_score(y_test_cancer, y_pred_temp)),
        ('F1-Score', f1_score(y_test_cancer, y_pred_temp)),
        ('ROC-AUC', roc_auc_score(y_test_cancer, y_pred_proba_temp)),
    ]))

# The records are turned into a frame once, after the loop
results_df = pd.DataFrame(results)

print("\n" + "=" * 70, "EXPERIMENT RESULTS: DIFFERENT FEATURE COUNTS", "=" * 70, sep="\n")
print(results_df.to_string(index=False))

In [None]:
# One line chart per metric on a 2x3 grid; the sixth panel stays empty
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
fig.suptitle('Model Performance vs Number of Features', fontsize=16, fontweight='bold')

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
colors_plot = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']

for idx, metric in enumerate(metrics_to_plot):
    color = colors_plot[idx]
    # Fill the grid row by row: idx 0-2 go to the first row, 3-4 to the second
    row, col = divmod(idx, 3)
    panel = axes[row, col]

    panel.plot(results_df['Features'], results_df[metric],
               marker='o', color=color, linewidth=2.5, markersize=10)
    panel.set_title(f'{metric} vs Features', fontsize=12, fontweight='bold')
    panel.set_xlabel('Number of Features', fontsize=11, fontweight='bold')
    panel.set_ylabel(metric, fontsize=11, fontweight='bold')
    panel.set_xticks(results_df['Features'])
    panel.grid(True, alpha=0.3)

# Five metrics on six panels: blank out the unused bottom-right one
axes[1, 2].axis('off')

plt.tight_layout()
plt.show()

## Feature selection

In [None]:
# Row of the experiment table with the highest F1 (idxmax returns the first such row on ties)
optimal_idx = results_df['F1-Score'].idxmax()
optimal_features = results_df.loc[optimal_idx]

print("=" * 80, "DISCUSSION: IMPACT OF FEATURE SELECTION", "=" * 80, sep="\n")

print("\n1. OPTIMAL FEATURE COUNT (Based on F1-Score):", "-" * 80, sep="\n")
# int() is applied because the row values may come back as floats
print(
    f"   Optimal number of features: {int(optimal_features['Features'])}",
    f"   F1-Score: {optimal_features['F1-Score']:.4f}",
    sep="\n",
)

print("\n2. BENEFITS OF FEATURE SELECTION:", "-" * 80, sep="\n")
print(
    "   ✓ Reduced model complexity",
    "   ✓ Faster training and prediction times",
    "   ✓ Improved model interpretability",
    "   ✓ Reduced risk of overfitting",
    "   ✓ Better generalization to new data",
    sep="\n",
)

print("\n3. KEY OBSERVATIONS:", "-" * 80, sep="\n")
# Rows for the smallest and largest feature budgets that were tried
min_features = results_df.loc[results_df['Features'].idxmin()]
max_features = results_df.loc[results_df['Features'].idxmax()]
print(
    f"   • With {int(min_features['Features'])} features: F1 = {min_features['F1-Score']:.4f}",
    f"   • With {int(max_features['Features'])} features: F1 = {max_features['F1-Score']:.4f}",
    f"   • Feature reduction of {(1 - n_features_to_select/X_train_cancer.shape[1])*100:.1f}% maintains strong performance",
    sep="\n",
)

print("\n4. CONCLUSION:", "-" * 80, sep="\n")
print(
    "   Feature selection using RFE effectively identifies the most important features",
    "   while maintaining or even improving model performance. The wrapper method",
    "   successfully balances model complexity and predictive accuracy.",
    "=" * 80,
    sep="\n",
)



# Summary of Both Exercises


In [None]:
# Worksheet-level recap; it only re-prints values computed in earlier cells
print("="*80, "WORKSHEET 10 - FINAL SUMMARY", "="*80, sep="\n")

# Exercise 1: sentiment classification results
print("\n" + "="*80, "EXERCISE 1: IMDB SENTIMENT ANALYSIS (NAIVE BAYES)", "="*80, sep="\n")
print(
    f"Dataset: {len(df_imdb)} movie reviews",
    f"Model: Multinomial Naive Bayes with Bag-of-Words",
    f"\nPerformance:",
    f"  • Accuracy:  {accuracy_nb:.4f} ({accuracy_nb*100:.2f}%)",
    f"  • Precision: {precision_nb:.4f}",
    f"  • Recall:    {recall_nb:.4f}",
    f"  • F1-Score:  {f1_nb:.4f}",
    f"  • ROC-AUC:   {roc_auc_nb:.4f}",
    sep="\n",
)

# Exercise 3: feature-selection results, reduced model first and then the full-feature baseline
print("\n" + "="*80, "EXERCISE 3: BREAST CANCER FEATURE SELECTION (RFE)", "="*80, sep="\n")
print(
    f"Dataset: {len(df_cancer)} samples with {X_cancer.shape[1]} features",
    f"Model: Logistic Regression with RFE",
    f"Feature Reduction: {X_cancer.shape[1]} → {n_features_to_select} features ({(1-n_features_to_select/X_cancer.shape[1])*100:.1f}% reduction)",
    f"\nPerformance with {n_features_to_select} selected features:",
    f"  • Accuracy:  {acc_selected:.4f} ({acc_selected*100:.2f}%)",
    f"  • Precision: {prec_selected:.4f}",
    f"  • Recall:    {rec_selected:.4f}",
    f"  • F1-Score:  {f1_selected:.4f}",
    f"  • ROC-AUC:   {roc_selected:.4f}",
    sep="\n",
)

print(
    f"\nPerformance with all {X_cancer.shape[1]} features:",
    f"  • Accuracy:  {acc_all:.4f} ({acc_all*100:.2f}%)",
    f"  • F1-Score:  {f1_all:.4f}",
    sep="\n",
)

# Closing takeaways
print("\n" + "="*80, "KEY LEARNINGS", "="*80, sep="\n")
print(
    "1. Naive Bayes is effective for text classification tasks",
    "2. Proper text preprocessing improves model performance",
    "3. Feature selection (RFE) reduces complexity while maintaining accuracy",
    "4. Fewer features lead to more interpretable and efficient models",
    "5. Multiple evaluation metrics provide comprehensive performance insights",
    "="*80,
    sep="\n",
)