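# Dimensionality Reduction and Model Evaluation with the Iris Dataset

### Step 1: Data Preparation
We load the Iris dataset, split it into training and testing sets (70% / 30%), and standardize the features using statistics computed on the training set only.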


In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the Iris bunch and pull out the feature matrix and the class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test set never influences preprocessing
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Data prepared and standardized.')

Data prepared and standardized.


### Step 2: Model Training & Evaluation (Without Dimensionality Reduction)
In this section, we will train and evaluate multiple machine learning models without applying any dimensionality reduction techniques.

In [2]:
# Import necessary libraries for models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Define models to train.
# The tree-based estimators consume random numbers while fitting (feature
# permutation at each split, bootstrap sampling for the forest), so they are
# seeded explicitly; otherwise the scores change between kernel restarts.
RANDOM_STATE = 42  # same seed value as the train/test split
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC()
}

# Function to train and evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The estimator passed in is fitted in place (its ``fit`` method is called
    directly on the given object), then used to predict ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)`` where
        precision, recall and F1 are computed with ``average='weighted'``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    overall_accuracy = accuracy_score(y_test, predictions)

    # Precision, recall and F1 share the same call signature, so score them together
    weighted_precision, weighted_recall, weighted_f1 = [
        metric(y_test, predictions, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]

    return (
        overall_accuracy,
        weighted_precision,
        weighted_recall,
        weighted_f1,
        confusion_matrix(y_test, predictions),
    )

# Score every model on the standardized, full-dimensional features
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'Confusion Matrix')
results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test)
    # evaluate_model returns its values in the same order as metric_labels
    results[name] = dict(zip(metric_labels, scores))

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
results_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,Confusion Matrix
Logistic Regression,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"
Decision Tree,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"
Random Forest,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"
SVM,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"


### Step 3: Dimensionality Reduction with PCA
Now, we will apply PCA (Principal Component Analysis) to reduce the feature space to two dimensions and evaluate the model performance again.

In [3]:
# Import PCA
from sklearn.decomposition import PCA

# Project the standardized features onto the first two principal components.
# The projection is learned from the training split and reused for the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Score every model on the PCA-reduced features
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'Confusion Matrix')
results_pca = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train_pca, X_test_pca, y_train, y_test)
    # evaluate_model returns its values in the same order as metric_labels
    results_pca[name] = dict(zip(metric_labels, scores))

# One row per model, one column per metric
results_pca_df = pd.DataFrame(results_pca).transpose()
results_pca_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,Confusion Matrix
Logistic Regression,0.911111,0.91596,0.911111,0.910582,"[[19, 0, 0], [0, 10, 3], [0, 1, 12]]"
Decision Tree,0.933333,0.934656,0.933333,0.933235,"[[19, 0, 0], [0, 11, 2], [0, 1, 12]]"
Random Forest,0.955556,0.961481,0.955556,0.955291,"[[19, 0, 0], [0, 11, 2], [0, 0, 13]]"
SVM,0.933333,0.934656,0.933333,0.933235,"[[19, 0, 0], [0, 11, 2], [0, 1, 12]]"


### Step 4: Dimensionality Reduction with LDA
Next, we will apply LDA (Linear Discriminant Analysis), a supervised technique that uses the class labels, to reduce the feature space to two dimensions and evaluate the models again.

In [4]:
# Import LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the standardized features onto two linear discriminants.
# Unlike PCA, fitting LDA requires the training labels.
lda = LinearDiscriminantAnalysis(n_components=2)
X_train_lda = lda.fit_transform(X_train_scaled, y_train)
X_test_lda = lda.transform(X_test_scaled)

# Score every model on the LDA-reduced features
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'Confusion Matrix')
results_lda = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train_lda, X_test_lda, y_train, y_test)
    # evaluate_model returns its values in the same order as metric_labels
    results_lda[name] = dict(zip(metric_labels, scores))

# One row per model, one column per metric
results_lda_df = pd.DataFrame(results_lda).transpose()
results_lda_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,Confusion Matrix
Logistic Regression,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"
Decision Tree,0.955556,0.955556,0.955556,0.955556,"[[19, 0, 0], [0, 12, 1], [0, 1, 12]]"
Random Forest,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"
SVM,1.0,1.0,1.0,1.0,"[[19, 0, 0], [0, 13, 0], [0, 0, 13]]"


### Step 5: Comparison and Analysis
Now, we will compare the accuracy of the models across three scenarios:
1. Without dimensionality reduction
2. With PCA
3. With LDA

In [5]:
# Line up the accuracy of each model under the three feature representations
frames_by_scenario = {
    'Without Dimensionality Reduction': results_df,
    'With PCA': results_pca_df,
    'With LDA': results_lda_df,
}
comparison_df = pd.DataFrame(
    {scenario: frame['Accuracy'] for scenario, frame in frames_by_scenario.items()}
)

comparison_df

Unnamed: 0,Without Dimensionality Reduction,With PCA,With LDA
Logistic Regression,1.0,0.911111,1.0
Decision Tree,1.0,0.933333,0.955556
Random Forest,1.0,0.955556,1.0
SVM,1.0,0.933333,1.0
