# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load dataset for demonstration
data = load_iris()
X, y = data.data, data.target
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define and train a model for confusion matrix example
# The forest is seeded as well: bootstrap sampling and feature selection are
# random, so an unseeded model can yield a different confusion matrix on every
# run even though the train/test split itself is fixed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predictions on the held-out set; consumed by the confusion-matrix plot below.
y_pred = model.predict(X_test)

# Q1: Purpose and Working of Grid Search CV
# Assembled line by line; the empty first/last entries reproduce the leading
# and trailing newline of the printed text.
grid_search_cv_explanation = "\n".join((
    "",
    "**Purpose**: Grid Search Cross-Validation (CV) is used to find the best combination of hyperparameters for a given model by evaluating different parameter values through cross-validation.",
    "",
    "**How it Works**:",
    "1. **Define Parameter Grid**: Specify a grid of hyperparameters to search through.",
    "2. **Cross-Validation**: For each combination of parameters, the model is evaluated using cross-validation.",
    "3. **Select Best Model**: The combination of parameters that yields the best performance is selected based on a chosen metric (e.g., accuracy, F1-score).",
    "",
))

# Q2: Difference Between Grid Search CV and Randomized Search CV
# Three paragraphs (grid search, randomized search, guidance) separated by
# blank entries; joined with newlines into the final printed text.
search_comparison = "\n".join((
    "",
    "**Grid Search CV**:",
    "- **Description**: Exhaustively searches through a specified set of hyperparameters.",
    "- **Pros**: Thorough and guarantees finding the best combination within the grid.",
    "- **Cons**: Computationally expensive and time-consuming, especially with a large parameter grid.",
    "",
    "**Randomized Search CV**:",
    "- **Description**: Randomly samples a subset of hyperparameters from a specified distribution.",
    "- **Pros**: More efficient and faster, especially when the parameter space is large.",
    "- **Cons**: May not find the optimal combination if the sample size is small.",
    "",
    "**When to Choose**:",
    "- **Grid Search CV**: When you have a small parameter space and need a thorough search.",
    "- **Randomized Search CV**: When you have a large parameter space or limited computational resources.",
    "",
))

# Q3: Data Leakage
# Definition, example and consequence, one tuple entry per output line.
data_leakage_explanation = "\n".join((
    "",
    "**Data Leakage**:",
    "- **Definition**: Data leakage occurs when information from outside the training dataset is used to create the model, leading to overly optimistic performance estimates.",
    "",
    "**Example**:",
    "- If a feature that includes information about the target variable (e.g., future values) is used in training, the model may perform well during training but poorly on unseen data.",
    "",
    "**Why It Is a Problem**:",
    "- Leads to overfitting and inaccurate performance metrics.",
    "",
))

# Q4: Preventing Data Leakage
# Numbered checklist; each tuple entry is one line of the printed answer.
data_leakage_prevention = "\n".join((
    "",
    "**Preventing Data Leakage**:",
    "1. **Proper Data Splitting**: Ensure that the training and test datasets are split properly and independently.",
    "2. **Feature Engineering**: Avoid using information from the future or target-related features in the training phase.",
    "3. **Cross-Validation**: Use cross-validation properly to ensure the model is tested on unseen data.",
    "4. **Pipeline**: Use pipelines to ensure that all preprocessing steps are applied correctly and consistently.",
    "",
))

# Q5: Confusion Matrix
# Definition, the four cell types, and what the table reveals; blank entries
# mark the paragraph breaks in the printed text.
confusion_matrix_explanation = "\n".join((
    "",
    "**Confusion Matrix**:",
    "- **Definition**: A table that is used to evaluate the performance of a classification model by comparing predicted labels to true labels.",
    "",
    "**Components**:",
    "- **True Positive (TP)**: Correctly predicted positive instances.",
    "- **True Negative (TN)**: Correctly predicted negative instances.",
    "- **False Positive (FP)**: Incorrectly predicted positive instances.",
    "- **False Negative (FN)**: Incorrectly predicted negative instances.",
    "",
    "**What It Tells You**:",
    "- Provides insight into the types of errors the model is making and how well it is performing across different classes.",
    "",
))

# Q6: Precision vs. Recall
# One tuple entry per printed line. The precision definition keeps its
# trailing space so the emitted text stays byte-identical.
precision_recall_explanation = "\n".join((
    "",
    "**Precision**:",
    "- **Definition**: The ratio of correctly predicted positive observations to the total predicted positives. ",
    "- **Formula**: Precision = TP / (TP + FP)",
    "",
    "**Recall**:",
    "- **Definition**: The ratio of correctly predicted positive observations to all observations in the actual class.",
    "- **Formula**: Recall = TP / (TP + FN)",
    "",
    "**Difference**:",
    "- **Precision** focuses on the accuracy of positive predictions, while **Recall** focuses on the model's ability to identify all relevant instances.",
    "",
))

# Q7: Interpreting Confusion Matrix
# Cell meanings followed by guidance on reading the error counts.
confusion_matrix_interpretation = "\n".join((
    "",
    "**Interpreting Confusion Matrix**:",
    "- **True Positives (TP)**: The number of correctly identified positives.",
    "- **True Negatives (TN)**: The number of correctly identified negatives.",
    "- **False Positives (FP)**: The number of incorrectly identified positives (Type I error).",
    "- **False Negatives (FN)**: The number of incorrectly identified negatives (Type II error).",
    "",
    "**Determining Errors**:",
    "- High FP indicates many false positives, suggesting the model is too aggressive.",
    "- High FN indicates many false negatives, suggesting the model is missing relevant instances.",
    "",
))

# Q8: Metrics from Confusion Matrix
# The four standard formulas plus a closing note, one entry per output line.
metrics_from_confusion_matrix = "\n".join((
    "",
    "**Common Metrics**:",
    "1. **Accuracy**: (TP + TN) / (TP + TN + FP + FN)",
    "2. **Precision**: TP / (TP + FP)",
    "3. **Recall**: TP / (TP + FN)",
    "4. **F1 Score**: 2 * (Precision * Recall) / (Precision + Recall)",
    "",
    "**Calculation**:",
    "- These metrics provide a summary of model performance and can be derived from the values in the confusion matrix.",
    "",
))

# Q9: Accuracy and Confusion Matrix
# Short answer: the accuracy formula and its caveat on imbalanced data.
accuracy_confusion_matrix_relationship = "\n".join((
    "",
    "**Relationship**:",
    "- **Accuracy**: (TP + TN) / (TP + TN + FP + FN)",
    "- Reflects the overall correctness of the model. However, accuracy can be misleading in imbalanced datasets.",
    "",
))

# Q10: Using Confusion Matrix to Identify Biases
# Two indicators of bias followed by a worked example paragraph.
confusion_matrix_bias_identification = "\n".join((
    "",
    "**Identifying Biases**:",
    "- **Class Imbalance**: A high number of FP or FN can indicate a bias towards a certain class.",
    "- **Performance Across Classes**: Look at the performance metrics for each class to determine if the model is biased towards certain classes.",
    "",
    "**Example**:",
    "- If the model is performing well for one class but poorly for another, this may indicate that the model needs more balanced training data or improved feature engineering.",
    "",
))

# Display confusion matrix
# Take the class names and label order from the dataset itself, so the matrix
# rows/columns and the tick labels cannot drift apart (previously the label
# list was hard-coded as [0, 1, 2], independent of data.target_names).
classes = data.target_names
tick_marks = np.arange(len(classes))
cm = confusion_matrix(y_test, y_pred, labels=tick_marks)

# Draw on an explicit Figure/Axes pair rather than pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix')
fig.colorbar(im, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes, rotation=45)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

# Write the raw count into every cell; colour alone does not convey the
# numbers. Text switches to white on dark cells so it stays legible.
threshold = cm.max() / 2.0
for row, col in np.ndindex(cm.shape):
    ax.text(
        col, row, format(cm[row, col], 'd'),
        ha='center', va='center',
        color='white' if cm[row, col] > threshold else 'black',
    )
fig.tight_layout()

# Display results
# Heading/body pairs in presentation order; printed by the loop below instead
# of ten copy-pasted print pairs.
qa_sections = (
    ("Q1: Purpose and Working of Grid Search CV", grid_search_cv_explanation),
    ("Q2: Difference Between Grid Search CV and Randomized Search CV", search_comparison),
    ("Q3: Data Leakage", data_leakage_explanation),
    ("Q4: Preventing Data Leakage", data_leakage_prevention),
    ("Q5: Confusion Matrix", confusion_matrix_explanation),
    ("Q6: Precision vs. Recall", precision_recall_explanation),
    ("Q7: Interpreting Confusion Matrix", confusion_matrix_interpretation),
    ("Q8: Metrics from Confusion Matrix", metrics_from_confusion_matrix),
    ("Q9: Accuracy and Confusion Matrix", accuracy_confusion_matrix_relationship),
    ("Q10: Using Confusion Matrix to Identify Biases", confusion_matrix_bias_identification),
)

for position, (heading, answer) in enumerate(qa_sections):
    # Every heading except the first is preceded by a blank separator line.
    separator = "" if position == 0 else "\n"
    print(separator + heading)
    print(answer)
