# In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Q1 answer text: how Linear Regression and Logistic Regression differ.
# Markdown-formatted; assembled one source literal per output line so the
# leading and trailing newlines of the final string are explicit.
linear_vs_logistic_regression = (
    "\n"
    "**Linear Regression**:\n"
    "- **Purpose**: Predicts a continuous dependent variable based on one or more independent variables.\n"
    "- **Output**: Continuous values.\n"
    "- **Example**: Predicting house prices based on features like size, location, and number of rooms.\n"
    "\n"
    "**Logistic Regression**:\n"
    "- **Purpose**: Predicts the probability of a binary outcome (class) based on one or more independent variables.\n"
    "- **Output**: Probability between 0 and 1, which is used to classify into one of two classes.\n"
    "- **Example**: Predicting whether a customer will buy a product (Yes/No) based on their age, income, and browsing history.\n"
)

# Q2 answer text: the logistic-regression cost function and how it is optimized.
# Markdown-formatted; one source literal per output line.
logistic_cost_function = (
    "\n"
    "**Cost Function**:\n"
    "- **Cost Function**: Logistic Regression uses the **Log-Loss** or **Binary Cross-Entropy** as its cost function.\n"
    "- **Formula**: Cost = - (y * log(p) + (1 - y) * log(1 - p)), where y is the actual label and p is the predicted probability.\n"
    "\n"
    "**Optimization**:\n"
    "- **Optimization Technique**: The cost function is minimized using optimization algorithms such as Gradient Descent.\n"
    "- **Gradient Descent**: Iteratively updates the model parameters to find the minimum cost.\n"
)

# Q3 answer text: what regularization does in logistic regression.
# Markdown-formatted; the two-space indents on the L1/L2 bullets are part of
# the string and mark them as a nested list.
regularization_logistic_regression = (
    "\n"
    "**Regularization**:\n"
    "- **Purpose**: To prevent overfitting by adding a penalty to the cost function for large coefficients.\n"
    "- **Types**:\n"
    "  - **L1 Regularization (Lasso)**: Adds the absolute value of the coefficients to the cost function. Helps with feature selection.\n"
    "  - **L2 Regularization (Ridge)**: Adds the squared value of the coefficients to the cost function. Helps to shrink the coefficients.\n"
    "\n"
    "**Impact**:\n"
    "- Reduces the model complexity and improves generalization to unseen data.\n"
)

# Q4 answer text: what the ROC curve and AUC say about a binary classifier.
# Markdown-formatted; one source literal per output line.
roc_curve_explanation = (
    "\n"
    "**ROC Curve (Receiver Operating Characteristic Curve)**:\n"
    "- **Purpose**: To evaluate the performance of a binary classification model by plotting the true positive rate (sensitivity) against the false positive rate (1 - specificity).\n"
    "- **AUC (Area Under the Curve)**: Represents the overall performance. AUC of 1 indicates a perfect model, and AUC of 0.5 indicates a random model.\n"
    "\n"
    "**How to Use**:\n"
    "- **Calculate ROC Curve**: Plot the curve based on the true positive rate and false positive rate at various thresholds.\n"
    "- **Evaluate Performance**: Higher AUC values represent better model performance.\n"
)

# Q5 answer text: common feature-selection techniques and their effect.
# Markdown-formatted; one source literal per output line.
feature_selection_techniques = (
    "\n"
    "**Common Techniques**:\n"
    "1. **Recursive Feature Elimination (RFE)**: Iteratively builds the model and removes the weakest features.\n"
    "2. **L1 Regularization**: Lasso regularization can shrink some coefficients to zero, effectively performing feature selection.\n"
    "3. **Univariate Selection**: Select features based on statistical tests (e.g., chi-squared test).\n"
    "4. **Feature Importance from Tree-Based Models**: Use models like Random Forest to rank features by importance.\n"
    "\n"
    "**Impact**:\n"
    "- Improves model performance by removing irrelevant or redundant features.\n"
    "- Reduces overfitting and enhances model interpretability.\n"
)

# Q6 answer text: strategies for training on class-imbalanced data.
# Markdown-formatted; the three-space indents on the sub-bullets are part of
# the string and nest them under their numbered items.
imbalanced_datasets_strategies = (
    "\n"
    "**Strategies**:\n"
    "1. **Resampling Techniques**:\n"
    "   - **Oversampling**: Increase the number of instances in the minority class (e.g., SMOTE).\n"
    "   - **Undersampling**: Decrease the number of instances in the majority class.\n"
    "\n"
    "2. **Class Weights**:\n"
    "   - **Adjust Class Weights**: Increase the weight of the minority class in the loss function to penalize misclassifications more heavily.\n"
    "\n"
    "3. **Ensemble Methods**:\n"
    "   - **Balanced Random Forest**: Combine multiple balanced trees to improve performance on imbalanced data.\n"
    "\n"
    "4. **Threshold Adjustment**:\n"
    "   - **Adjust Decision Threshold**: Change the threshold for classification to better balance precision and recall.\n"
)

# Q7 answer text: common pitfalls when using logistic regression and their remedies.
# Markdown-formatted; the three-space indents on the Problem/Solution bullets
# are part of the string and nest them under their numbered items.
logistic_regression_issues = (
    "\n"
    "**Common Issues and Challenges**:\n"
    "1. **Multicollinearity**:\n"
    "   - **Problem**: High correlation between independent variables can make it difficult to determine the individual effect of each variable.\n"
    "   - **Solution**: Use techniques such as Variance Inflation Factor (VIF) to detect multicollinearity and consider removing or combining correlated features.\n"
    "\n"
    "2. **Overfitting**:\n"
    "   - **Problem**: The model may fit the training data too closely and perform poorly on new data.\n"
    "   - **Solution**: Use regularization techniques (L1/L2), cross-validation, and feature selection to address overfitting.\n"
    "\n"
    "3. **Imbalanced Data**:\n"
    "   - **Problem**: Class imbalance can lead to biased predictions.\n"
    "   - **Solution**: Apply resampling techniques, adjust class weights, or use different evaluation metrics.\n"
)

# Worked example: train a logistic-regression classifier on synthetic data and
# plot its ROC curve on a held-out split.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42,
)
# Keep 30% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Second column of predict_proba = score for the second class in
# model.classes_ (presumably label 1 here, given make_classification's 0/1 labels).
y_prob = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Draw the curve, with the AUC shown in the legend entry.
plt.figure(figsize=(8, 6))
auc_legend = 'ROC curve (area = %0.2f)' % roc_auc
plt.plot(fpr, tpr, color='blue', linewidth=2, label=auc_legend)
# Dashed diagonal: reference line where TPR equals FPR.
plt.plot((0, 1), (0, 1), color='gray', linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
# Slight headroom above 1.0 so the top of the curve is not clipped by the frame.
plt.ylim(0.0, 1.05)
plt.legend(loc='lower right')
plt.show()

# Print every question heading followed by its answer text.
# Headings after the first carry a leading newline so consecutive sections are
# separated by a blank line in the output.
qa_sections = (
    ("Q1: Difference between Linear Regression and Logistic Regression", linear_vs_logistic_regression),
    ("\nQ2: Cost Function in Logistic Regression and Optimization", logistic_cost_function),
    ("\nQ3: Regularization in Logistic Regression", regularization_logistic_regression),
    ("\nQ4: ROC Curve in Logistic Regression", roc_curve_explanation),
    ("\nQ5: Feature Selection Techniques in Logistic Regression", feature_selection_techniques),
    ("\nQ6: Handling Imbalanced Datasets in Logistic Regression", imbalanced_datasets_strategies),
    ("\nQ7: Common Issues in Logistic Regression", logistic_regression_issues),
)
for heading, answer in qa_sections:
    print(heading)
    print(answer)
