# Scikit-Learn Practice Questions

This notebook covers essential scikit-learn operations: train-test splitting, model training and evaluation for classification and regression, feature scaling, and hyperparameter tuning.

In [None]:
# Import required libraries
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import make_classification, make_regression, load_iris

# load_boston was removed in scikit-learn 1.2, where importing it raises
# ImportError. Guard the import so this cell still runs on current releases;
# the name stays bound (to None when unavailable) for any code that checks it.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

# Silence every warning so cell outputs stay short.
# NOTE(review): this blanket filter also hides convergence and deprecation
# warnings raised by the models below.
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")
print(f"Scikit-learn version: {sklearn.__version__}")

## 1. Train-Test Split Demonstration

Demonstrate various train-test split strategies and their importance in machine learning workflows.

In [None]:
# Build a synthetic three-class dataset that the rest of this cell splits
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    n_classes=3,
    random_state=42,
)

print(
    "Dataset Information:",
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Number of classes: {len(np.unique(y))}",
    f"Class distribution: {np.bincount(y)}",
    "",
    sep="\n",
)

# Plain 80/20 split, no stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(
    "Basic Train-Test Split (80-20):",
    f"Training set size: {X_train.shape[0]} samples",
    f"Test set size: {X_test.shape[0]} samples",
    f"Training set class distribution: {np.bincount(y_train)}",
    f"Test set class distribution: {np.bincount(y_test)}",
    "",
    sep="\n",
)

# Same 80/20 ratio, this time passing stratify=y
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(
    "Stratified Train-Test Split (80-20):",
    f"Training set class distribution: {np.bincount(y_train_strat)}",
    f"Test set class distribution: {np.bincount(y_test_strat)}",
    "",
    sep="\n",
)

# Three-way split (60-20-20): hold out 20% as the test set first, then carve
# the validation set out of the remaining 80% (0.25 of 0.8 = 0.2 overall).
X_temp, X_test_final, y_temp, y_test_final = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)

print("Train-Validation-Test Split (60-20-20):")
for label, subset in (
    ("Training set", X_train_final),
    ("Validation set", X_val),
    ("Test set", X_test_final),
):
    n_rows = subset.shape[0]
    print(f"{label}: {n_rows} samples ({n_rows/len(X)*100:.1f}%)")
print()

# Three side-by-side panels: the full dataset, the basic split, the stratified split
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 0: class counts of the whole dataset
overall_counts = np.bincount(y)
axes[0].bar(range(len(overall_counts)), overall_counts)
axes[0].set(title='Original Class Distribution', xlabel='Class', ylabel='Count')

# Panels 1 and 2: grouped bars (train next to test) for each split strategy
width = 0.35
x = np.arange(len(np.unique(y)))
grouped_panels = (
    (axes[1], 'Basic Split Distribution', y_train, y_test),
    (axes[2], 'Stratified Split Distribution', y_train_strat, y_test_strat),
)
for ax, panel_title, train_labels, test_labels in grouped_panels:
    ax.bar(x - width/2, np.bincount(train_labels), width, label='Train', alpha=0.7)
    ax.bar(x + width/2, np.bincount(test_labels), width, label='Test', alpha=0.7)
    ax.set(title=panel_title, xlabel='Class', ylabel='Count')
    ax.legend()

plt.tight_layout()
plt.show()

# Demonstrate different random states
# Each seed produces its own unstratified 80/20 split; printing the per-class
# counts lets the splits be compared side by side.
print("Effect of different random states:")
for random_state in (42, 100, 999):
    X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    print(f"Random state {random_state}: Train class dist = {np.bincount(y_train_rs)}, "
          f"Test class dist = {np.bincount(y_test_rs)}")

## 2. Classification Example with Model Training

Build and evaluate classification models using the train-test split data.

In [None]:
# Switch to the iris dataset for the classification walkthrough
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

print(
    "Iris Dataset Information:",
    f"Features: {iris.feature_names}",
    f"Classes: {iris.target_names}",
    f"Dataset shape: {X_iris.shape}",
    "",
    sep="\n",
)

# 70/30 stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X_iris,
    y_iris,
    test_size=0.3,
    stratify=y_iris,
    random_state=42,
)

print(
    f"Training set: {X_train.shape[0]} samples",
    f"Test set: {X_test.shape[0]} samples",
    "",
    sep="\n",
)

# Candidate classifiers, keyed by the display name used in the printed output
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
}

# Fit each classifier on the training split, score it on the test split, and
# keep the fitted model, its accuracy and its predictions under its name.
results = {}
for name, clf in classifiers.items():
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = dict(model=clf, accuracy=accuracy, predictions=y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

print()

# Pick the entry with the highest test accuracy (the first one wins on ties)
best_model_name = max(results, key=lambda model_key: results[model_key]['accuracy'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

print(f"Best Model: {best_model_name}")
print("\nClassification Report:")
print(classification_report(y_test, best_predictions, target_names=iris.target_names))

# Confusion matrix heatmap: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, best_predictions)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
    ax=cm_ax,
)
cm_ax.set_title(f'Confusion Matrix - {best_model_name}')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# 5-fold cross-validation of every classifier on the full iris data,
# reported as the mean score and twice the standard deviation.
print("\nCross-Validation Scores (5-fold):")
for name, clf in classifiers.items():
    cv_scores = cross_val_score(clf, X_iris, y_iris, cv=5)
    cv_mean = cv_scores.mean()
    cv_spread = cv_scores.std() * 2
    print(f"{name}: {cv_mean:.4f} (+/- {cv_spread:.4f})")

## 3. Regression Example with Model Training

Build and evaluate regression models using train-test split methodology.

In [None]:
# Synthetic regression problem: 500 samples, 10 features (8 informative)
X_reg, y_reg = make_regression(
    n_samples=500,
    n_features=10,
    n_informative=8,
    noise=0.1,
    random_state=42,
)

print(
    "Regression Dataset Information:",
    f"Features shape: {X_reg.shape}",
    f"Target shape: {y_reg.shape}",
    f"Target mean: {y_reg.mean():.2f}",
    f"Target std: {y_reg.std():.2f}",
    "",
    sep="\n",
)

# 80/20 split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

print(
    f"Training set: {X_train_reg.shape[0]} samples",
    f"Test set: {X_test_reg.shape[0]} samples",
    "",
    sep="\n",
)

# Regressors to compare, keyed by display name
regressors = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Fit each regressor on the training split and record its test-set MSE, RMSE
# and R² together with the fitted model and its predictions.
reg_results = {}
for name, reg in regressors.items():
    y_pred_reg = reg.fit(X_train_reg, y_train_reg).predict(X_test_reg)

    mse = mean_squared_error(y_test_reg, y_pred_reg)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_reg, y_pred_reg)

    reg_results[name] = dict(
        model=reg,
        mse=mse,
        rmse=rmse,
        r2=r2,
        predictions=y_pred_reg,
    )

    print(
        f"{name}:",
        f"  MSE: {mse:.4f}",
        f"  RMSE: {rmse:.4f}",
        f"  R² Score: {r2:.4f}",
        "",
        sep="\n",
    )

# Visualize predictions vs actual values
# One panel per entry in reg_results. squeeze=False keeps `axes` two-dimensional
# even when only a single model is present, so the loops below work for any
# number of regressors. The loop variables use their own names so they do not
# overwrite the classification cell's `results` dict or its `y_pred` array.
n_models = len(reg_results)
fig, axes = plt.subplots(1, n_models, figsize=(15, 6), squeeze=False)

# End points of the y = x reference line (a perfect prediction lies on it)
value_range = [y_test_reg.min(), y_test_reg.max()]

for ax, (model_name, reg_result) in zip(axes[0], reg_results.items()):
    ax.scatter(y_test_reg, reg_result['predictions'], alpha=0.6)
    ax.plot(value_range, value_range, 'r--', linewidth=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(f"{model_name}\nR² = {reg_result['r2']:.4f}")
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Residual plots
# Residual = actual - predicted, drawn against the predicted value with a
# dashed zero line for reference.
fig, axes = plt.subplots(1, n_models, figsize=(15, 6), squeeze=False)

for ax, (model_name, reg_result) in zip(axes[0], reg_results.items()):
    model_predictions = reg_result['predictions']
    residuals = y_test_reg - model_predictions

    ax.scatter(model_predictions, residuals, alpha=0.6)
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'{model_name} - Residual Plot')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Feature Scaling and Preprocessing

Demonstrate why the train-test split must happen before preprocessing: the scaler is fitted on the training set only and then applied, unchanged, to the test set.

In [None]:
# Build three features on very different numeric scales plus a noisy linear target
np.random.seed(42)
n_samples = 1000

# Draw order matters for reproducibility: feature1, feature2, feature3, then noise
feature1 = np.random.normal(loc=100, scale=15, size=n_samples)      # Large scale
feature2 = np.random.normal(loc=5, scale=2, size=n_samples)         # Medium scale
feature3 = np.random.normal(loc=0.01, scale=0.005, size=n_samples)  # Small scale
noise = np.random.normal(loc=0, scale=10, size=n_samples)

X_scale = np.column_stack([feature1, feature2, feature3])
y_scale = feature1 * 0.5 + feature2 * 2 + feature3 * 1000 + noise

print("Original Feature Statistics:")
# The small-scale feature needs four decimals to show anything but zeros
for idx, (column, digits) in enumerate(
    ((feature1, 2), (feature2, 2), (feature3, 4)), start=1
):
    print(f"Feature {idx} - Mean: {column.mean():.{digits}f}, Std: {column.std():.{digits}f}")
print()

# Split before any preprocessing so the test rows never influence the scaler
X_train_scale, X_test_scale, y_train_scale, y_test_scale = train_test_split(
    X_scale, y_scale, test_size=0.2, random_state=42
)

# CORRECT WAY: learn the scaling statistics from the training data only,
# then apply that same fitted transform to both splits.
scaler = StandardScaler().fit(X_train_scale)
X_train_scaled = scaler.transform(X_train_scale)
X_test_scaled = scaler.transform(X_test_scale)  # Only transform, don't fit

# Per-feature mean/std of each scaled split
for heading, scaled_matrix in (
    ("After Scaling - Training Set Statistics:", X_train_scaled),
    ("After Scaling - Test Set Statistics:", X_test_scaled),
):
    print(heading)
    for i in range(scaled_matrix.shape[1]):
        mean = scaled_matrix[:, i].mean()
        std = scaled_matrix[:, i].std()
        print(f"Feature {i+1} - Mean: {mean:.4f}, Std: {std:.4f}")
    print()

# Compare models with and without scaling
# Baseline: linear regression on the raw features
lr_no_scale = LinearRegression().fit(X_train_scale, y_train_scale)
y_pred_no_scale = lr_no_scale.predict(X_test_scale)
r2_no_scale = r2_score(y_test_scale, y_pred_no_scale)

# Same model on the standardized features
lr_scaled = LinearRegression().fit(X_train_scaled, y_train_scale)
y_pred_scaled = lr_scaled.predict(X_test_scaled)
r2_scaled = r2_score(y_test_scale, y_pred_scaled)

print(
    "Model Performance Comparison:",
    f"Without scaling - R² Score: {r2_no_scale:.4f}",
    f"With scaling - R² Score: {r2_scaled:.4f}",
    "",
    sep="\n",
)

# Visualize the effect of scaling
# Top row: histograms of the raw features. Bottom row: the same features
# after standardization.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Scaled features
# A dedicated scaler is fitted for this full-dataset view. Calling
# fit_transform on `scaler` here would refit it on all rows and overwrite the
# training-only statistics it was fitted with.
X_all_scaled = StandardScaler().fit_transform(X_scale)  # For visualization only

for row, (label, values) in enumerate((('Original', X_scale), ('Scaled', X_all_scaled))):
    for col in range(3):
        axes[row, col].hist(values[:, col], bins=30, alpha=0.7, edgecolor='black')
        axes[row, col].set_title(f'{label} Feature {col+1}')
        axes[row, col].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Show coefficients comparison
# Prints the fitted coefficient vectors of the unscaled and scaled linear models.
print("Model Coefficients Comparison:")
print(f"Without scaling: {lr_no_scale.coef_}")
print(f"With scaling: {lr_scaled.coef_}")
print()
print("Note: Scaling makes coefficients more comparable and often improves model training.")

## 5. Hyperparameter Tuning with Proper Validation

Demonstrate hyperparameter tuning using GridSearchCV while maintaining proper train-test separation.

In [None]:
# Reload iris as plain (features, target) arrays for the tuning demo
X_iris, y_iris = load_iris(return_X_y=True)

# Hold out 20% as a test set that the grid search below never sees
X_train_hp, X_test_hp, y_train_hp, y_test_hp = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    stratify=y_iris,
    random_state=42,
)

print(
    "Hyperparameter Tuning Demonstration",
    f"Training set: {X_train_hp.shape[0]} samples",
    f"Test set: {X_test_hp.shape[0]} samples",
    "",
    sep="\n",
)

# Search space per model: the base estimator plus the grid of values to try
rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
}
svm_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}
param_grids = {
    'Random Forest': {'model': RandomForestClassifier(random_state=42), 'params': rf_grid},
    'SVM': {'model': SVC(random_state=42), 'params': svm_grid},
}

best_models = {}

for model_name, config in param_grids.items():
    print(f"Tuning {model_name}...")

    # 5-fold cross-validated grid search, fitted on the training split only
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train_hp, y_train_hp)

    # Keep the refitted best estimator for the final comparison
    tuned_model = grid_search.best_estimator_
    best_models[model_name] = tuned_model

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # Score the tuned model on the held-out test split
    test_score = tuned_model.score(X_test_hp, y_test_hp)
    print(f"Test set score: {test_score:.4f}")
    print()

# Compare all models on the test set
# Scores every tuned model on the held-out split and keeps the result by name.
print("Final Model Comparison on Test Set:")
test_scores = {}
for model_name, model in best_models.items():
    test_score = model.score(X_test_hp, y_test_hp)
    test_scores[model_name] = test_score
    print(f"{model_name}: {test_score:.4f}")

# Visualize hyperparameter tuning results
plt.figure(figsize=(10, 6))
models = list(test_scores.keys())
scores = list(test_scores.values())

bars = plt.bar(models, scores, color=['skyblue', 'lightcoral'], edgecolor='black')
plt.title('Model Performance After Hyperparameter Tuning', fontsize=16, fontweight='bold')
plt.ylabel('Test Accuracy', fontsize=14)
# Leave headroom above 1.0 so the value label of a perfect score is drawn
# inside the axes rather than over the title, and extend the lower limit when
# a score falls below 0.8 so that bar is still visible.
y_lower = min(0.8, min(scores) - 0.05) if scores else 0.8
plt.ylim(y_lower, 1.05)

# Add value labels on bars
for bar, score in zip(bars, scores):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
             f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Bar chart of the tuned Random Forest's feature importances, largest first
if 'Random Forest' in best_models:
    rf_model = best_models['Random Forest']
    feature_names = load_iris().feature_names

    plt.figure(figsize=(10, 6))
    feature_importance = rf_model.feature_importances_
    # argsort is ascending; reverse it to order the bars from most to least important
    sorted_idx = np.argsort(feature_importance)[::-1]
    bar_positions = range(len(feature_importance))
    tick_labels = [feature_names[i] for i in sorted_idx]

    plt.bar(
        bar_positions,
        feature_importance[sorted_idx],
        color='lightgreen',
        edgecolor='black',
    )
    plt.xticks(bar_positions, tick_labels, rotation=45)
    plt.title('Feature Importance - Best Random Forest Model', fontsize=16, fontweight='bold')
    plt.ylabel('Importance', fontsize=14)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Closing summary, printed one line per takeaway
print(
    "\nKey Takeaways:",
    "1. Always split your data BEFORE any preprocessing or hyperparameter tuning",
    "2. Use cross-validation on the training set for hyperparameter tuning",
    "3. Keep the test set completely separate until final evaluation",
    "4. The test set provides an unbiased estimate of model performance",
    sep="\n",
)