# Scikit-Learn Practice Questions

This notebook covers essential scikit-learn operations with simple examples including train-test split, model training, and evaluation.

In [None]:
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.datasets import load_iris, make_regression

# Confirm the imports succeeded and record the installed scikit-learn version.
# The version is read from the imported module instead of going through the
# __import__() builtin, which is meant for dynamic imports only.
print("All libraries imported successfully!")
print(f"Scikit-learn version: {sklearn.__version__}")

## 1. Train-Test Split Basic Example

Learn how to split data into training and testing sets.

In [None]:
# Load the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X = iris.data
y = iris.target

print(
    "Dataset Information:",
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Classes: {iris.target_names}",
    "",
    sep="\n",
)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(
    "Train-Test Split (70-30):",
    f"Training set size: {X_train.shape[0]} samples",
    f"Test set size: {X_test.shape[0]} samples",
    "",
    sep="\n",
)

# Pie chart showing the share of samples that landed in each partition
labels = ['Training Set', 'Test Set']
sizes = [len(part) for part in (X_train, X_test)]
colors = ['lightblue', 'lightcoral']

fig, ax = plt.subplots(figsize=(8, 5))
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax.set_title('Train-Test Split Distribution')
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

## 2. Classification Example

Build and train a simple classification model.

In [None]:
# Train a logistic-regression classifier on the split produced by the previous
# cell (X_train / X_test / y_train / y_test and the `iris` bunch must exist).
print("Training a Logistic Regression Model:")
print()

# Create and train the model.
# max_iter is raised above the default of 100: on unscaled features the lbfgs
# solver can hit the iteration cap before converging and emit a
# ConvergenceWarning.
model = LogisticRegression(max_iter=200, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print()

# Show some predictions vs actual.
# Preview at most 10 rows, but never index past the end of a small test set.
n_preview = min(10, len(y_test))
# One shared column width so the header and every row line up; it is wide
# enough for the longest class name.
col_width = max(len("Predicted"), *(len(name) for name in iris.target_names))

print("Sample Predictions vs Actual:")
print(f"{'Predicted':<{col_width}} | {'Actual':<{col_width}} | Correct?")
print("-" * (2 * col_width + 6 + len("Correct?")))
for predicted, actual in zip(y_pred[:n_preview], y_test[:n_preview]):
    correct = "✓" if predicted == actual else "✗"
    print(
        f"{iris.target_names[predicted]:<{col_width}} | "
        f"{iris.target_names[actual]:<{col_width}} | {correct}"
    )

# Confusion matrix: rows are actual classes, columns are predicted classes.
# Passing labels explicitly keeps one row/column per class even if a class is
# absent from the test split, so the tick labels below always match the matrix.
class_ids = np.arange(len(iris.target_names))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
print()
print("Confusion Matrix:")
print(cm)

# Plot confusion matrix using matplotlib only
fig, ax = plt.subplots(figsize=(8, 6))
image = ax.imshow(cm, interpolation='nearest', cmap='Blues')
ax.set_title('Confusion Matrix')
fig.colorbar(image, ax=ax)

# Add text annotations; switch to white text on the darker (high-count) cells
# so the numbers stay legible against the 'Blues' colormap.
threshold = cm.max() / 2
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, str(cm[i, j]), ha='center', va='center', fontsize=16,
                color='white' if cm[i, j] > threshold else 'black')

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_xticks(class_ids)
ax.set_xticklabels(iris.target_names)
ax.set_yticks(class_ids)
ax.set_yticklabels(iris.target_names)
fig.tight_layout()
plt.show()

## 3. Regression Example

Build and train a simple regression model.

In [None]:
# Build a small synthetic regression problem with a single noisy feature
X_reg, y_reg = make_regression(
    n_samples=100, n_features=1, noise=10, random_state=42
)

print(
    "Regression Dataset Information:",
    f"Number of samples: {X_reg.shape[0]}",
    f"Number of features: {X_reg.shape[1]}",
    "",
    sep="\n",
)

# Reserve 30% of the samples for evaluation
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

print(
    f"Training set: {X_train_reg.shape[0]} samples",
    f"Test set: {X_test_reg.shape[0]} samples",
    "",
    sep="\n",
)

# Fit ordinary least squares on the training portion
model_reg = LinearRegression()
model_reg.fit(X_train_reg, y_train_reg)

# Predict the held-out targets
y_pred_reg = model_reg.predict(X_test_reg)

# Score the held-out predictions
mse = mean_squared_error(y_test_reg, y_pred_reg)
r2 = r2_score(y_test_reg, y_pred_reg)

print(
    "Model Performance:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.4f}",
    "",
    sep="\n",
)

# Two side-by-side panels: the fitted line, and predicted-vs-actual
fig, (ax_fit, ax_parity) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training points with the fitted regression line on top
ax_fit.scatter(X_train_reg, y_train_reg, alpha=0.6, label='Training Data')
ax_fit.plot(X_train_reg, model_reg.predict(X_train_reg), 'r-', label='Regression Line')
ax_fit.set_xlabel('Feature')
ax_fit.set_ylabel('Target')
ax_fit.set_title('Training Data with Regression Line')
ax_fit.legend()

# Right panel: test predictions against the true values; the dashed diagonal
# marks where predicted == actual
ax_parity.scatter(y_test_reg, y_pred_reg, alpha=0.6)
diagonal = [y_test_reg.min(), y_test_reg.max()]
ax_parity.plot(diagonal, diagonal, 'r--', linewidth=2)
ax_parity.set_xlabel('Actual Values')
ax_parity.set_ylabel('Predicted Values')
ax_parity.set_title(f'Predictions vs Actual (R² = {r2:.3f})')

fig.tight_layout()
plt.show()

## 4. Model Comparison

Compare different models and see which performs better.

In [None]:
# Compare two classifiers on the same iris train/test split and chart their
# test-set accuracy.
X, y = load_iris(return_X_y=True)

# Split the data (same 70/30 split and seed for every model, so the scores are
# directly comparable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Different models to compare
models = {
    # max_iter is raised above the default of 100: on unscaled features the
    # lbfgs solver can stop before converging and emit a ConvergenceWarning.
    'Logistic Regression': LogisticRegression(max_iter=200, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42)
}

# Train and evaluate each model; model_scores maps model name -> test accuracy
model_scores = {}

print("Model Comparison Results:")
print("-" * 40)

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    model_scores[name] = accuracy

    print(f"{name}: {accuracy:.4f}")

print()

# Find the best model. max() returns the first maximal entry, so when several
# models tie, the one listed first in `models` is reported.
best_model_name = max(model_scores, key=model_scores.get)
print(f"Best Model: {best_model_name} with accuracy: {model_scores[best_model_name]:.4f}")

# Visualize model comparison
fig, ax = plt.subplots(figsize=(10, 6))
model_names = list(model_scores.keys())
accuracies = list(model_scores.values())

bars = ax.bar(model_names, accuracies, color=['skyblue', 'lightgreen'], edgecolor='black')
ax.set_title('Model Comparison - Accuracy Scores', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy')
# Leave headroom above 1.0: the value labels sit just above each bar and would
# otherwise be drawn outside the axes (into the title) for near-perfect scores.
ax.set_ylim(0, 1.1)

# Add value labels on bars. The loop variable is named `score` so it does not
# overwrite the `accuracy` computed in the evaluation loop above.
for bar, score in zip(bars, accuracies):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
            f'{score:.4f}', ha='center', va='bottom', fontweight='bold')

ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Making Predictions on New Data

Learn how to use a trained model to make predictions on new, unseen data.

In [None]:
# Train a Random Forest on an 80/20 split of iris, then classify a few
# hand-written measurements and show the predicted class probabilities.

# Load the dataset once; the returned object carries the features, the labels
# and the class names used for display below.
iris = load_iris()
X, y = iris.data, iris.target

# Split and train (20% of the rows are held out for the accuracy check)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X_train, y_train)

print("Model trained successfully!")
print(f"Training accuracy: {final_model.score(X_train, y_train):.4f}")
print(f"Test accuracy: {final_model.score(X_test, y_test):.4f}")
print()

# Create some sample measurements for prediction.
# Column order: sepal length, sepal width, petal length, petal width.
new_samples = np.array([
    [5.1, 3.5, 1.4, 0.2],  # Small petals: Setosa-like
    [6.2, 2.8, 4.8, 1.8],  # Borderline: petal width is in the Virginica range, so expect Virginica rather than Versicolor
    [7.2, 3.0, 5.8, 2.0]   # Large petals: Virginica-like
])

print("Making predictions on new data:")
print("Features: [Sepal Length, Sepal Width, Petal Length, Petal Width]")
print()

# Make predictions: one class index per sample, plus one probability per class
predictions = final_model.predict(new_samples)
prediction_probabilities = final_model.predict_proba(new_samples)

for sample_no, (sample, pred, probs) in enumerate(
        zip(new_samples, predictions, prediction_probabilities), start=1):
    print(f"Sample {sample_no}: {sample}")
    print(f"Predicted class: {iris.target_names[pred]}")
    print("Prediction probabilities:")
    for class_name, prob in zip(iris.target_names, probs):
        print(f"  {class_name}: {prob:.3f}")
    print("-" * 40)

# Visualize predictions as grouped bars: one group per sample, one bar per class
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(new_samples))
width = 0.25

for i, class_name in enumerate(iris.target_names):
    # Shift each class by one bar width so the bars in a group sit side by side
    ax.bar(x_pos + i * width, prediction_probabilities[:, i], width,
           label=class_name, alpha=0.7)

ax.set_xlabel('Sample Number')
ax.set_ylabel('Prediction Probability')
ax.set_title('Prediction Probabilities for New Samples')
# With three bars per group, an offset of one bar width puts the tick under the
# middle bar.
ax.set_xticks(x_pos + width)
ax.set_xticklabels([f'Sample {n}' for n in range(1, len(new_samples) + 1)])
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print("\nKey Takeaways:")
print("1. Always split your data before training")
print("2. Train on training set, evaluate on test set")
print("3. Use the trained model to predict new data")
print("4. Check prediction probabilities for confidence")