# Scikit-Learn Practical Examples

Hands-on, step-by-step examples for classification, regression, and clustering using scikit-learn. Each section includes data prep, model training, evaluation, and brief interpretation.

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                            mean_squared_error, r2_score, silhouette_score)
import seaborn as sns
# Global plot styling: white grid background, element sizes tuned for notebooks.
sns.set(context='notebook', style='whitegrid')


## 1) Classification: Logistic Regression (Iris dataset)

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the Iris feature matrix and the integer class labels.
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names

# Hold out 20% of the samples for testing; stratify on y so both splits
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only and reuse it for the test split,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc:.3f}')
# Line breaks must be written as \n escapes: a single-quoted string literal
# cannot span physical lines.
print('\nClassification Report:\n', classification_report(y_test, y_pred, target_names=iris.target_names))

# Confusion matrix heatmap: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=iris.target_names, yticklabels=iris.target_names)
plt.title('Confusion Matrix - Logistic Regression (Iris)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


Interpretation: High accuracy and a diagonal-dominant confusion matrix indicate good class separability on standardized features.

## 2) Regression: Linear Regression (California Housing)

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression

# Dataset-specific variable names keep this section from overwriting the
# generically named Iris variables (X, y, X_train, ..., scaler, y_pred)
# created in the classification section.
cal = fetch_california_housing()
X_cal = cal.data
y_cal = cal.target

X_cal_train, X_cal_test, y_cal_train, y_cal_test = train_test_split(
    X_cal, y_cal, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to the test split.
cal_scaler = StandardScaler()
X_cal_train_scaled = cal_scaler.fit_transform(X_cal_train)
X_cal_test_scaled = cal_scaler.transform(X_cal_test)

reg = LinearRegression()
reg.fit(X_cal_train_scaled, y_cal_train)
y_cal_pred = reg.predict(X_cal_test_scaled)

# RMSE is reported instead of MSE so the error is in the same units as the target.
mse = mean_squared_error(y_cal_test, y_cal_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_cal_test, y_cal_pred)
print(f'RMSE: {rmse:.3f}, R2: {r2:.3f}')

plt.figure()
plt.scatter(y_cal_test, y_cal_pred, alpha=0.5)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.title('Linear Regression Predictions vs True (California Housing)')
# Dashed y = x reference line: perfect predictions would fall exactly on it.
target_range = [y_cal_test.min(), y_cal_test.max()]
plt.plot(target_range, target_range, 'r--')
plt.show()


Interpretation: An $R^2$ closer to 1 indicates a better fit; also check how closely the points in the scatter plot follow the dashed diagonal, where predictions equal the true values.

## 3) Clustering: K-Means (Iris features)

In [None]:
from sklearn.cluster import KMeans

# Cluster the raw Iris measurements. They are left unscaled here for
# demonstration, although scaling often helps.
iris_features = iris.data
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(iris_features)

# Silhouette is computed on the same features that were clustered.
sil = silhouette_score(iris_features, clusters)
print(f'Silhouette Score: {sil:.3f}')

# Show the cluster assignments and centroids projected onto the first two
# feature columns.
centers = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(iris_features[:, 0], iris_features[:, 1], c=clusters, cmap='viridis', alpha=0.7)
ax.scatter(centers[:, 0], centers[:, 1], c='red', s=150, marker='X', label='Centroids')
ax.set_xlabel(iris.feature_names[0])
ax.set_ylabel(iris.feature_names[1])
ax.set_title('K-Means Clustering on Iris (first two features)')
ax.legend()
plt.show()


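Interpretation: A silhouette score above 0.5 suggests reasonably distinct clusters. The scatter plot visualizes separability in a 2D feature space, but it shows only the first two of the features used for clustering, so clusters that overlap in this view may still be separated along the other features.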

## 4) Model Selection: GridSearchCV (SVM on Iris)

In [None]:
from sklearn.svm import SVC

# Build the Iris split inside this cell under dedicated names instead of
# relying on generic X_train / y_train globals, which any earlier section may
# have rebound to a different dataset. SVC needs the discrete Iris class
# labels, not continuous targets.
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42, stratify=iris.target
)

# Fit the scaler on the training split only, then apply it to the test split.
iris_scaler = StandardScaler()
X_iris_train_scaled = iris_scaler.fit_transform(X_iris_train)
X_iris_test_scaled = iris_scaler.transform(X_iris_test)

# Every combination of these values is tried. gamma has no effect on the
# 'linear' kernel, so those combinations differ only in C.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['rbf', 'linear']
}
svm = SVC()
grid = GridSearchCV(svm, param_grid, cv=5, n_jobs=-1)
grid.fit(X_iris_train_scaled, y_iris_train)
print('Best params:', grid.best_params_)
# With the default refit=True, best_estimator_ has been retrained on the whole
# training split using the best parameters.
best_model = grid.best_estimator_
print('CV best score:', grid.best_score_)
# The held-out test split is used only once, for the final accuracy estimate.
test_acc = accuracy_score(y_iris_test, best_model.predict(X_iris_test_scaled))
print('Test accuracy:', test_acc)


Notes: Use scaling for SVM; GridSearchCV explores hyperparameters and reports the best configuration.