# What is supervised learning?
    Supervised learning is training a model on labeled data (inputs paired with their correct answers). The model learns from these examples so that it can predict the outcome for new, unseen data. It is like a teacher showing the correct answers during practice.

# What is classification?
    Classification is a type of supervised learning in which data is sorted into different categories. For example, deciding whether an incoming email is spam or not. The output of classification is always a label or class.

# What is regression?
    Regression is also a type of supervised learning, used to predict numeric values. For example, predicting house prices based on size, neighbourhood and location. Unlike classification, the output is a continuous value, not a category.

In [9]:
# Compare k-NN classifier performance using different distance metrics (Euclidean, Manhattan, Minkowski) on the Iris dataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Iris data: feature matrix goes into X, class labels into y
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One 3-nearest-neighbour classifier per distance metric, trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
knn_euclidean = KNeighborsClassifier(n_neighbors=3, metric='euclidean').fit(X_train, y_train)
knn_manhattan = KNeighborsClassifier(n_neighbors=3, metric='manhattan').fit(X_train, y_train)
knn_minkowski = KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3).fit(X_train, y_train)

# Label predictions for the held-out samples, one set per classifier
y_pred_euclidean = knn_euclidean.predict(X_test)
y_pred_manhattan = knn_manhattan.predict(X_test)
y_pred_minkowski = knn_minkowski.predict(X_test)

# Accuracy = fraction of test samples whose predicted label matches the true label
accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)
accuracy_minkowski = accuracy_score(y_test, y_pred_minkowski)

# Report the three accuracies one after another for comparison
for caption, score in (
    ("Accuracy (Euclidean):", accuracy_euclidean),
    ("Accuracy (Manhattan):", accuracy_manhattan),
    ("Accuracy (Minkowski):", accuracy_minkowski),
):
    print(caption, score)

Accuracy (Euclidean): 1.0
Accuracy (Manhattan): 1.0
Accuracy (Minkowski): 1.0


In [10]:
# Compare Decision Tree and Random Forest classifiers on the Iris dataset and evaluate accuracy + feature importance

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Iris data: feature matrix X and class labels y
iris = load_iris()
X = iris.data
y = iris.target

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters shared by the single tree and by every tree in the forest
tree_params = dict(criterion='gini', max_depth=4, random_state=42)

# Single decision tree: train, predict on the test split, score
dt = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Random forest of 100 trees: train, predict on the test split, score
rf = RandomForestClassifier(n_estimators=100, **tree_params).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Accuracy (Decision Tree):", accuracy_dt)
print("Accuracy (Random Forest):", accuracy_rf)

# Per-feature importance scores exposed by the fitted forest
importances = rf.feature_importances_
print("Feature Importances:", importances)


Accuracy (Decision Tree): 1.0
Accuracy (Random Forest): 1.0
Feature Importances: [0.10509878 0.0212238  0.45005788 0.42361954]


In [11]:
# Linear Regression and Logistic Regression examples using different datasets

from sklearn.datasets import fetch_california_housing, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler

# --- Part 1: linear regression on the California Housing data ---
housing = fetch_california_housing()
X = housing.data
y = housing.target

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained
lr = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out split with mean squared error
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (Linear Regression):", mse)

# --- Part 2: logistic regression on the Breast Cancer data, with feature scaling ---
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Solver iteration cap set to 2000
logistic = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Score the classifier on the held-out split
y_pred = logistic.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Logistic Regression):", accuracy)


Mean Squared Error (Linear Regression): 0.555891598695242
Accuracy (Logistic Regression): 0.9736842105263158


In [12]:
# Evaluate various classification and regression metrics using appropriate datasets

import math

from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)
from sklearn.preprocessing import StandardScaler

# Classification metrics example (Iris, three classes)
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features for logistic regression; the scaler is fitted on the
# training split only and then applied unchanged to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logistic = LogisticRegression(max_iter=2000)
logistic.fit(X_train, y_train)
y_pred = logistic.predict(X_test)

# Per-class probabilities, computed once. The one-vs-rest AUC below needs the
# full matrix (one column per class), not a single class column.
y_proba = logistic.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
# average='macro': compute the metric per class, then take the unweighted mean
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("Confusion Matrix:\n", cm)
print("ROC AUC Score:", auc)

# Regression metrics example using California Housing dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE. It is derived here directly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared:", r2)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
ROC AUC Score: 1.0
Mean Squared Error (MSE): 0.555891598695242
Root Mean Squared Error (RMSE): 0.7455813830127748
Mean Absolute Error (MAE): 0.533200130495698
R-squared: 0.5757877060324526
