# KNN classification of Stress_Level
This notebook trains a K-Nearest Neighbors classifier to predict `Stress_Level` (Low / Moderate / High).
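It evaluates the model (classification report and confusion matrix) and saves 2D PCA, t-SNE, and UMAP visualizations colored by true and predicted stress levels, plus 3D views (raw features, PCA, t-SNE, UMAP) colored by the true stress level.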

In [32]:
%pip install seaborn

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# === Load data ===
student_data = pd.read_csv("student_lifestyle_dataset.csv")

# Report the size of the loaded table, then preview its first rows.
n_rows, n_cols = student_data.shape
print('Loaded', n_rows, 'rows and', n_cols, 'columns')
student_data.head()

Note: you may need to restart the kernel to use updated packages.
Loaded 2000 rows and 8 columns


Unnamed: 0,Student_ID,Study_Hours_Per_Day,Extracurricular_Hours_Per_Day,Sleep_Hours_Per_Day,Social_Hours_Per_Day,Physical_Activity_Hours_Per_Day,GPA,Stress_Level
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High


In [33]:
# Build the feature matrix X and the target y (Stress_Level: Low/Moderate/High).
# Student_ID is dropped when present; DataFrame.drop returns a new frame, so
# student_data itself is left untouched.
df = student_data.drop(columns=['Student_ID'], errors='ignore')
y = df['Stress_Level']
X = df.drop(columns=['Stress_Level'], errors='ignore')

# KNN is fitted on the numeric columns only (categoricals would need encoding first).
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
X_num = X.loc[:, numeric_cols].copy()

print('Numeric features used:', numeric_cols)
# Class distribution of the target (displayed as the cell output).
y.value_counts()

Numeric features used: ['Study_Hours_Per_Day', 'Extracurricular_Hours_Per_Day', 'Sleep_Hours_Per_Day', 'Social_Hours_Per_Day', 'Physical_Activity_Hours_Per_Day', 'GPA']


Stress_Level
High        1029
Moderate     674
Low          297
Name: count, dtype: int64

In [34]:
# Turn the string stress levels into integer class ids.
le = LabelEncoder()
y_enc = le.fit_transform(y)
print('Classes:', le.classes_)

# Hold out 20% of the rows for testing; stratifying keeps the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_num,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Search neighbour count and vote weighting with 5-fold CV on the training split.
# NOTE(review): X_train_s was standardised with statistics from the whole training
# split, so every CV validation fold has influenced the scaling; the CV accuracy may
# be slightly optimistic. Confirm whether a scaler+KNN Pipeline is wanted instead.
param_grid = dict(n_neighbors=[3, 5, 7, 9, 11], weights=['uniform', 'distance'])
knn = KNeighborsClassifier()
gs = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train_s, y_train)
print('Best params:', gs.best_params_, 'Best CV acc:', gs.best_score_)

Classes: ['High' 'Low' 'Moderate']
Best params: {'n_neighbors': 7, 'weights': 'distance'} Best CV acc: 0.903125
Best params: {'n_neighbors': 7, 'weights': 'distance'} Best CV acc: 0.903125


In [35]:
# Score the tuned model on the held-out test split.
best = gs.best_estimator_
y_pred = best.predict(X_test_s)
test_accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', test_accuracy)
print('Classification report:')
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
print(report_text)

# Draw the confusion matrix and write it under plots/knn (directory created when missing).
os.makedirs('plots/knn', exist_ok=True)
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6,6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(ax=ax, cmap='Blues')
ax.set_title('Confusion Matrix (test)')
fig.savefig('plots/knn/confusion_matrix.png')
# The figure is closed right after saving; the PNG on disk is the result of this cell.
plt.close(fig)
print('Saved confusion matrix to plots/knn/confusion_matrix.png')

Test accuracy: 0.9175
Classification report:
              precision    recall  f1-score   support

        High       0.94      0.99      0.97       206
         Low       0.84      0.80      0.82        59
    Moderate       0.91      0.86      0.88       135

    accuracy                           0.92       400
   macro avg       0.90      0.88      0.89       400
weighted avg       0.92      0.92      0.92       400

Saved confusion matrix to plots/knn/confusion_matrix.png
Saved confusion matrix to plots/knn/confusion_matrix.png


In [36]:
# PCA visualization (2D) of the full numeric dataset, colored by true and predicted stress level.
# Colors shared by the stress-level scatter plots.
palette = dict(Low='green', Moderate='yellow', High='red')

# Scale every row with the scaler that was fitted on the training split.
X_all_s = scaler.transform(X_num)

# Encoded true labels and model predictions for the whole dataset.
# NOTE(review): X_num contains the rows the model was trained on, so part of these
# predictions are in-sample; presumably the "predicted" plot therefore looks better
# than held-out performance. Confirm this is the intended view.
y_all = le.transform(y)
y_pred_all = best.predict(X_all_s)

# Project the scaled data onto its first two principal components.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_all_s)
# Fraction of the total variance captured by each of the two components.
explained = pca.explained_variance_ratio_

def top_features_for_pc(pc_index, n=3):
    """Describe the strongest contributors to one principal component.

    Reads the module-level ``pca`` (fitted PCA) and ``numeric_cols`` (feature names).

    Args:
        pc_index: Row index into ``pca.components_``.
        n: Number of features to list.

    Returns:
        Comma-separated ``"<feature> (<signed loading>)"`` entries, ordered by
        decreasing absolute loading.
    """
    loadings = pca.components_[pc_index]
    # Ascending argsort, reversed -> indices ordered by decreasing |loading|.
    ranked = np.abs(loadings).argsort()[::-1]
    return ', '.join(
        f"{numeric_cols[j]} ({loadings[j]:+.2f})" for j in ranked[:n]
    )

# Axis labels report each component's explained variance and its main contributors.
xlabel = f"PC1 ({explained[0]*100:.1f}% var) — top: {top_features_for_pc(0)}"
ylabel = f"PC2 ({explained[1]*100:.1f}% var) — top: {top_features_for_pc(1)}"

# One scatter per labelling: true labels first, then the model's predictions.
pca_plots = (
    (y,
     'PCA (2D) colored by TRUE Stress_Level',
     'plots/knn/pca_true.png',
     'Saved PCA true labels to plots/knn/pca_true.png'),
    (le.inverse_transform(y_pred_all),
     'PCA (2D) colored by PREDICTED Stress_Level',
     'plots/knn/pca_predicted.png',
     'Saved PCA predicted labels to plots/knn/pca_predicted.png'),
)
for hue_values, plot_title, out_path, done_msg in pca_plots:
    fig, ax = plt.subplots(figsize=(8,6))
    sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=hue_values, palette=palette, alpha=0.6, edgecolor='k', ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(out_path, dpi=150)
    # Close after saving so figures do not accumulate in pyplot.
    plt.close(fig)
    print(done_msg)

Saved PCA true labels to plots/knn/pca_true.png
Saved PCA predicted labels to plots/knn/pca_predicted.png
Saved PCA predicted labels to plots/knn/pca_predicted.png


In [37]:
# t-SNE visualization (2D)
from sklearn.manifold import TSNE

print('Computing t-SNE (this may take a moment)')
tsne = TSNE(n_components=2, random_state=42, init='pca', learning_rate='auto')
X_tsne = tsne.fit_transform(X_all_s)

# t-SNE axes have no loadings, so each axis is described by the original (scaled)
# features that correlate most strongly with it.
import pandas as _pd
X_scaled_df = _pd.DataFrame(X_all_s, columns=numeric_cols)
corrs_x, corrs_y = (
    X_scaled_df.corrwith(_pd.Series(X_tsne[:, axis])) for axis in range(2)
)

def top_corr_labels(corrs, n=3):
    """Format the ``n`` strongest correlations as a single label string.

    Args:
        corrs: pandas Series of correlation values indexed by feature name.
        n: Number of entries to include.

    Returns:
        Comma-separated ``"<feature> (<signed correlation>)"`` entries, ordered
        by decreasing absolute correlation.
    """
    # Rank by magnitude, but print the signed value taken from the original Series.
    strongest = corrs.abs().sort_values(ascending=False).index[:n]
    return ', '.join(f"{name} ({corrs[name]:+.2f})" for name in strongest)

# Axis labels list the features most correlated with each t-SNE axis.
xlabel = f"tSNE-1 — top corr: {top_corr_labels(corrs_x)}"
ylabel = f"tSNE-2 — top corr: {top_corr_labels(corrs_y)}"

# One scatter per labelling: true labels first, then the model's predictions.
tsne_plots = (
    (y,
     't-SNE (2D) colored by TRUE Stress_Level',
     'plots/knn/tsne_true.png',
     'Saved t-SNE true labels to plots/knn/tsne_true.png'),
    (le.inverse_transform(y_pred_all),
     't-SNE (2D) colored by PREDICTED Stress_Level',
     'plots/knn/tsne_predicted.png',
     'Saved t-SNE predicted labels to plots/knn/tsne_predicted.png'),
)
for hue_values, plot_title, out_path, done_msg in tsne_plots:
    fig, ax = plt.subplots(figsize=(8,6))
    sns.scatterplot(x=X_tsne[:,0], y=X_tsne[:,1], hue=hue_values, palette=palette, alpha=0.7, edgecolor='k', ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.savefig(out_path, dpi=150)
    # Close after saving so figures do not accumulate in pyplot.
    plt.close(fig)
    print(done_msg)

Computing t-SNE (this may take a moment)
Saved t-SNE true labels to plots/knn/tsne_true.png
Saved t-SNE true labels to plots/knn/tsne_true.png
Saved t-SNE predicted labels to plots/knn/tsne_predicted.png
Saved t-SNE predicted labels to plots/knn/tsne_predicted.png


In [38]:
%pip install umap-learn -q

import types

# Assume an embedding backend is available; cleared only if both imports below fail.
has_umap = True
try:
    import umap.umap_ as umap
except Exception:
    # umap could not be imported: expose an Isomap-backed object under the same
    # `umap.UMAP` name so the 2D embedding code can still run.
    try:
        from sklearn.manifold import Isomap

        class _ShimUMAP:
            """Stand-in offering fit / transform / fit_transform, backed by Isomap."""

            def __init__(self, n_components=2, random_state=None, **kwargs):
                # random_state and extra keyword arguments are accepted but not used.
                self._iso = Isomap(n_components=n_components)

            def fit_transform(self, X, y=None):
                return self._iso.fit_transform(X)

            def fit(self, X, y=None):
                self._iso.fit(X)
                return self

            def transform(self, X):
                return self._iso.transform(X)

        umap = types.SimpleNamespace(UMAP=_ShimUMAP)
        print("UMAP not installed; using Isomap shim as fallback for 2D embedding.")
    except Exception as shim_error:
        print('UMAP not available (install umap-learn to enable). Skipping UMAP. Error:', shim_error)
        has_umap = False

if has_umap:
    # Embed the scaled full dataset in 2D with `umap.UMAP` (the real class or the
    # Isomap shim, whichever was bound before this block).
    reducer = umap.UMAP(n_components=2, random_state=42)
    X_umap = reducer.fit_transform(X_all_s)

    # Correlate every scaled input feature with each embedding axis to label the axes.
    import pandas as _pd
    X_scaled_df = _pd.DataFrame(X_all_s, columns=numeric_cols)
    corrs_u0, corrs_u1 = (
        X_scaled_df.corrwith(_pd.Series(X_umap[:, axis])) for axis in range(2)
    )

    def top_corr_labels_umap(corrs, n=3):
        """Format the ``n`` largest-magnitude correlations as "<feature> (<signed value>)", comma-separated."""
        strongest = corrs.abs().sort_values(ascending=False).index[:n]
        return ', '.join(f"{name} ({corrs[name]:+.2f})" for name in strongest)

    xlabel = f"UMAP-1 — top corr: {top_corr_labels_umap(corrs_u0)}"
    ylabel = f"UMAP-2 — top corr: {top_corr_labels_umap(corrs_u1)}"

    # One scatter per labelling: true labels first, then the model's predictions.
    umap_plots = (
        (y,
         'UMAP (2D) colored by TRUE Stress_Level',
         'plots/knn/umap_true.png',
         'Saved UMAP true labels to plots/knn/umap_true.png'),
        (le.inverse_transform(y_pred_all),
         'UMAP (2D) colored by PREDICTED Stress_Level',
         'plots/knn/umap_predicted.png',
         'Saved UMAP predicted labels to plots/knn/umap_predicted.png'),
    )
    for hue_values, plot_title, out_path, done_msg in umap_plots:
        fig, ax = plt.subplots(figsize=(8,6))
        sns.scatterplot(x=X_umap[:,0], y=X_umap[:,1], hue=hue_values, palette=palette, alpha=0.7, edgecolor='k', ax=ax)
        ax.set_title(plot_title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.savefig(out_path, dpi=150)
        # Close after saving so figures do not accumulate in pyplot.
        plt.close(fig)
        print(done_msg)

Note: you may need to restart the kernel to use updated packages.


  warn(


Saved UMAP true labels to plots/knn/umap_true.png
Saved UMAP predicted labels to plots/knn/umap_predicted.png
Saved UMAP predicted labels to plots/knn/umap_predicted.png


In [39]:
# 3D visualizations: raw 3-feature scatter, PCA (3D), t-SNE (3D), UMAP (3D if available)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches

# Make sure the output directory for the saved figures exists (no error if it already does).
os.makedirs('plots/knn', exist_ok=True)

# choose three numeric features to plot (ensure they exist)
def choose_three_features(
    defaults=('Study_Hours_Per_Day', 'Sleep_Hours_Per_Day', 'Physical_Activity_Hours_Per_Day'),
    available=None,
):
    """Pick the three feature names used for the raw 3D scatter plot.

    Args:
        defaults: Preferred feature names, in plotting order. An immutable tuple
            is used as the default so the default value can never be mutated
            between calls.
        available: Feature names that may be plotted. When omitted, the
            module-level ``numeric_cols`` is used (the original behavior).

    Returns:
        A list of exactly three names: the first three entries of ``defaults``
        that occur in ``available``, or, when fewer than three of them occur,
        the first three entries of ``available``.

    Raises:
        ValueError: If fewer than three features are available, since the
            caller unpacks the result into exactly three names.
    """
    if available is None:
        available = numeric_cols
    available = list(available)

    preferred = [name for name in defaults if name in available]
    if len(preferred) >= 3:
        return preferred[:3]

    # fallback: take the first three available features
    fallback = available[:3]
    if len(fallback) < 3:
        # Fail here with a clear message instead of a bare unpacking error later.
        raise ValueError(
            f"A 3D scatter needs at least three numeric features, got {len(fallback)}: {fallback}"
        )
    return fallback

three_feats = choose_three_features()
print('Three features used for raw 3D scatter:', three_feats)

# Colors for the three stress levels (also used for the legend handles).
palette = dict(Low='green', Moderate='yellow', High='red')

# 3D scatter of the three raw (unscaled) features, colored by true stress level.
f1, f2, f3 = three_feats
# Per-point colors; any label missing from the palette is drawn gray.
colors = [palette.get(level, 'gray') for level in y]
# Legend handles built by hand, because the scatter receives raw colors rather than a hue.
patches = [mpatches.Patch(color=shade, label=level) for level, shade in palette.items()]

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_num[f1], X_num[f2], X_num[f3], c=colors, s=30, depthshade=True, edgecolor='k', alpha=0.7)
ax.set(xlabel=f1, ylabel=f2, zlabel=f3)
ax.legend(handles=patches, title='Stress_Level')
fig.savefig(f'plots/knn/3d_features_{f1}_{f2}_{f3}.png', dpi=150)
plt.close(fig)
print(f'Saved raw 3D features scatter to plots/knn/3d_features_{f1}_{f2}_{f3}.png')

# Optional interactive version of the raw-feature scatter. Any failure (including
# plotly not being importable) is reported with a message instead of stopping the cell.
try:
    import plotly.express as px
    df_plot = X_num[[f1, f2, f3]].assign(Stress_Level=y)
    figp = px.scatter_3d(
        df_plot,
        x=f1,
        y=f2,
        z=f3,
        color='Stress_Level',
        color_discrete_map=palette,
        opacity=0.7,
    )
    html_path = f'plots/knn/3d_features_{f1}_{f2}_{f3}.html'
    figp.write_html(html_path)
    print('Saved interactive 3D HTML to', html_path)
except Exception as plotly_error:
    print('Plotly not available or failed to create interactive 3D plot:', plotly_error)

# PCA 3D
# Project X_all_s (the scaled full dataset) onto its first three principal components.
pca3 = PCA(n_components=3, random_state=42)
X_pca3 = pca3.fit_transform(X_all_s)
# Fraction of the total variance captured by each of the three components.
expl3 = pca3.explained_variance_ratio_

def top_features_pc(pc_index, n=3):
    """Describe the strongest contributors to one component of the 3D PCA.

    Reads the module-level ``pca3`` (fitted PCA) and ``numeric_cols`` (feature names).

    Args:
        pc_index: Row index into ``pca3.components_``.
        n: Number of features to list.

    Returns:
        Comma-separated ``"<feature> (<signed loading>)"`` entries, ordered by
        decreasing absolute loading.
    """
    loadings = pca3.components_[pc_index]
    # Ascending argsort, reversed -> indices ordered by decreasing |loading|.
    ranked = np.abs(loadings).argsort()[::-1]
    entries = [f"{numeric_cols[j]} ({loadings[j]:+.2f})" for j in ranked[:n]]
    return ', '.join(entries)

# Axis labels report each component's explained variance and its main contributors.
xlabel = f"PC1 ({expl3[0]*100:.1f}% var) — top: {top_features_pc(0)}"
ylabel = f"PC2 ({expl3[1]*100:.1f}% var) — top: {top_features_pc(1)}"
zlabel = f"PC3 ({expl3[2]*100:.1f}% var) — top: {top_features_pc(2)}"

# Per-point colors from the true labels; labels missing from the palette are gray.
point_colors = [palette.get(level, 'gray') for level in y]
pc_x, pc_y, pc_z = X_pca3[:,0], X_pca3[:,1], X_pca3[:,2]

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pc_x, pc_y, pc_z, c=point_colors, s=25, alpha=0.7, edgecolor='k')
ax.set(xlabel=xlabel, ylabel=ylabel, zlabel=zlabel)
# `patches` are the legend handles built for the raw-feature scatter earlier in this cell.
ax.legend(handles=patches, title='Stress_Level')
fig.savefig('plots/knn/pca_3d_true.png', dpi=150)
plt.close(fig)
print('Saved PCA 3D (true) to plots/knn/pca_3d_true.png')

# t-SNE 3D
from sklearn.manifold import TSNE
print('Computing t-SNE 3D (may take a while)')
tsne3 = TSNE(n_components=3, random_state=42, init='pca', learning_rate='auto')
X_tsne3 = tsne3.fit_transform(X_all_s)

# Axis-label heuristic: correlate every scaled input feature with each t-SNE axis.
X_scaled_df = pd.DataFrame(X_all_s, columns=numeric_cols)
corrs_tsne0, corrs_tsne1, corrs_tsne2 = (
    X_scaled_df.corrwith(pd.Series(X_tsne3[:, axis])) for axis in range(3)
)

def top_corrs(corrs, n=3):
    """Format the ``n`` strongest correlations as a single label string.

    Args:
        corrs: pandas Series of correlation values indexed by feature name.
        n: Number of entries to include.

    Returns:
        Comma-separated ``"<feature> (<signed correlation>)"`` entries, ordered
        by decreasing absolute correlation.
    """
    # Rank by magnitude, but print the signed value taken from the original Series.
    strongest = corrs.abs().sort_values(ascending=False).index[:n]
    entries = [f"{name} ({corrs[name]:+.2f})" for name in strongest]
    return ', '.join(entries)

# Axis labels list the features most correlated with each t-SNE axis.
xlabel = f"tSNE-1 — top corr: {top_corrs(corrs_tsne0)}"
ylabel = f"tSNE-2 — top corr: {top_corrs(corrs_tsne1)}"
zlabel = f"tSNE-3 — top corr: {top_corrs(corrs_tsne2)}"

# Per-point colors from the true labels; labels missing from the palette are gray.
point_colors = [palette.get(level, 'gray') for level in y]
ts_x, ts_y, ts_z = X_tsne3[:,0], X_tsne3[:,1], X_tsne3[:,2]

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(ts_x, ts_y, ts_z, c=point_colors, s=25, alpha=0.7, edgecolor='k')
ax.set(xlabel=xlabel, ylabel=ylabel, zlabel=zlabel)
# `patches` are the legend handles built for the raw-feature scatter earlier in this cell.
ax.legend(handles=patches, title='Stress_Level')
fig.savefig('plots/knn/tsne_3d_true.png', dpi=150)
plt.close(fig)
print('Saved t-SNE 3D (true) to plots/knn/tsne_3d_true.png')

# UMAP 3D (if available). Everything runs inside the try, so an import failure or any
# later error is reported by the message below rather than stopping the cell.
try:
    import umap.umap_ as umap
    reducer3 = umap.UMAP(n_components=3, random_state=42)
    X_umap3 = reducer3.fit_transform(X_all_s)

    # Axis-label heuristic: correlate every scaled input feature with each UMAP axis.
    corrs_u0, corrs_u1, corrs_u2 = (
        X_scaled_df.corrwith(pd.Series(X_umap3[:, axis])) for axis in range(3)
    )
    xlabel = f"UMAP-1 — top corr: {top_corrs(corrs_u0)}"
    ylabel = f"UMAP-2 — top corr: {top_corrs(corrs_u1)}"
    zlabel = f"UMAP-3 — top corr: {top_corrs(corrs_u2)}"

    # Per-point colors from the true labels; labels missing from the palette are gray.
    point_colors = [palette.get(level, 'gray') for level in y]
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_umap3[:,0], X_umap3[:,1], X_umap3[:,2], c=point_colors, s=25, alpha=0.7, edgecolor='k')
    ax.set(xlabel=xlabel, ylabel=ylabel, zlabel=zlabel)
    ax.legend(handles=patches, title='Stress_Level')
    fig.savefig('plots/knn/umap_3d_true.png', dpi=150)
    plt.close(fig)
    print('Saved UMAP 3D (true) to plots/knn/umap_3d_true.png')
except Exception as umap_error:
    print('UMAP 3D skipped (umap not available):', umap_error)

Three features used for raw 3D scatter: ['Study_Hours_Per_Day', 'Sleep_Hours_Per_Day', 'Physical_Activity_Hours_Per_Day']
Saved raw 3D features scatter to plots/knn/3d_features_Study_Hours_Per_Day_Sleep_Hours_Per_Day_Physical_Activity_Hours_Per_Day.png
Plotly not available or failed to create interactive 3D plot: No module named 'plotly'
Saved raw 3D features scatter to plots/knn/3d_features_Study_Hours_Per_Day_Sleep_Hours_Per_Day_Physical_Activity_Hours_Per_Day.png
Plotly not available or failed to create interactive 3D plot: No module named 'plotly'
Saved PCA 3D (true) to plots/knn/pca_3d_true.png
Computing t-SNE 3D (may take a while)
Saved PCA 3D (true) to plots/knn/pca_3d_true.png
Computing t-SNE 3D (may take a while)
Saved t-SNE 3D (true) to plots/knn/tsne_3d_true.png
Saved t-SNE 3D (true) to plots/knn/tsne_3d_true.png


  warn(


Saved UMAP 3D (true) to plots/knn/umap_3d_true.png


## Next steps
- Tune features (include categorical encodings).
- Try other classifiers (RandomForest, SVM).
- Install `plotly` so the interactive 3D visualization (attempted above but skipped in this run because the package was missing) is produced for deeper inspection.