In [1]:
import pandas as pd
import sqlite3

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib
from sklearn.preprocessing import StandardScaler


In [3]:
from google.colab import drive
drive.mount('/content/drive')

DB_PATH = "/content/drive/MyDrive/ML_Project/university.db"
# If using VS Code locally, use:
# DB_PATH = "university.db"

conn = sqlite3.connect(DB_PATH)

df = pd.read_sql("SELECT * FROM students", conn)

df.head()

Mounted at /content/drive


Unnamed: 0,attendance,internal_score,assignment_completion,study_hours,previous_gpa,participation,missed_submissions,risk_level,created_at
0,70.29,83.9,85.4,23.2,4.56,Low,2,Medium,2026-02-08 04:50:10.988246
1,95.55,64.3,83.64,9.2,5.45,High,2,Low,2026-02-08 04:50:10.988246
2,61.56,76.93,84.79,16.6,9.11,Low,3,Low,2026-02-08 04:50:10.988246
3,79.34,79.9,100.0,16.1,10.0,High,4,Low,2026-02-08 04:50:10.988246
4,94.1,63.9,82.62,14.8,5.9,Low,1,Low,2026-02-08 04:50:10.988246


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   attendance             250 non-null    float64
 1   internal_score         250 non-null    float64
 2   assignment_completion  250 non-null    float64
 3   study_hours            250 non-null    float64
 4   previous_gpa           250 non-null    float64
 5   participation          250 non-null    object 
 6   missed_submissions     250 non-null    int64  
 7   risk_level             250 non-null    object 
 8   created_at             250 non-null    object 
dtypes: float64(5), int64(1), object(3)
memory usage: 17.7+ KB


In [5]:
df.isnull().sum()


Unnamed: 0,0
attendance,0
internal_score,0
assignment_completion,0
study_hours,0
previous_gpa,0
participation,0
missed_submissions,0
risk_level,0
created_at,0


In [6]:
df = df.drop(columns=["created_at"])


In [7]:
le_participation = LabelEncoder()
df["participation"] = le_participation.fit_transform(df["participation"])

le_risk = LabelEncoder()
df["risk_level"] = le_risk.fit_transform(df["risk_level"])


In [8]:
X = df.drop("risk_level", axis=1)
y = df["risk_level"]


In [9]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    random_state=42
)


In [10]:
lr = LogisticRegression(max_iter=5000)

lr.fit(X_train, y_train)

pred_lr = lr.predict(X_test)

lr_acc = accuracy_score(y_test, pred_lr)

print("Logistic Regression Accuracy:", lr_acc)


Logistic Regression Accuracy: 1.0


In [11]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

pred_dt = dt.predict(X_test)

dt_acc = accuracy_score(y_test, pred_dt)

print("Decision Tree Accuracy:", dt_acc)


Decision Tree Accuracy: 0.88


In [12]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

pred_rf = rf.predict(X_test)

rf_acc = accuracy_score(y_test, pred_rf)

print("Random Forest Accuracy:", rf_acc)


Random Forest Accuracy: 0.94


In [13]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Decision Tree", "Random Forest"],
    "Accuracy": [lr_acc, dt_acc, rf_acc]
})

results.sort_values(by="Accuracy", ascending=False)


Unnamed: 0,Model,Accuracy
0,Logistic Regression,1.0
2,Random Forest,0.94
1,Decision Tree,0.88


In [14]:
import pandas as pd
import sqlite3
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

DB_PATH = "/content/drive/MyDrive/ML_Project/university.db"
MODEL_PATH = "risk_model.pkl"

# Loading latest data
conn = sqlite3.connect(DB_PATH)
df = pd.read_sql("SELECT * FROM students", conn)

# Droping timestamp
df = df.drop(columns=["created_at"])

# Encoding categorical columns
le_participation = LabelEncoder()
df["participation"] = le_participation.fit_transform(df["participation"])

le_risk = LabelEncoder()
df["risk_level"] = le_risk.fit_transform(df["risk_level"])

# Features & Target
X = df.drop("risk_level", axis=1)
y = df["risk_level"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Training model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluating
predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

print("Model Accuracy:", accuracy)
print(classification_report(y_test, predictions))

# Saving model
joblib.dump(model, MODEL_PATH)

print("✅ Model trained and saved successfully!")

Model Accuracy: 0.94
              precision    recall  f1-score   support

           1       0.97      0.94      0.95        33
           2       0.89      0.94      0.91        17

    accuracy                           0.94        50
   macro avg       0.93      0.94      0.93        50
weighted avg       0.94      0.94      0.94        50

✅ Model trained and saved successfully!


In [15]:
import pandas as pd

importance = model.feature_importances_

features = X.columns

feat_imp = pd.DataFrame({
    "Feature": features,
    "Importance": importance
}).sort_values(by="Importance", ascending=False)

print(feat_imp)


                 Feature  Importance
4           previous_gpa    0.515748
0             attendance    0.155965
1         internal_score    0.122614
2  assignment_completion    0.078288
3            study_hours    0.064792
6     missed_submissions    0.042648
5          participation    0.019944


In [16]:
import pandas as pd
import sqlite3
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

DB_PATH = "/content/drive/MyDrive/ML_Project/university.db"
MODEL_PATH = "/content/drive/MyDrive/ML_Project/risk_model.pkl"
ENCODER_PATH = "/content/drive/MyDrive/ML_Project/participation_encoder.pkl"


print("Starting model retraining...")

# Loading Latest Data

conn = sqlite3.connect(DB_PATH)
df = pd.read_sql("SELECT * FROM students", conn)

print(f"Training on {len(df)} records.")

# Droping timestamp
df = df.drop(columns=["created_at"])

# Encode Categorical Features

le_participation = LabelEncoder()
df["participation"] = le_participation.fit_transform(df["participation"])

le_risk = LabelEncoder()
df["risk_level"] = le_risk.fit_transform(df["risk_level"])

# Saving encoder for prediction consistency
joblib.dump(le_participation, ENCODER_PATH)

# Features & Target

X = df.drop("risk_level", axis=1)
y = df["risk_level"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train NEW Model

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

model.fit(X_train, y_train)

# Evaluating

predictions = model.predict(X_test)

accuracy = accuracy_score(y_test, predictions)

print(f"New Model Accuracy: {accuracy:.2f}")

# Replacing Old Model

joblib.dump(model, MODEL_PATH)

print("Model retrained and replaced successfully!")

Starting model retraining...
Training on 250 records.
New Model Accuracy: 0.92
Model retrained and replaced successfully!


In [39]:
!ls /content/drive/MyDrive/ML_Project


participation_encoder.pkl  risk_model.pkl  university.db


In [37]:
import joblib
import pandas as pd

MODEL_PATH = "risk_model.pkl"
ENCODER_PATH = "participation_encoder.pkl"

# Load model and encoder
model = joblib.load(MODEL_PATH)
encoder = joblib.load(ENCODER_PATH)

print("Model loaded successfully!")

# Example new student
new_student = pd.DataFrame([{
    "attendance": 68,
    "internal_score": 72,
    "assignment_completion": 75,
    "study_hours": 12,
    "previous_gpa": 6.8,
    "participation": "Medium",
    "missed_submissions": 2
}])

# Encode participation
new_student["participation"] = encoder.transform(
    new_student["participation"]
)

# Predict
prediction = model.predict(new_student)[0]
probability = model.predict_proba(new_student).max()

risk_map = {
    0:"High Risk",
    1:"Low Risk",
    2:"Medium Risk"
}

print("\nPrediction Complete!")
print("Predicted Risk Level:", risk_map[prediction])
print(f"Confidence: {probability*100:.2f}%")


Model loaded successfully!

Prediction Complete!
Predicted Risk Level: Low Risk
Confidence: 83.00%
