In [50]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_multilabel_classification

In [64]:
# generate base data
np.random.seed(42)
n_students= 400

attendance= np.random.randint(40, 100, n_students) # 40% to 100%
avg_score= np.random.randint(40, 95, n_students)
trend= np.random.choice(['Stable', 'Declining'], n_students, p=[0.7, 0.3])
failed_subjects= np.random.randint(0, 4, n_students)
fee_paid= np.random.choice(['Yes', 'No'], n_students, [0.8, 0.2])
fee_due_days= np.where(fee_paid=='Yes', 0, np.random.randint(10, 60,n_students))

# Assign risk label
risk=[]
for att, score, t, fail, paid in zip(attendance, avg_score, trend,failed_subjects, fee_paid ):
  if att < 50 or score < 50 or t== 'Declining' or fail>=3 or paid=='No':
    risk.append(2) # 2 for high level risk
  elif att <70 or score < 60 or fail >= 1:
    risk.append(1) # 1 for medium level risk
  else:
    risk.append(0) # 0 for low level risk

df= pd.DataFrame({
    "Attendance%": attendance,
    "Avg_score": avg_score,
    'AvgScore_Trend': trend,
    "Failed_Subjects": failed_subjects,
    "Fee_Paid": fee_paid,
    "Fee_Due_Days": fee_due_days,
    "Risk_Label":risk
})

df.head(10)



Unnamed: 0,Attendance%,Avg_score,AvgScore_Trend,Failed_Subjects,Fee_Paid,Fee_Due_Days,Risk_Label
0,78,46,Stable,0,No,47,2
1,91,42,Stable,0,Yes,0,2
2,68,56,Stable,1,Yes,0,1
3,54,72,Stable,3,No,11,2
4,82,87,Stable,3,Yes,0,2
5,47,51,Stable,2,No,17,2
6,60,90,Declining,1,Yes,0,2
7,78,61,Stable,0,No,22,2
8,97,94,Declining,1,No,26,2
9,58,61,Stable,2,No,17,2


In [65]:
y= df.Risk_Label
y.head()

Unnamed: 0,Risk_Label
0,2
1,2
2,1
3,2
4,2


In [97]:
X= df.drop(['Risk_Label'], axis=1)
X.head()

Unnamed: 0,Attendance%,Avg_score,AvgScore_Trend,Failed_Subjects,Fee_Paid,Fee_Due_Days
0,78,46,Stable,0,No,47
1,91,42,Stable,0,Yes,0
2,68,56,Stable,1,Yes,0
3,54,72,Stable,3,No,11
4,82,87,Stable,3,Yes,0


In [100]:
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()

X['AvgScore_Trend']= le.fit_transform(df['AvgScore_Trend']) ## 1 -> stable, 0 -> Declining
X['Fee_Paid']= le.fit_transform(df['Fee_Paid']) ## 1 --> No, 0 --> yes
X.head()

Unnamed: 0,Attendance%,Avg_score,AvgScore_Trend,Failed_Subjects,Fee_Paid,Fee_Due_Days
0,78,46,1,0,0,47
1,91,42,1,0,1,0
2,68,56,1,1,1,0
3,54,72,1,3,0,11
4,82,87,1,3,1,0


In [101]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3)

In [102]:
from sklearn.ensemble import RandomForestClassifier

model= RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [103]:
model.score(X_test, y_test)

0.9833333333333333

In [117]:
model.predict([X.iloc[0, :].values])



array([2])

In [121]:
import joblib

joblib.dump(model, 'Student_risk_model.pkl')

['Student_risk_model.pkl']

In [122]:
from google.colab import files
files.download('Student_risk_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>