In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_multilabel_classification

In [25]:
# generate base data
import random
np.random.seed(42)
n_students= 10

percentage_attendance= np.random.randint(40, 100, n_students) # 40% to 100%
q1_avg_score= np.random.randint(40, 95, n_students)
q2_avg_score= np.random.randint(40, 92, n_students)
q3_avg_score= np.random.randint(40, 97, n_students)
q1_trend = random.choices(['Stable', 'Declining'], k=n_students, weights=[0.55, 0.45])
q2_trend = random.choices(['Stable', 'Declining'], k=n_students, weights=[0.50, 0.50])
q3_trend = random.choices(['Stable', 'Declining'], k=n_students, weights=[0.40, 0.60])
q1_Attempts_Used= np.random.randint(1, 4, n_students)
q2_Attempts_Used= np.random.randint(1, 5, n_students)
q3_Attempts_Used= np.random.randint(1, 6, n_students)
fee_paid= np.random.choice(['Yes', 'No'], n_students, [0.8, 0.2])
fee_due_days= np.where(fee_paid=='Yes', 0, np.random.randint(10, 60,n_students))

# Assign risk label
risk=[]
for att, q1_score,q2_score, q3_score, q1_trend,q2_trend, q3_trend,q1_attempt, q2_attempt, q3_attempt, paid in zip(percentage_attendance, q1_avg_score,q2_avg_score,q3_avg_score,q1_trend,q2_trend,q3_trend,q1_Attempts_Used,q2_Attempts_Used,q3_Attempts_Used, fee_paid ):
  if att < 50 or q1_score < 50 or q2_score<50 or q3_score<50 or q1_trend== 'Declining' or q2_trend== 'Declining' or q3_trend== 'Declining' or q1_attempt>=3 or q2_attempt>=4 or q3_attempt>=5 or paid=='No':
    risk.append(2) # 2 for high level risk
  elif att < 70 or q1_score < 60 or q2_score<60 or q3_score<60 or q1_attempt>=1 or q2_attempt>=1 or q3_attempt>=1:
    risk.append(1) # 1 for medium level risk
  else:
    risk.append(0) # 0 for low level risk

df= pd.DataFrame({
    "Attendance%": percentage_attendance,
    "q1_avg_score": q1_avg_score,
    "q2_avg_score": q2_avg_score,
    "q3_avg_score": q3_avg_score,
    'q1_trend': q1_trend,
    'q2_trend': q2_trend,
    'q3_trend': q3_trend,
    "q1_Attempts_Used": q1_Attempts_Used,
    "q2_Attempts_Used": q2_Attempts_Used,
    "q3_Attempts_Used": q3_Attempts_Used,
    "Fee_Paid": fee_paid,
    "Fee_Due_Days": fee_due_days,
    "Risk_Label":risk
})

df.head(50)



Unnamed: 0,Attendance%,q1_avg_score,q2_avg_score,q3_avg_score,q1_trend,q2_trend,q3_trend,q1_Attempts_Used,q2_Attempts_Used,q3_Attempts_Used,Fee_Paid,Fee_Due_Days,Risk_Label
0,78,62,41,83,Stable,Declining,Stable,3,4,4,No,35,2
1,91,50,63,64,Stable,Declining,Stable,1,1,4,No,53,2
2,68,50,83,88,Stable,Declining,Stable,3,4,4,No,43,2
3,54,63,69,66,Stable,Declining,Stable,1,2,4,No,19,2
4,82,92,77,81,Stable,Declining,Stable,3,2,5,No,45,2
5,47,75,41,67,Stable,Declining,Stable,3,2,3,No,23,2
6,60,79,60,55,Stable,Declining,Stable,1,1,1,No,40,2
7,78,63,72,54,Stable,Declining,Stable,1,2,4,Yes,0,2
8,97,42,51,86,Stable,Declining,Stable,3,1,2,Yes,0,2
9,58,61,61,90,Stable,Declining,Stable,2,2,4,No,17,2


In [26]:
y= df.Risk_Label
y.head()

Unnamed: 0,Risk_Label
0,2
1,2
2,2
3,2
4,2


In [27]:
X= df.drop(['Risk_Label'], axis=1)
X.head()

Unnamed: 0,Attendance%,q1_avg_score,q2_avg_score,q3_avg_score,q1_trend,q2_trend,q3_trend,q1_Attempts_Used,q2_Attempts_Used,q3_Attempts_Used,Fee_Paid,Fee_Due_Days
0,78,62,41,83,Stable,Declining,Stable,3,4,4,No,35
1,91,50,63,64,Stable,Declining,Stable,1,1,4,No,53
2,68,50,83,88,Stable,Declining,Stable,3,4,4,No,43
3,54,63,69,66,Stable,Declining,Stable,1,2,4,No,19
4,82,92,77,81,Stable,Declining,Stable,3,2,5,No,45


In [28]:
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()

X['q1_trend']= le.fit_transform(df['q1_trend']) ## 1 -> stable, 0 -> Declining
X['q2_trend']= le.fit_transform(df['q2_trend']) ## 1 -> stable, 0 -> Declining
X['q3_trend']= le.fit_transform(df['q3_trend']) ## 1 -> stable, 0 -> Declining
X['Fee_Paid']= le.fit_transform(df['Fee_Paid']) ## 1 --> No, 0 --> yes
X.head()

Unnamed: 0,Attendance%,q1_avg_score,q2_avg_score,q3_avg_score,q1_trend,q2_trend,q3_trend,q1_Attempts_Used,q2_Attempts_Used,q3_Attempts_Used,Fee_Paid,Fee_Due_Days
0,78,62,41,83,0,0,0,3,4,4,0,35
1,91,50,63,64,0,0,0,1,1,4,0,53
2,68,50,83,88,0,0,0,3,4,4,0,43
3,54,63,69,66,0,0,0,1,2,4,0,19
4,82,92,77,81,0,0,0,3,2,5,0,45


In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3)

In [30]:
from sklearn.ensemble import RandomForestClassifier

model= RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [32]:
model.score(X_test, y_test)

1.0

In [33]:
model.predict([X.iloc[0, :].values])



array([2])

In [34]:
import joblib

joblib.dump(model, 'Student_risk_model.pkl')

['Student_risk_model.pkl']

In [35]:
from google.colab import files
files.download('Student_risk_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>