<a href="https://colab.research.google.com/github/Okelo123/AI-model-student/blob/main/Student_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import random

# Seed for reproducibility
np.random.seed(42)

# Number of records
num_records = 1000


sample_names = [
    "Juma Brian", "Agnes Waweru", "Michael Johnson", "Emily Davis", "Chris Lagat",
    "Sarah Jackson", "David Ombuna", "Laura Kamau", "James Taylor", "Linda Atieno",
    "Robert Carlos", "Susan Thomas", "Daniel Jackson", "Nancy White", "Mark Harris",
    "Patricia Ndei", "Paul Lewis", "Barbara Carnilar", "George Tom", "Hillary Sang"
]

data = {
    "student_id": [f"S{str(i).zfill(4)}" for i in range(1, num_records + 1)],
    "student_name": [random.choice(sample_names) for _ in range(num_records)],
    "program_type": np.random.choice(["Science", "Arts", "Commerce"], size=num_records),
    "historical_enrollment": np.random.choice([0, 1], size=num_records),
    "gpa": np.round(np.random.uniform(2.0, 4.0, size=num_records), 2),
    "attendance_rate": np.round(np.random.uniform(50, 100, size=num_records), 1),
    "family_income": np.random.randint(20000, 100000, size=num_records),
    "age": np.random.randint(18, 30, size=num_records),
    "gender": np.random.choice(["Male", "Female"], size=num_records),
    "enrollment_status": np.random.choice([0, 1], size=num_records),
    "support_need": np.random.choice([0, 1], size=num_records)
}

df = pd.DataFrame(data)

df.to_csv("synthetic_student_data.csv", index=False)


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("synthetic_student_data.csv")

label_encoders = {}
for column in ["program_type", "gender"]:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

X = df.drop(columns=["enrollment_status", "support_need", "student_id", "student_name"])
y_enrollment = df["enrollment_status"]
y_support = df["support_need"]

X_train_enroll, X_test_enroll, y_train_enroll, y_test_enroll = train_test_split(X, y_enrollment, test_size=0.2, random_state=42)
X_train_support, X_test_support, y_train_support, y_test_support = train_test_split(X, y_support, test_size=0.2, random_state=42)


In [8]:

model_enroll = RandomForestClassifier(random_state=42)
model_support = RandomForestClassifier(random_state=42)

model_enroll.fit(X_train_enroll, y_train_enroll)
model_support.fit(X_train_support, y_train_support)


In [9]:

y_pred_enroll = model_enroll.predict(X_test_enroll)
enroll_accuracy = accuracy_score(y_test_enroll, y_pred_enroll)
print("Enrollment Prediction Accuracy:", enroll_accuracy)
print("Enrollment Prediction Report:\n", classification_report(y_test_enroll, y_pred_enroll))

y_pred_support = model_support.predict(X_test_support)
support_accuracy = accuracy_score(y_test_support, y_pred_support)
print("Support Need Prediction Accuracy:", support_accuracy)
print("Support Need Prediction Report:\n", classification_report(y_test_support, y_pred_support))


Enrollment Prediction Accuracy: 0.47
Enrollment Prediction Report:
               precision    recall  f1-score   support

           0       0.49      0.48      0.49       104
           1       0.45      0.46      0.45        96

    accuracy                           0.47       200
   macro avg       0.47      0.47      0.47       200
weighted avg       0.47      0.47      0.47       200

Support Need Prediction Accuracy: 0.445
Support Need Prediction Report:
               precision    recall  f1-score   support

           0       0.46      0.48      0.47       102
           1       0.43      0.41      0.42        98

    accuracy                           0.45       200
   macro avg       0.44      0.44      0.44       200
weighted avg       0.44      0.45      0.44       200



In [10]:

df['predicted_enrollment'] = model_enroll.predict(X)
df['predicted_support_need'] = model_support.predict(X)

students_needing_support = df[(df['predicted_enrollment'] == 1) & (df['predicted_support_need'] == 1)]
print("Students Likely to Enroll and Need Support:\n", students_needing_support[["student_id", "student_name", "gpa", "attendance_rate", "family_income", "age", "gender"]])


Students Likely to Enroll and Need Support:
     student_id    student_name   gpa  attendance_rate  family_income  age  \
3        S0004    Linda Atieno  3.08             91.9          45500   28   
7        S0008      Paul Lewis  3.08             75.6          51174   21   
10       S0011    David Ombuna  3.27             51.7          67406   22   
24       S0025    Susan Thomas  3.95             73.0          91832   21   
36       S0037  Daniel Jackson  2.55             64.6          76358   21   
..         ...             ...   ...              ...            ...  ...   
972      S0973  Daniel Jackson  2.94             63.4          45553   28   
973      S0974      Juma Brian  3.64             68.4          80410   27   
978      S0979    Agnes Waweru  2.67             81.5          93914   28   
982      S0983     Laura Kamau  2.20             72.1          99211   22   
996      S0997     Laura Kamau  3.35             88.2          99943   25   

     gender  
3         1  
7 