In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load dataset
file_path = 'updated_student_data.csv'  # Replace with your actual file path
data = pd.read_csv(file_path)

# Display dataset info
print("\nDataset Info:")
data.info()

# Define categorical and numerical columns
categorical_cols = ["PreviousCoursework", "Gender", "Ethnicity", "ParentalEducation",
                    "FinancialAid", "ExtracurricularActivities", "WorkExperience",
                    "InstitutionType", "ProgramOfStudy", "SupportServices", "Engagement"]

numerical_cols = ["HighSchoolGPA", "SATScore", "ACTScore", "Age", "HouseholdIncome", "Attendance"]

# Encode categorical columns
encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col])

# Handle non-numeric values in numerical columns
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')
    data[col].fillna(data[col].mean(), inplace=True)

# Drop rows with missing values in numerical columns
data.dropna(subset=numerical_cols, inplace=True)

# Normalize numerical columns
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Define features and target
X = data.drop(["StudentID", "Enrollment"], axis=1)
y = data["Enrollment"]

# Handle missing target values
non_missing_indices = y.notna()
X = X[non_missing_indices]
y = y[non_missing_indices]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
print("\nModel Training Accuracy:", model.score(X_train, y_train))
print("Model Testing Accuracy:", model.score(X_test, y_test))

# Generate predictions and evaluation metrics
y_pred = model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Analyze students predicted as not enrolled
not_enrolled = X_test[y_pred == 0]
print("\nStudents Predicted as Not Enrolled (Need Support):")
print(not_enrolled)

# Save the trained model
joblib.dump(model, 'student_enrollment_model.pkl')
print("\nModel saved as 'student_enrollment_model.pkl'")



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  104 non-null    int64  
 1   HighSchoolGPA              104 non-null    float64
 2   SATScore                   104 non-null    int64  
 3   ACTScore                   104 non-null    int64  
 4   PreviousCoursework         104 non-null    object 
 5   Age                        104 non-null    int64  
 6   Gender                     104 non-null    object 
 7   Ethnicity                  104 non-null    object 
 8   ParentalEducation          104 non-null    object 
 9   HouseholdIncome            103 non-null    object 
 10  FinancialAid               102 non-null    object 
 11  ExtracurricularActivities  102 non-null    object 
 12  WorkExperience             102 non-null    object 
 13  InstitutionType            102 non-

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
