In [1]:
import pandas as pd
import numpy as np

# University study programmes, as extracted from the PDF document.
# generate_dataset samples the 'StudyProgram' column from this list.
programs = [
    "Geography and Environmental Science",
    "Statistics and Operations Research",
    "Computer Sciences",
    "Soil and Plant Sciences",
    "Livestock, Wildlife and Fisheries",
    "Development Studies",
    "English & Applied Communication",
    "History",
    "Media and Cultural Studies",
    "Religious Studies & Philosophy",
    "Religious Studies",
    "XiChangana",
    "TshiVenda",
    "Archaeology, Museums and Heritage Studies",
    "Musicology and Ethnochoreology",
    "Accounting",
    "Banking and Finance",
    "Business Management",
    "Economics",
    "Economics and Finance",
    "Finance",
    "Fiscal Studies",
    "Hospitality Tourism and Culture",
    "Information Systems",
    "Internal Auditing",
    "Marketing Management",
    "Office Management",
    "Risk Management and Insurance",
    "Transport and Logistics",
    "Gender and Social Anthropology",
    "Human Resources Management",
    "Local Governance Studies",
    "Peace, Conflict and Governance",
    "Politics and Public Administration",
    "Psychology",
    "Sociology",
    "Urban Planning and Development",
]

def assign_churn(row):
    """Randomly label one student record as churned or not, weighted by risk tier.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'UniversityGPA', 'TuitionStatus', 'CourseFailures', 'FinancialAid',
            'Scholarship', 'DistanceFromHome', 'PartTimeFullTime',
            'AcademicProbation' and 'EmploymentStatus'.

    Returns:
        The value drawn by ``np.random.choice([True, False], p=...)``: True
        (churn) with probability 0.8 for high-risk rows, 0.5 for medium-risk
        rows and 0.1 otherwise.
    """
    gpa = row['UniversityGPA']

    # High risk: failing with unpaid tuition, OR repeated course failures with
    # no financial support at all, OR a part-time student living far away.
    is_high_risk = (
        (gpa < 2.0 and row['TuitionStatus'] == 'Unpaid')
        or (row['CourseFailures'] >= 3
            and not (row['FinancialAid'] or row['Scholarship']))
        or (row['DistanceFromHome'] > 300
            and row['PartTimeFullTime'] == 'Part-time')
    )

    # Medium risk: borderline GPA, on academic probation, or holding a job.
    is_medium_risk = (
        (2.0 <= gpa < 2.5)
        or row['AcademicProbation']
        or row['EmploymentStatus'] == 'Employed'
    )

    # High risk takes precedence over medium risk; weights are [P(True), P(False)].
    if is_high_risk:
        weights = [0.8, 0.2]
    elif is_medium_risk:
        weights = [0.5, 0.5]
    else:
        weights = [0.1, 0.9]

    return np.random.choice([True, False], p=weights)

def generate_dataset(num_records, seed=None):
    """Generates a simulated student dataset with a randomly assigned churn label.

    Every feature column is drawn independently from NumPy's global random
    state; the 'Churn' column is then filled row by row with ``assign_churn``.

    Args:
        num_records: Number of student rows to generate.
        seed: Optional seed for NumPy's global random state. When given, the
            global state is reseeded before the first draw, so the feature
            columns are identical for the same seed (and so is 'Churn', as long
            as ``assign_churn`` draws from the same global state). Note this
            reseeds the process-wide ``np.random`` state. When None (default)
            the current global state is used as-is, matching the previous
            behaviour.

    Returns:
        pandas.DataFrame with one row per student: the generated feature
        columns plus a 'Churn' column.
    """
    if seed is not None:
        np.random.seed(seed)

    # NOTE: the order of the entries below is the order in which random numbers
    # are consumed; reordering them changes the data produced for a given seed.
    data = {
        'StudentID': range(1, num_records + 1),
        # randint's upper bound is exclusive: ages are 18..24
        'Age': np.random.randint(18, 25, size=num_records),
        'Gender': np.random.choice(['F', 'M', 'Other'], size=num_records),
        'MaritalStatus': np.random.choice(['Single', 'Married', 'Divorced'], size=num_records),
        'HighSchoolGPA': np.random.uniform(80, 100, size=num_records),
        'EntranceScore': np.random.uniform(800, 900, size=num_records),
        'StudyProgram': np.random.choice(programs, size=num_records),
        'YearOfStudy': np.random.choice([1, 2, 3, 4], size=num_records),
        'Modeofstudy': np.random.choice(['Block', 'Conventional'], size=num_records),
        'PartTimeFullTime': np.random.choice(['Full-time', 'Part-time'], size=num_records),
        'Scholarship': np.random.choice([True, False], size=num_records),
        'FinancialAid': np.random.choice([True, False], size=num_records),
        'TuitionStatus': np.random.choice(['Paid', 'Unpaid', 'Exempt'], size=num_records),
        'UniversityGPA': np.random.uniform(0, 4, size=num_records),
        # exclusive upper bounds: 0..9 failures, 0..49 library uses
        'CourseFailures': np.random.randint(0, 10, size=num_records),
        'AcademicProbation': np.random.choice([True, False], size=num_records),
        'LibraryUses': np.random.randint(0, 50, size=num_records),
        'DistanceFromHome': np.random.uniform(0, 500, size=num_records),
        'EmploymentStatus': np.random.choice(['Employed', 'Unemployed', 'Student'], size=num_records)
    }

    df = pd.DataFrame(data)
    # The label depends on the generated features, so it is derived per row
    # after the frame has been built.
    df['Churn'] = df.apply(assign_churn, axis=1)
    return df

# Seed NumPy's global random state so the simulated dataset -- and every
# result derived from it in later cells -- is the same on each fresh run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Generate the dataset
num_records = 1000  # You can specify the number of records here
dataset = generate_dataset(num_records)

# Display the first few rows of the dataset
print(dataset.head())

# Save the dataset to a CSV file; the later cells load it back from this path
dataset.to_csv("gzu_churn_student_data.csv", index=False)


   StudentID  Age Gender MaritalStatus  HighSchoolGPA  EntranceScore  \
0          1   24      F        Single      97.602854     829.177362   
1          2   23      F      Divorced      96.716063     870.813290   
2          3   21  Other      Divorced      80.209021     843.809602   
3          4   20      F        Single      90.363817     878.093012   
4          5   21  Other      Divorced      89.026974     813.460615   

              StudyProgram  YearOfStudy   Modeofstudy PartTimeFullTime  \
0                Economics            1  Conventional        Full-time   
1     Marketing Management            4         Block        Full-time   
2  Transport and Logistics            3         Block        Full-time   
3  Transport and Logistics            3  Conventional        Part-time   
4      Business Management            2         Block        Full-time   

   Scholarship  FinancialAid TuitionStatus  UniversityGPA  CourseFailures  \
0         True          True          Paid   

In [2]:
import pandas as pd

# Read the generated student records back from the CSV file
file_path = 'gzu_churn_student_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the column labels of the loaded frame
print(df.columns)


Index(['StudentID', 'Age', 'Gender', 'MaritalStatus', 'HighSchoolGPA',
       'EntranceScore', 'StudyProgram', 'YearOfStudy', 'Modeofstudy',
       'PartTimeFullTime', 'Scholarship', 'FinancialAid', 'TuitionStatus',
       'UniversityGPA', 'CourseFailures', 'AcademicProbation', 'LibraryUses',
       'DistanceFromHome', 'EmploymentStatus', 'Churn'],
      dtype='object')


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

# Read the student records from the CSV file
file_path = 'gzu_churn_student_data.csv'
df = pd.read_csv(file_path)

# Normalise column names: lowercase, with spaces replaced by underscores
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

# Text-valued columns that must be integer-encoded before modelling
categorical_columns = [
    'gender',
    'maritalstatus',
    'studyprogram',
    'modeofstudy',
    'parttimefulltime',
    'tuitionstatus',
    'employmentstatus',
]

# One encoder per categorical column, fitted on the full frame (before the
# split) so every category value present in the data has a code. The fitted
# encoders are kept so they can be exported alongside the model.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col].astype(str))

# Target is the churn flag; features are everything except it and the ID
y = df['churn']
X = df.drop(columns=['studentid', 'churn'])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features; scaling statistics come from the training rows only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the classifier on the scaled training data
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Persist the model together with the fitted scaler and label encoders
model_file = 'student_churn_model.pkl'
scaler_file = 'scaler.pkl'
label_encoders_file = 'label_encoders.pkl'

for artifact_path, artifact in (
    (model_file, model),
    (scaler_file, scaler),
    (label_encoders_file, label_encoders),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"Model, scaler, and label encoders exported as '{model_file}', '{scaler_file}', and '{label_encoders_file}' respectively.")


Accuracy: 0.69
Confusion Matrix:
 [[41 41]
 [21 97]]
Classification Report:
               precision    recall  f1-score   support

       False       0.66      0.50      0.57        82
        True       0.70      0.82      0.76       118

    accuracy                           0.69       200
   macro avg       0.68      0.66      0.66       200
weighted avg       0.69      0.69      0.68       200

Model, scaler, and label encoders exported as 'student_churn_model.pkl', 'scaler.pkl', and 'label_encoders.pkl' respectively.
