In [5]:
import pandas as pd

# Load the dataset
data_path = 'GZU_student_churn_data.csv'
gzu_data = pd.read_csv(data_path)

# Display the first few rows of the dataset and a summary of the data
gzu_data.head(), gzu_data.info(), gzu_data.describe()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          10000 non-null  int64  
 1   Age                10000 non-null  int64  
 2   Gender             10000 non-null  object 
 3   MaritalStatus      10000 non-null  object 
 4   HighSchoolGPA      10000 non-null  float64
 5   EntranceScore      10000 non-null  float64
 6   StudyProgram       10000 non-null  object 
 7   YearOfStudy        10000 non-null  int64  
 8   Modeofstudy        10000 non-null  object 
 9   PartTimeFullTime   10000 non-null  object 
 10  Scholarship        10000 non-null  bool   
 11  FinancialAid       10000 non-null  bool   
 12  TuitionStatus      10000 non-null  object 
 13  UniversityGPA      10000 non-null  float64
 14  CourseFailures     10000 non-null  int64  
 15  AcademicProbation  10000 non-null  bool   
 16  ClubParticipation  1000

(   StudentID  Age Gender MaritalStatus  HighSchoolGPA  EntranceScore  \
 0          1   24      F        Single          89.09         821.52   
 1          2   21      M       Married          79.17        1355.97   
 2          3   28      M        Single          71.50        1427.23   
 3          4   25      M       Married          74.54         814.42   
 4          5   22      M        Single          94.36        1276.02   
 
   StudyProgram  YearOfStudy   Modeofstudy PartTimeFullTime  ...  FinancialAid  \
 0  Engineering            2         Block        Full-time  ...          True   
 1     Sciences            4  Conventional        Part-time  ...          True   
 2  Engineering            2  Conventional        Part-time  ...         False   
 3     Business            5         Block        Part-time  ...         False   
 4     Business            2         Block        Full-time  ...         False   
 
    TuitionStatus UniversityGPA  CourseFailures  AcademicProbation

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Exclude 'Churn' from the one-hot encoding process and ensure it remains in the dataframe
categorical_features = ['Gender', 'MaritalStatus', 'StudyProgram', 'Modeofstudy', 'PartTimeFullTime', 
                        'TuitionStatus', 'ClubParticipation', 'EmploymentStatus']

# Re-apply OneHotEncoding without affecting the 'Churn' column
encoder = ColumnTransformer(transformers=[('cat', OneHotEncoder(), categorical_features)], remainder='passthrough')
X = gzu_data.drop('Churn', axis=1)
y = gzu_data['Churn']
X_encoded = encoder.fit_transform(X)

# Creating a DataFrame from the encoded data
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out())
X_encoded_df['Churn'] = y  # Add 'Churn' column back to the DataFrame

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded_df.drop('Churn', axis=1), y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Output the transformed datasets
X_train_scaled.shape, X_test_scaled.shape


((8000, 33), (2000, 33))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
logistic_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred = logistic_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

accuracy, conf_matrix, class_report


(0.4895,
 array([[363, 618],
        [403, 616]]),
 '              precision    recall  f1-score   support\n\n           0       0.47      0.37      0.42       981\n           1       0.50      0.60      0.55      1019\n\n    accuracy                           0.49      2000\n   macro avg       0.49      0.49      0.48      2000\nweighted avg       0.49      0.49      0.48      2000\n')

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

# Load the dataset
file_path = 'gzu_churn_student_data.csv'
df = pd.read_csv(file_path)

# Preprocess the data
categorical_columns = ['Gender', 'MaritalStatus', 'StudyProgram', 'Modeofstudy', 'PartTimeFullTime', 'TuitionStatus', 'EmploymentStatus']
label_encoders = {}

# Fit LabelEncoders on all unique values for each categorical column
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(df[col].astype(str))  # Ensure all values are strings
    df[col] = le.transform(df[col].astype(str))
    label_encoders[col] = le

# Features and target variable
X = df.drop(columns=['StudentID', 'Churn'])
y = df['Churn']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Export the model, scaler, and label encoders
model_file = 'student_churn_model.pkl'
scaler_file = 'scaler.pkl'
label_encoders_file = 'label_encoders.pkl'

with open(model_file, 'wb') as f:
    pickle.dump(model, f)

with open(scaler_file, 'wb') as f:
    pickle.dump(scaler, f)

with open(label_encoders_file, 'wb') as f:
    pickle.dump(label_encoders, f)

print(f"Model, scaler, and label encoders exported as '{model_file}', '{scaler_file}', and '{label_encoders_file}' respectively.")


Accuracy: 0.565
Confusion Matrix:
 [[37 61]
 [26 76]]
Classification Report:
               precision    recall  f1-score   support

       False       0.59      0.38      0.46        98
        True       0.55      0.75      0.64       102

    accuracy                           0.56       200
   macro avg       0.57      0.56      0.55       200
weighted avg       0.57      0.56      0.55       200

Model, scaler, and label encoders exported as 'student_churn_model.pkl', 'scaler.pkl', and 'label_encoders.pkl' respectively.


NameError: name 'preprocessor' is not defined