In [1]:
import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np  # Import numpy for numerical operations
from sklearn.model_selection import train_test_split  # Import train_test_split for splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler, LabelEncoder  # Import StandardScaler for feature scaling and LabelEncoder for encoding target labels
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression for logistic regression model
from sklearn.ensemble import RandomForestClassifier, StackingClassifier  # Import RandomForestClassifier for random forest model and StackingClassifier for model stacking
from sklearn.svm import SVC  # Import SVC for support vector classifier
from xgboost import XGBClassifier  # Import XGBClassifier for XGBoost model
from lightgbm import LGBMClassifier  # Import LGBMClassifier for LightGBM model
from catboost import CatBoostClassifier  # Import CatBoostClassifier for CatBoost model
from sklearn import metrics  # Import metrics from sklearn for model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score  # Import specific metrics for model evaluation
import warnings  # Import warnings to manage warnings
import category_encoders as ce  # Import category_encoders for encoding categorical features
import optuna
from sklearn.preprocessing import LabelEncoder
# Ignore warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

In [3]:
# Define the path to the CSV file containing the data
path = r'C:\Users\User\Desktop\Rashad\DATA\Classification with an Academic Success Dataset\train.csv'

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(path)

# Set option to display all columns
pd.set_option('display.max_columns', None)

# Display the DataFrame to view the loaded data
data

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,Graduate
76514,76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,Graduate
76515,76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,Enrolled
76516,76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,Dropout


In [4]:
df = data.copy()

In [5]:
df['Target'].value_counts()

Target
Graduate    36282
Dropout     25296
Enrolled    14940
Name: count, dtype: int64

In [7]:
label_encoder = LabelEncoder()

In [8]:
df['Target'] = label_encoder.fit_transform(df['Target'])

In [14]:
x_cat = df.drop(columns=['Target'])
y_cat = df['Target']
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(x_cat, y_cat, test_size=0.3, random_state=42)

In [15]:
catboost_model_def = CatBoostClassifier(loss_function='MultiClass', eval_metric='MultiClass')

In [16]:
df

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,19,5,5,122.6,0,0,0,1,0,1,18,0,0,6,6,6,14.500000,0,0,6,7,6,12.428571,0,11.1,0.6,2.02,2
1,1,1,17,1,9238,1,1,125.0,1,19,19,9,9,119.8,1,0,0,1,0,0,18,0,0,6,8,4,11.600000,0,0,6,9,0,0.000000,0,11.1,0.6,2.02,0
2,2,1,17,2,9254,1,1,137.0,1,3,19,2,3,144.7,0,0,0,1,1,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,16.2,0.3,-0.92,0
3,3,1,1,3,9500,1,1,131.0,1,19,3,3,2,126.1,1,0,0,1,0,1,18,0,0,7,9,7,12.591250,0,0,8,11,7,12.820000,0,11.1,0.6,2.02,1
4,4,1,1,2,9500,1,1,132.0,1,19,37,4,9,120.1,1,0,0,1,0,0,18,0,0,7,12,6,12.933333,0,0,7,12,6,12.933333,0,7.6,2.6,0.32,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,76513,1,17,1,9254,1,1,121.0,1,19,1,7,5,116.5,1,0,0,1,0,1,18,0,0,6,9,6,10.666667,0,0,6,8,5,10.600000,0,13.9,-0.3,0.79,2
76514,76514,1,1,6,9254,1,1,125.0,1,1,38,4,9,131.6,1,0,0,1,0,0,19,0,0,6,22,4,13.000000,0,0,6,9,6,13.875000,0,9.4,-0.8,-3.12,2
76515,76515,5,17,1,9085,1,1,138.0,1,37,37,9,10,123.3,1,0,0,1,0,0,19,0,0,5,13,4,12.500000,2,0,5,8,5,11.400000,1,9.4,-0.8,-3.12,1
76516,76516,1,1,3,9070,1,1,136.0,1,38,37,5,9,124.8,1,0,0,1,0,0,18,0,0,6,0,0,0.000000,0,0,6,0,0,0.000000,0,7.6,2.6,0.32,0


In [17]:
catboost_model_def.fit(X_train_cat,y_train_cat)

Learning rate set to 0.096938
0:	learn: 0.9953531	total: 38.4ms	remaining: 38.4s
1:	learn: 0.9158340	total: 77.2ms	remaining: 38.5s
2:	learn: 0.8534611	total: 117ms	remaining: 38.8s
3:	learn: 0.8024053	total: 156ms	remaining: 38.8s
4:	learn: 0.7603761	total: 193ms	remaining: 38.4s
5:	learn: 0.7251388	total: 242ms	remaining: 40s
6:	learn: 0.6957691	total: 279ms	remaining: 39.6s
7:	learn: 0.6690688	total: 318ms	remaining: 39.5s
8:	learn: 0.6464827	total: 359ms	remaining: 39.6s
9:	learn: 0.6268845	total: 396ms	remaining: 39.2s
10:	learn: 0.6091918	total: 434ms	remaining: 39s
11:	learn: 0.5936450	total: 479ms	remaining: 39.4s
12:	learn: 0.5807264	total: 519ms	remaining: 39.4s
13:	learn: 0.5690738	total: 554ms	remaining: 39s
14:	learn: 0.5591110	total: 591ms	remaining: 38.8s
15:	learn: 0.5501663	total: 629ms	remaining: 38.7s
16:	learn: 0.5422479	total: 662ms	remaining: 38.3s
17:	learn: 0.5352654	total: 705ms	remaining: 38.5s
18:	learn: 0.5286374	total: 744ms	remaining: 38.4s
19:	learn: 0.52

<catboost.core.CatBoostClassifier at 0x1e5db510450>

In [24]:
# Make predictions
y_pred_cat = catboost_model_def.predict(X_test_cat)

In [49]:
# Ensure the arrays are 1-dimensional
y_test_cat = np.ravel(y_test_cat)
y_pred_cat = np.ravel(y_pred_cat)

# Create a DataFrame to compare actual and predicted values
comparison_df = pd.DataFrame({'Actual': y_test_cat, 'Predicted': y_pred_cat})

   Actual  Predicted
0       2          2
1       0          0
2       1          2
3       0          0
4       1          2


In [50]:
comparison_df.head(30)

Unnamed: 0,Actual,Predicted
0,2,2
1,0,0
2,1,2
3,0,0
4,1,2
5,0,0
6,2,2
7,1,1
8,2,2
9,1,1


In [51]:
dep_path = r'C:\Users\User\Desktop\Rashad\DATA\Classification with an Academic Success Dataset\test.csv'  # File path to the CSV file containing the dataset

# Load dataset from Excel file
dep_data = pd.read_csv(dep_path)  # Load the dataset into a pandas DataFrame

# Set option to display all columns
pd.set_option('display.max_columns', None)  # Set pandas option to display all columns of the DataFrame

# Display the loaded dataset
dep_data  # Print or display the DataFrame

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Admission grade,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,76518,1,1,1,9500,1,1,141.0,1,3,1,2,3,152.1,0,0,0,0,0,0,18,0,0,7,0,0,0.000000,0,0,8,0,0,0.000000,0,13.9,-0.3,0.79
1,76519,1,1,1,9238,1,1,128.0,1,1,19,4,5,116.5,0,0,0,1,0,0,19,0,0,6,7,6,14.857143,0,0,6,6,6,13.500000,0,11.1,0.6,2.02
2,76520,1,1,1,9238,1,1,118.0,1,1,19,4,9,114.2,0,0,0,1,0,1,18,0,0,6,11,6,12.000000,0,0,6,11,5,11.000000,0,15.5,2.8,-4.06
3,76521,1,44,1,9147,1,39,130.0,1,1,19,3,3,130.0,0,0,0,1,0,1,23,0,2,6,15,5,11.500000,0,3,8,14,5,11.000000,0,8.9,1.4,3.51
4,76522,1,39,1,9670,1,1,110.0,1,1,37,5,5,106.0,1,0,0,1,0,0,26,0,0,6,9,3,11.000000,0,0,6,9,4,10.666667,2,7.6,2.6,0.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51007,127525,1,1,2,171,1,1,128.0,1,38,37,7,10,124.7,1,0,0,1,0,0,19,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06
51008,127526,2,39,1,9119,1,19,133.1,1,19,37,9,9,140.0,0,0,1,0,1,0,33,0,0,5,6,0,0.000000,0,0,5,5,0,0.000000,0,9.4,-0.8,-3.12
51009,127527,1,1,1,171,1,1,127.0,1,1,1,4,10,120.4,0,0,1,0,0,0,20,0,0,0,0,0,0.000000,0,0,0,0,0,0.000000,0,15.5,2.8,-4.06
51010,127528,1,1,3,9773,1,1,132.0,1,19,19,5,5,126.3,1,0,1,0,0,0,18,0,0,6,8,5,12.600000,0,0,6,9,3,13.000000,0,7.6,2.6,0.32


In [58]:
# Ensure the deployment data has the same features as the training data
deployment_features = dep_data[x_cat.columns]

In [59]:
# Use the trained model to make predictions on deployment data
deployment_predictions = catboost_model_def.predict(deployment_features)

# Convert predictions to DataFrame for better readability
deployment_predictions_df = pd.DataFrame(deployment_predictions, columns=['Predicted'])

In [60]:
deployment_predictions_df.head()

Unnamed: 0,Predicted
0,0
1,2
2,2
3,1
4,1
