# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

In [2]:
train_df = pd.read_csv("/kaggle/input/playground-series-s4e6/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s4e6/test.csv")

# Basic EDA

In [3]:
train_df.head()

Unnamed: 0,id,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,1,1,1,9238,1,1,126.0,1,1,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,1,17,1,9238,1,1,125.0,1,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,2,1,17,2,9254,1,1,137.0,1,3,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,3,1,1,3,9500,1,1,131.0,1,19,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,4,1,1,2,9500,1,1,132.0,1,19,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   id                                              76518 non-null  int64  
 1   Marital status                                  76518 non-null  int64  
 2   Application mode                                76518 non-null  int64  
 3   Application order                               76518 non-null  int64  
 4   Course                                          76518 non-null  int64  
 5   Daytime/evening attendance                      76518 non-null  int64  
 6   Previous qualification                          76518 non-null  int64  
 7   Previous qualification (grade)                  76518 non-null  float64
 8   Nacionality                                     76518 non-null  int64  
 9   Mother's qualification                 

In [5]:
train_df.isna().sum()

id                                                0
Marital status                                    0
Application mode                                  0
Application order                                 0
Course                                            0
Daytime/evening attendance                        0
Previous qualification                            0
Previous qualification (grade)                    0
Nacionality                                       0
Mother's qualification                            0
Father's qualification                            0
Mother's occupation                               0
Father's occupation                               0
Admission grade                                   0
Displaced                                         0
Educational special needs                         0
Debtor                                            0
Tuition fees up to date                           0
Gender                                            0
Scholarship 

In [6]:
X = train_df.drop(columns = ['Target'])
y = train_df['Target']

# Label Encoding

In [7]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# XGB Classifier

In [9]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [10]:
model.fit(X_train, y_train)

# Base Model Evaluation

In [11]:
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
c_report = classification_report(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print(f"Classification Report: {c_report}")

Validation Accuracy: 0.8299137480397282
Classification Report:               precision    recall  f1-score   support

           0       0.89      0.83      0.86      5028
           1       0.65      0.61      0.63      3017
           2       0.86      0.92      0.89      7259

    accuracy                           0.83     15304
   macro avg       0.80      0.79      0.79     15304
weighted avg       0.83      0.83      0.83     15304



# Hyperparameter Tuning

In [12]:
from scipy.stats import uniform, randint

param_dist = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

In [13]:
xg_class_cv = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [14]:
random_search = RandomizedSearchCV(estimator=xg_class_cv, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)

In [15]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [16]:
best_model = random_search.best_estimator_

# Tuned Model Evaluation

In [17]:
y_val_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
c_report = classification_report(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")
print(f"Classification report: {c_report}")

Validation Accuracy: 0.8336382645060115
Classification report:               precision    recall  f1-score   support

           0       0.90      0.83      0.87      5028
           1       0.66      0.62      0.64      3017
           2       0.86      0.93      0.89      7259

    accuracy                           0.83     15304
   macro avg       0.81      0.79      0.80     15304
weighted avg       0.83      0.83      0.83     15304



# Predictions on Test Set

In [18]:
test_predictions_encoded = best_model.predict(test_df)

In [19]:
test_predictions = label_encoder.inverse_transform(test_predictions_encoded)

In [20]:
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Target': test_predictions
})

In [21]:
submission_df.to_csv('submission.csv', index=False)

In [22]:
submission_df

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout


[CV] END colsample_bytree=0.6624074561769746, learning_rate=0.05679835610086079, max_depth=5, n_estimators=137, subsample=0.7334834444556088; total time=   9.4s
[CV] END colsample_bytree=0.6571467271687763, learning_rate=0.20526654188465587, max_depth=7, n_estimators=51, subsample=0.8887995089067299; total time=   4.6s
[CV] END colsample_bytree=0.9754210836063001, learning_rate=0.010233629752304298, max_depth=6, n_estimators=70, subsample=0.8469926038510867; total time=   6.3s
[CV] END colsample_bytree=0.8446612641953124, learning_rate=0.012119891565915222, max_depth=3, n_estimators=98, subsample=0.8099098641033556; total time=   5.3s
[CV] END colsample_bytree=0.7599443886861021, learning_rate=0.023999698964084628, max_depth=6, n_estimators=64, subsample=0.7824279936868144; total time=   5.3s
[CV] END colsample_bytree=0.9439761626945282, learning_rate=0.2140922615763339, max_depth=3, n_estimators=67, subsample=0.9795542149013333; total time=   3.6s
[CV] END colsample_bytree=0.943976162