# Importing data

In [None]:
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset 697 through ucimlrepo
data = fetch_ucirepo(id=697)

# Use the fetched dataset's `original` frame as the single working DataFrame.
# NOTE(review): presumably `original` holds features and label together
# (a "Target" column is dropped from df later in this notebook) — confirm.
df = data.data.original

# Preview the first 20 rows to check the structure of the DataFrame
df.head(20)


In [None]:
df.info()

In [None]:
# 4424 entries and 4424 non-null values in each column
# But just to verify: count the missing values per column (every count should be 0).
# `.isnull().all()` would only flag columns that are *entirely* null, so it
# cannot confirm the absence of missing values; `.sum()` can.
df.isnull().sum()

The dataset and information about it can be found here: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success 

The following columns are categorical (some represented numerically):
* Marital status
* Application mode
* Course
* Daytime/evening attendance
* Previous qualification
* Nacionality
* Mother's qualification
* Father's qualification
* Mother's occupation
* Father's occupation
* Displaced
* Educational special needs
* Debtor
* Tuition fees up to date
* Gender
* Scholarship holder
* International
* Target

The following columns are numerical (discrete):
* Application order
* Age at enrollment
* Curricular units 1st sem (credited)             
* Curricular units 1st sem (enrolled)
* Curricular units 1st sem (evaluations)
* Curricular units 1st sem (approved)
* Curricular units 1st sem (without evaluations)
* Curricular units 2nd sem (credited)
* Curricular units 2nd sem (enrolled)
* Curricular units 2nd sem (evaluations)
* Curricular units 2nd sem (approved)
* Curricular units 2nd sem (without evaluations)

The following columns are numerical (continuous):
* Previous qualification (grade)
* Admission grade
* Curricular units 1st sem (grade)
* Curricular units 2nd sem (grade)
* Unemployment rate
* Inflation rate
* GDP

In [None]:
# Report, for every column, how many of its entries are equal to 0
for column_name in df.columns:
    n_zeros = df[column_name].eq(0).sum()
    print(f"Zeros in {column_name} column: {n_zeros}")

In [None]:
# Report the number of distinct values found in each column
for column_name, n_unique in df.nunique().items():
    print(f"{column_name} column no. unique values: {n_unique}")

In [None]:
# Print the distinct values of every column except the four grade columns
# listed below, which are skipped.
# Feature descriptions: https://storage.googleapis.com/kaggle-forum-message-attachments/1832313/17922/Features%20information.pdf
cols_not_to_check_unique_vals_in = [
    "Previous qualification (grade)",
    "Admission grade",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
]
columns_to_inspect = [
    name for name in df.columns if name not in cols_not_to_check_unique_vals_in
]
for name in columns_to_inspect:
    print(f"Unique values in {name} column: {df[name].unique()}")

In [None]:
# Do some plotting next to understand the data

# Names of the columns treated as categorical; each one gets a bar chart below.
# NOTE(review): "Marital Status" is capitalised differently from the
# "Marital status" entry in the markdown column list earlier in this notebook —
# confirm the exact column name in df, otherwise df[col] raises KeyError.
categorical_cols = [
    "Marital Status",
    "Application mode",
    "Course",
    "Daytime/evening attendance",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
    "Target"
]

import matplotlib.pyplot as plt

# One bar chart of value frequencies per categorical column
for col in categorical_cols:
    print(col)
    value_counts = df[col].value_counts()
    ax = value_counts.plot(kind='bar')
    ax.set_ylabel("Count")
    ax.set_xlabel(col)
    ax.grid()
    ax.set_title(f"{col} count")
    plt.show()

In [None]:
# Names of the columns treated as numerical: the discrete ones first, then the
# continuous ones (same grouping as the markdown column list earlier in this notebook).
numerical_cols = [
    "Application order",
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (without evaluations)",
    "Previous qualification (grade)",
    "Admission grade",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
    "Unemployment rate",
    "Inflation rate",
    "GDP"
]

# for col in numerical_cols:
#     print(col)
#     value_counts = df[col].value_counts()
#     value_counts.plot(kind='hist')
#     plt.ylabel("Count")
#     plt.xlabel(col)
#     plt.show()

# for col in numerical_cols:
#     print(col)
#     plt.figure(figsize=(8, 6))

#     # Plot histogram with horizontal orientation
#     plt.hist(df[col], orientation='vertical', color='skyblue', edgecolor='black')

#     plt.xlabel("Count")         # Count on the x-axis
#     plt.ylabel(col)             # Column values on the y-axis
#     plt.title(f"Histogram of {col}")
#     plt.grid(axis='x', linestyle='--', alpha=0.6)
#     plt.show()

# for col in numerical_cols:
#     print(col)
#     plt.figure(figsize=(8, 6))
    
#     # Plot boxplot
#     plt.boxplot(df[col], vert=True, patch_artist=True, boxprops=dict(facecolor='skyblue'))
    
#     plt.ylabel(col)             # Column values on the y-axis
#     plt.title(f"Boxplot of {col}")
#     plt.grid(axis='y', linestyle='--', alpha=0.6)
#     plt.show()

# One box plot per numerical column, each on its own 8x6 figure
for col in numerical_cols:
    print(col)
    fig, ax = plt.subplots(figsize=(8, 6))

    ax.boxplot(df[col], vert=True, patch_artist=True, boxprops={'facecolor': 'skyblue'})

    ax.set_ylabel(col)  # column values run along the y-axis
    ax.set_title(f"Boxplot of {col}")
    ax.grid(axis='y', linestyle='--', alpha=0.6)

    # Keep the single tick at position 1 but blank out its label
    ax.set_xticks([1])
    ax.set_xticklabels([''])

    plt.show()


In [None]:
def handle_outliers(col):
    """Return the distinct values of ``col`` that fall outside the 1.5 * IQR fences.

    Despite the name, nothing is modified or removed: the function only
    reports which values are flagged.

    Parameters
    ----------
    col : pandas.Series
        Numeric series to inspect.

    Returns
    -------
    list
        Unique flagged values, in order of first appearance; empty when no
        value lies below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    lower_quartile = col.quantile(0.25)
    upper_quartile = col.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    is_outlier = col.lt(low_fence) | col.gt(high_fence)
    return col[is_outlier].unique().tolist()

In [None]:
# Record which numerical columns have values flagged by handle_outliers,
# printing the flagged values (or a "no outliers" message) per column
outlier_cols = []
for col in numerical_cols:
    outliers = handle_outliers(df[col])
    if outliers:
        outlier_cols.append(col)
        print(f"Outliers in {col} column:")
        print(f"{outliers}\n")
    else:
        print(f"No outliers in {col} column\n")

In [None]:
# Histogram of every column in which outliers were flagged
for col in outlier_cols:
    print(col)
    plt.figure(figsize=(8, 6))

    # Vertical bars: the column's values run along the x-axis and the
    # bar heights (counts) along the y-axis
    plt.hist(df[col], orientation='vertical', color='skyblue', edgecolor='black')

    plt.xlabel(col)             # Column values on the x-axis
    plt.ylabel("Count")         # Count on the y-axis
    plt.title(f"Histogram of {col}")
    plt.grid(axis='y', linestyle='--', alpha=0.6)  # grid lines follow the count axis
    plt.show()

In [None]:
outlier_cols

In [None]:
df.head(20)

### Investigating outliers
We can assume that the outliers in the `Application order` column are meaningful (genuine values rather than data errors).


**Preprocessing:**
- Handle missing values (e.g., imputation or removal).
- Encode categorical features (e.g., one-hot encoding).
- Normalize or standardize numerical features (especially for Logistic Regression).
- Split the data into training and testing sets (e.g., 80-20 split).

**Benchmark Model:**
- Start with Logistic Regression to establish a baseline.
- Fine-tune hyperparameters (e.g., C, solver, multi_class).

**Improve with Random Forest:**
- Train and evaluate a Random Forest model.
- Fine-tune hyperparameters (e.g., n_estimators, max_depth, max_features).

**Improve Further with XGBoost:**
- Train and evaluate an XGBoost model.
- Fine-tune hyperparameters (e.g., learning_rate, max_depth, n_estimators).

**Interpretability:**
- Use SHAP or LIME to explain the predictions of your final model (e.g., XGBoost).

# What can I check next

Before using your multi-class classification data with logistic regression, you should perform several checks to ensure the data is suitable for the model. Logistic regression has specific assumptions and requirements, and your data should meet these for the model to perform well. Here's a checklist:

---

### 1. **Check the Target Variable**
   - **Encoding**: Encode the target variable as integers (e.g., 0, 1, 2, ...) when the library requires it (XGBoost does). scikit-learn classifiers also accept string class labels directly, but a single integer encoding keeps all models in this notebook consistent.
   - **Class balance**: Check if the classes are balanced. Highly imbalanced classes can lead to poor performance. If imbalanced, consider techniques like oversampling, undersampling, or class weighting.

---

### 2. **Check Feature Types**
   - **Numerical features**: Logistic regression works best with numerical features. If you have categorical features, encode them (e.g., one-hot encoding, label encoding).
   - **Scale of features**: Logistic regression is sensitive to the scale of features. Ensure all features are scaled (e.g., using standardization or normalization).

---

### 3. **Check for Multicollinearity**
   - Logistic regression assumes that features are not highly correlated with each other. Use techniques like:
     - Correlation matrix: Check for high correlations between features.
     - Variance Inflation Factor (VIF): Values above 5-10 indicate multicollinearity.
   - If multicollinearity exists, consider removing or combining features.

---

### 4. **Check for Linearity**
   - Logistic regression assumes a linear relationship between features and the log-odds of the target variable. You can check this by:
     - Plotting partial dependence plots.
     - Using polynomial features if non-linear relationships exist.
   - If the relationship is highly non-linear, consider using a different model (e.g., decision trees, SVM).

---

### 5. **Check for Outliers**
   - Logistic regression can be sensitive to outliers. Check for outliers in your features using:
     - Boxplots.
     - Z-scores or IQR (Interquartile Range).
   - Handle outliers by removing them or transforming the data (e.g., log transformation).

---

### 6. **Check for Missing Values**
   - Logistic regression cannot handle missing values directly. Check for missing values in your data and handle them by:
     - Imputation (e.g., mean, median, mode).
     - Removing rows or columns with missing values (if minimal).

---

### 7. **Check Sample Size**
   - Ensure you have enough samples for each class. Logistic regression requires a sufficient number of samples to estimate the coefficients reliably.
   - A rule of thumb is to have at least 10 samples per feature per class.

---

### 8. **Check for Overfitting**
   - Logistic regression can overfit if there are too many features relative to the number of samples. Check the feature-to-sample ratio and consider:
     - Feature selection (e.g., using L1 regularization).
     - Dimensionality reduction (e.g., PCA).

---

### 9. **Check for Regularization Needs**
   - Logistic regression benefits from regularization (L1 or L2) to prevent overfitting, especially if you have many features. Decide whether to use regularization and tune the regularization parameter.

---

### 10. **Check for Interpretability**
   - Logistic regression is interpretable, so ensure your features are meaningful and interpretable. Avoid using overly complex or engineered features that might reduce interpretability.

---

### 11. **Check for Software/Library Requirements**
   - Ensure your software/library supports multi-class logistic regression. For example:
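     - In scikit-learn, `LogisticRegression` fits a multinomial (softmax) model for multi-class targets by default with the default `lbfgs` solver; the `multi_class` parameter is deprecated as of scikit-learn 1.5, so it does not need to be passed.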
     - In statsmodels, use `MNLogit` for multi-class logistic regression.
     

---

### Summary
Before using logistic regression for multi-class classification, ensure:
- The target variable is properly encoded and balanced.
- Features are numerical, scaled, and free of multicollinearity.
- The data is free of outliers and missing values.
- The relationship between features and the target is approximately linear.
- You have enough samples and features are interpretable.

If your data meets these criteria, it is likely suitable for logistic regression. If not, consider preprocessing or using a different model.

# Preprocessing data

In [None]:
df.head()

# ML workflow

In [None]:
# !pip install scikit-learn
from sklearn.model_selection import train_test_split
# Separate the feature columns from the "Target" label column
X = df.drop(columns=["Target"])
y = df['Target']
# The train/test split itself is done further down, after the target has been label-encoded:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable as integers.
# Fit on df['Target'] (the same column `y` was taken from) instead of on `y`
# itself, so the cell is idempotent: re-running it no longer re-encodes the
# already-encoded integers and label_encoder.classes_ keeps the class names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Target'])

In [None]:
# Show which integer each original class label was mapped to
for encoded_value, original_class in enumerate(label_encoder.classes_):
    print(f"Original Class: {original_class} -> Encoded Value: {encoded_value}")

In [None]:
# Hold out 25% of the rows as the test set; random_state=42 fixes the shuffle.
# NOTE(review): no `stratify=y` is passed, so class proportions may differ
# between the train and test sets — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Decision Tree - benchmark workflow

In [None]:
X_train.head()

In [None]:
y_test[:5]

In [None]:
from sklearn import tree
# Shallow tree (max_depth=4) used as the benchmark model. random_state is
# fixed so the fitted tree, and every score derived from it, is reproducible
# across runs, consistent with the other models in this notebook.
clf = tree.DecisionTreeClassifier(max_depth=4, random_state=42)
clf = clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
# Compare accuracy on the rows the tree was fitted on with accuracy on the held-out rows
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

The training and testing accuracies are close to each other, which suggests that the model is neither noticeably overfitting nor underfitting — the fit looks about right.

In [None]:
### Go back and improve the decision tree using a grid search, looking at other features and stuff like that


# Random forest

In [None]:
# param_grid = {
#     'n_estimators':[20, 40, 60, 80, 100, 120, 140, 160, 180, 200],
#     'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
#     'max_features':['sqrt', 'log2', None],
#     'min_samples_split':[2, 3, 4, 5, 6, 7, 8, 9, 10],
#     'min_samples_leaf':[1, 2, 3, 4, 5] 
# }

# Search space for the random forest hyperparameter search
param_grid = dict(
    n_estimators=list(range(20, 201, 20)),   # 20, 40, ..., 200
    max_depth=list(range(1, 11)),            # 1 .. 10
    max_features=['sqrt', 'log2', None],
    min_samples_split=list(range(2, 11)),    # 2 .. 10
    min_samples_leaf=list(range(1, 6)),      # 1 .. 5
)

In [None]:
# from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for the random forest `rf` over param_grid.
# n_iter=1000 sampled settings x cv=5 folds means 5000 cross-validation fits,
# so this cell is expensive to re-run. return_train_score=True is needed
# because later cells read `mean_train_score` from cv_results_.
random_search = RandomizedSearchCV(
    rf, param_grid, n_iter=1000, cv=5, scoring="accuracy", n_jobs=-1, random_state=42, return_train_score=True
)
random_search.fit(X_train, y_train)

In [None]:
import pandas as pd

# Tabulate the search results and rank them by mean cross-validated
# accuracy, best first
results_df = pd.DataFrame(random_search.cv_results_)
sorted_results = results_df.sort_values(by='mean_test_score', ascending=False)

# Keep only the top_n best-ranked parameter combinations
top_n = 3  # Change this to 5 if you want the top 5
top_combinations = sorted_results.iloc[:top_n]

# Show each retained combination next to its score
print(f"Top {top_n} Parameter Combinations:")
print(top_combinations.loc[:, ['params', 'mean_test_score']])

In [None]:
sorted_results

In [None]:
temp_df = sorted_results[sorted_results["mean_train_score"]>=0.7]

In [None]:
temp_df['param_min_samples_leaf'].unique()

In [None]:
# Step 1: Calculate the absolute difference between mean_train_score and mean_test_score.
# temp_df is a filtered slice of sorted_results, so writing a column into it
# in place raises SettingWithCopyWarning; assign() returns a new frame instead
# and keeps the cell safe to re-run.
temp_df = temp_df.assign(
    score_diff=(temp_df['mean_train_score'] - temp_df['mean_test_score']).abs()
)

# Step 2: Filter rows where the difference is at most 5 percentage points (0.05)
filtered_df = temp_df[temp_df['score_diff'] <= 0.05]
filtered_df

In [None]:
filtered_df.head(50)

In [None]:
import os
import multiprocessing

# Report the CPU core count as seen by two different standard-library calls
core_counters = (
    ("os.cpu_count()", os.cpu_count),
    ("multiprocessing.cpu_count()", multiprocessing.cpu_count),
)
for call_label, counter in core_counters:
    print(f"Number of CPU cores ({call_label}): {counter()}")

In [None]:
filtered_df[filtered_df["mean_train_score"]>=0.7]

In [None]:
##### ADD CLASSIFICATION REPORT, CONFUSION MATRIX, ACCURACY, RECALL, PRECISION, F1 SCORE - IN THE DECISION TREE STUFF TOO

In [None]:
filtered_df['param_max_depth'].unique()

In [None]:
print(f"Best Parameters for Random Forest: {random_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {random_search.best_score_:.2f}")

In [None]:
# Refit a random forest with a fixed set of hyperparameters and compare its
# accuracy on the training rows with its accuracy on the held-out rows
rf_params = dict(
    n_estimators=180,
    min_samples_split=9,
    min_samples_leaf=3,
    max_features='sqrt',
    max_depth=6,
)
rf = RandomForestClassifier(random_state=42, **rf_params)
rf.fit(X_train, y_train)

rf_train_predictions = rf.predict(X_train)
rf_test_predictions = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, rf_train_predictions)
test_accuracy = accuracy_score(y_test, rf_test_predictions)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

In [None]:
pd.DataFrame(random_search.cv_results_)

In [None]:
# print(f"Best Parameters for Random Forest: {grid_search.best_params_}")
# print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}")

In [None]:
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the fitted random forest on the training rows versus the held-out rows
rf_train_predictions = rf.predict(X_train)
rf_test_predictions = rf.predict(X_test)
train_accuracy = accuracy_score(y_train, rf_train_predictions)
test_accuracy = accuracy_score(y_test, rf_test_predictions)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

# XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Initialize the XGBoost classifier with default hyperparameters.
# random_state is fixed, like for the other models in this notebook, so the
# baseline score is reproducible across runs.
xgb = XGBClassifier(random_state=42)

# Train the model
xgb.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = xgb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Train an XGBoost model with hand-picked hyperparameters
# (100 trees, depth up to 18, learning rate 0.1, fixed seed)
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=18,
    learning_rate=0.1,
    random_state=42
)
xgb.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = xgb.predict(X_test)

# Evaluate the model: fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
help(XGBClassifier)

In [None]:
!pip install scikit-learn==1.5.2
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Initialize XGBClassifier
xgb = XGBClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
    'min_child_weight': randint(1, 6),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1),
    'objective': ['multi:softmax'],
    'num_class': [3],
    'eval_metric': ['mlogloss']
}

# Run RandomizedSearchCV
random_search = RandomizedSearchCV(
    xgb, param_grid, n_iter=100, cv=5, scoring="accuracy", n_jobs=-1, random_state=42
)
random_search.fit(X_train, y_train)

# Get the best parameters
print("Best Parameters:", random_search.best_params_)

In [None]:
xgb = XGBClassifier(colsample_bytree=0.8940284175215543, eval_metric='mlogloss', gamma=0.4017404651924243, learning_rate=0.09461037177139194, max_depth=4, min_child_weight=5, n_estimators=148, num_class=3, objective='multi:softmax', reg_alpha=0.4126176769114265, reg_lambda=0.37201808579278317, subsample=0.9105651842967988,random_state=42)

In [None]:
from sklearn.metrics import accuracy_score
# Fit the configured XGBoost model, then compare its accuracy on the training
# rows with its accuracy on the held-out rows
xgb = xgb.fit(X_train, y_train)

xgb_train_predictions = xgb.predict(X_train)
xgb_test_predictions = xgb.predict(X_test)
train_accuracy = accuracy_score(y_train, xgb_train_predictions)
test_accuracy = accuracy_score(y_test, xgb_test_predictions)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

In [None]:
import sklearn
import xgboost

# Print the installed version of each modelling library
for library_label, library_module in (("scikit-learn", sklearn), ("XGBoost", xgboost)):
    print(f"{library_label} version:", library_module.__version__)

In [None]:
!pip install shap
import shap

# Train an XGBoost model
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

# Explain the model's predictions using SHAP
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary
shap.summary_plot(shap_values, X_test)

# Stacking

In [None]:
# Define base models
# Base (level-0) estimators for the stacking ensemble, as (name, estimator)
# pairs: a depth-6 random forest, a depth-4 decision tree and a default
# XGBoost classifier, all with random_state=42
base_models = [
    ('random_forest', RandomForestClassifier(random_state=42,max_depth=6)),
    ('decision_tree', tree.DecisionTreeClassifier(random_state=42,max_depth=4)),
    ('xgboost', XGBClassifier(random_state=42))
]

In [None]:
# Define meta-model (Decision Tree)
meta_model = tree.DecisionTreeClassifier(random_state=42,max_depth=4)

In [None]:
# RandomForestClassifier is already used when base_models is built above;
# that works because it was also imported in the Random forest section.
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
# Create stacking classifier: base_models feed their predictions to meta_model
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,  # Use 5-fold cross-validation to generate base model predictions
    stack_method='auto',  # Use 'predict_proba' if possible, otherwise 'predict'
    n_jobs=-1  # Use all available CPU cores
)

In [None]:
# Train the stacking model
stacking_model.fit(X_train, y_train)

In [None]:
# Make predictions
# Predict the held-out test set with the stacking ensemble.
# `y_pred` is reused by the confusion-matrix cell further down.
y_pred = stacking_model.predict(X_test)

# Evaluate performance: fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Stacking Model Accuracy: {accuracy:.4f}")

In [None]:
!pip install seaborn

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the latest `y_pred` (the stacking model's test
# predictions). The label order is pinned to the encoder's classes so the
# matrix rows/columns always line up with the class names on the axes, even
# if a class were missing from y_test or y_pred.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))

# Annotated heatmap with the original class names instead of encoded integers
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Stacking model confusion matrix')
plt.show()

In [None]:
cm