# Deploy pipeline: load, preprocess, train, and export model

This notebook loads `data.csv` (delimiter autodetected), applies the same column drops used in your training notebook, coerces numeric columns, trains a StandardScaler+RandomForest pipeline, and writes `multiclass_classification_model.pkl` and `final_feature_list.txt` to the project root so `app.py` can load them directly.

Run cells in order. If your CSV uses `;` as delimiter, the loader will detect it automatically.

In [1]:
# 1) Imports
import io
import os
import csv
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

print('Imports OK')

Imports OK


In [2]:
# 2) Robust CSV loader (auto-detect delimiter)
csv_path = 'data.csv'
assert os.path.exists(csv_path), f'{csv_path} not found in repo root'

# Read a sample to sniff the delimiter
with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
    sample = f.read(8192)
    dialect = csv.Sniffer().sniff(sample, delimiters=[',',';','	','|'])
    delim = dialect.delimiter

print(f'Auto-detected delimiter: 
')

df = pd.read_csv(csv_path, sep=delim, engine='python')
print('Data loaded. Shape:', df.shape)
print('Columns (first 30):', df.columns.tolist()[:30])

SyntaxError: unterminated f-string literal (detected at line 11) (1881680382.py, line 11)

In [None]:
# 3) Clean column names and create target mapping
df.columns = df.columns.str.strip()
# Ensure Target exists
if 'Target' not in df.columns:
    raise RuntimeError('Target column not found in data.csv')

# Map Target to numeric consistent with app.py and notebook: Enrolled=0, Dropout=1, Graduate=2
df['Target_binary'] = df['Target'].replace({'Graduate': 2, 'Dropout': 1, 'Enrolled': 0})
print('Target mapping done. Unique values:', df['Target'].unique(), '->', df['Target_binary'].unique())

In [None]:
# 4) Drop the columns the original training notebook removed (safe drop)
features_to_drop = [
    'Application mode', 'Application order', 'Previous qualification', 'Previous qualification (grade)',
    
, 
, 
, 
,
    'Admission grade', 'Debtor', 'International', 'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (without evaluations)'
]
# Build feature matrix X and target y (do not include Target itself)
X = df.drop(columns=['Target', 'Target_binary'], errors='ignore')
existing_features_to_drop = [c for c in features_to_drop if c in X.columns]
print(f'Dropping {len(existing_features_to_drop)} columns: {existing_features_to_drop}')
X = X.drop(columns=existing_features_to_drop)
y = df.loc[X.index, 'Target_binary']
print('Remaining features count:', X.shape[1])
print('Remaining feature list (first 50):', X.columns.tolist()[:50])

In [None]:
# 5) Coerce numeric-like columns to numeric and handle missing values
# Try to convert every column to numeric where possible; keep booleans or categorical as-is if conversion fails
non_numeric_cols = []
for col in X.columns:
    # if dtype is object, try to coerce to numeric
    if X[col].dtype == 'object':
        coerced = pd.to_numeric(X[col].str.replace(',', '.').astype(str).str.strip(), errors='coerce')
        # If many values converted (not all NaN), use coerced numeric; otherwise leave original
        if coerced.notna().sum() > 0:
            X[col] = coerced
        else:
            non_numeric_cols.append(col)

print('Non-numeric columns remaining (treated as categorical/text):', non_numeric_cols)

# After coercion, drop rows with any NaNs to keep simple and consistent for training
nan_counts = X.isnull().sum().sum()
print('Total NaN values in features after coercion:', nan_counts)
if nan_counts > 0:
    X_before = X.shape[0]
    mask = X.notnull().all(axis=1)
    X = X.loc[mask].copy()
    y = y.loc[X.index].copy()
    print(f'Dropped {X_before - X.shape[0]} rows with NaNs; new shape: {X.shape}')
else:
    print('No NaNs to drop')

In [None]:
# 6) Quick train/validation split to check model performance, then train on full data and export
if X.shape[0] < 10:
    raise RuntimeError('Not enough rows after cleaning to train model')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train/Test shapes:', X_train.shape, X_test.shape)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, class_weight='balanced'))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print('Validation accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Refit on full data for export to maximize data used
pipeline_full = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, class_weight='balanced'))
])
pipeline_full.fit(X, y)
print('Trained pipeline on full data.')

In [None]:
# 7) Save the pipeline and final feature list (in the project root so app.py can load them)
model_path = 'multiclass_classification_model.pkl'
joblib.dump(pipeline_full, model_path)
print('Saved model to', model_path)

# Save feature list preserving column order
with open('final_feature_list.txt', 'w', encoding='utf-8') as f:
    for feat in X.columns:
        f.write(f'{feat}
')
print('Saved final_feature_list.txt with', len(X.columns), 'features')

In [None]:
# 8) Quick load test: load saved model and predict on first row (if any)
loaded = joblib.load('multiclass_classification_model.pkl')
print('Loaded model. Classes:', getattr(loaded, 'classes_', None))
if X.shape[0] > 0:
    sample = X.iloc[[0]]
    print('Sample shape:', sample.shape)
    pred = loaded.predict(sample)[0]
    proba = loaded.predict_proba(sample)[0]
    print('Prediction:', pred)
    print('Probabilities:', proba)
else:
    print('No sample to test')