# Using MLflow

### Start by preprocessing your data

In [3]:
import numpy as np
import pandas as pd 
names = ["time","gender","age","course","year","cgpa","marital","depression","anxiety","panic","treatement"]
data = pd.read_csv("Student Mental health.csv")
data.columns = names

data[:5]

Unnamed: 0,time,gender,age,course,year,cgpa,marital,depression,anxiety,panic,treatement
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   time        101 non-null    object 
 1   gender      101 non-null    object 
 2   age         100 non-null    float64
 3   course      101 non-null    object 
 4   year        101 non-null    object 
 5   cgpa        101 non-null    object 
 6   marital     101 non-null    object 
 7   depression  101 non-null    object 
 8   anxiety     101 non-null    object 
 9   panic       101 non-null    object 
 10  treatement  101 non-null    object 
dtypes: float64(1), object(10)
memory usage: 8.8+ KB


In [5]:
data = data.drop(["time","course","age"], axis=1)
data[5:]

Unnamed: 0,gender,year,cgpa,marital,depression,anxiety,panic,treatement
5,Male,Year 2,3.50 - 4.00,No,No,No,Yes,No
6,Female,year 2,3.50 - 4.00,Yes,Yes,No,Yes,No
7,Female,year 1,3.50 - 4.00,No,No,Yes,No,No
8,Female,Year 2,2.50 - 2.99,No,No,No,No,No
9,Male,year 1,3.50 - 4.00,No,No,Yes,Yes,No
...,...,...,...,...,...,...,...,...
96,Female,year 1,3.50 - 4.00,No,No,Yes,No,No
97,Male,Year 2,3.00 - 3.49,No,Yes,Yes,No,No
98,Female,Year 3,3.50 - 4.00,Yes,Yes,No,Yes,No
99,Female,year 4,3.50 - 4.00,No,No,No,No,No


In [6]:
gender_mapping = {'Male': 0, 'Female': 1}
data['gender'] = data['gender'].map(gender_mapping)

mapping = {'No': 0, 'Yes': 1}
data['marital'] = data['marital'].map(mapping)
data['depression'] = data['depression'].map(mapping)
data['anxiety'] = data['anxiety'].map(mapping)
data['panic'] = data['panic'].map(mapping)
data['treatement'] = data['treatement'].map(mapping)

yearmap = {'year 1': 1, 'Year 1': 1,'year 2': 2, 'Year 2': 2,'year 3': 3, 'Year 3': 3,'year 4': 4, 'Year 4': 4}
data["year"]= data['year'].map(yearmap)

data[['mingpa', 'maxgpa']] = data['cgpa'].str.split(' - ', expand=True)
data = data.drop("cgpa", axis=1)
data['mingpa'] = data['mingpa'].astype(float)
data['maxgpa'] = data['maxgpa'].astype(float)

In [7]:
data[:5]

Unnamed: 0,gender,year,marital,depression,anxiety,panic,treatement,mingpa,maxgpa
0,1,1,0,1,0,1,0,3.0,3.49
1,0,2,0,0,1,0,0,3.0,3.49
2,0,1,0,1,1,1,0,3.0,3.49
3,1,3,1,1,0,0,0,3.0,3.49
4,0,4,0,0,0,0,0,3.0,3.49


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   gender      101 non-null    int64  
 1   year        101 non-null    int64  
 2   marital     101 non-null    int64  
 3   depression  101 non-null    int64  
 4   anxiety     101 non-null    int64  
 5   panic       101 non-null    int64  
 6   treatement  101 non-null    int64  
 7   mingpa      101 non-null    float64
 8   maxgpa      101 non-null    float64
dtypes: float64(2), int64(7)
memory usage: 7.2 KB


In [9]:
X = data.drop('depression', axis=1)
y = data['depression']

In [10]:
### Feature selection

In [11]:
from sklearn.feature_selection import mutual_info_classif
feature_names = ['gender', 'year', 'marital', 'anxiety', 'panic', ' treatement', 'mingpa','maxgpa']
features = []
MI_score = mutual_info_classif(X, y, random_state=0)
for feature in zip(feature_names, MI_score):
    features.append(feature)
features

[('gender', 0.0),
 ('year', 0.0),
 ('marital', 0.1615084419278734),
 ('anxiety', 0.02041364971807047),
 ('panic', 0.0),
 (' treatement', 0.07032251278482793),
 ('mingpa', 0.0),
 ('maxgpa', 0.0)]

In [12]:
#X = X.drop(["gender","year","panic","mingpa","maxgpa"], axis=1)
X[5:]

Unnamed: 0,gender,year,marital,anxiety,panic,treatement,mingpa,maxgpa
5,0,2,0,0,1,0,3.5,4.00
6,1,2,1,0,1,0,3.5,4.00
7,1,1,0,1,0,0,3.5,4.00
8,1,2,0,0,0,0,2.5,2.99
9,0,1,0,1,1,0,3.5,4.00
...,...,...,...,...,...,...,...,...
96,1,1,0,1,0,0,3.5,4.00
97,0,2,0,1,0,0,3.0,3.49
98,1,3,1,0,1,0,3.5,4.00
99,1,4,0,0,0,0,3.5,4.00


In [13]:
# import train-test split 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Train 5 models

In [14]:
### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
model_lg = LogisticRegression(max_iter=120,random_state=0, n_jobs=20)
model_lg.fit(X_train, y_train)
pred_lg = model_lg.predict(X_test)
lg = accuracy_score(y_test, pred_lg)
cm1 = confusion_matrix(y_test, pred_lg)
print("accuracy :",lg)
print(classification_report(y_test,pred_lg))

accuracy : 0.7647058823529411
              precision    recall  f1-score   support

           0       0.77      0.91      0.83        22
           1       0.75      0.50      0.60        12

    accuracy                           0.76        34
   macro avg       0.76      0.70      0.72        34
weighted avg       0.76      0.76      0.75        34



In [16]:
### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier( max_depth=4, random_state=42)
model_dt.fit(X_train,y_train)
pred_dt = model_dt.predict(X_test)
dt = accuracy_score(y_test, pred_dt)
cm2 = confusion_matrix(y_test, pred_dt)
print("accuracy :",dt)
print(classification_report(y_test,pred_dt))

accuracy : 0.7647058823529411
              precision    recall  f1-score   support

           0       0.77      0.91      0.83        22
           1       0.75      0.50      0.60        12

    accuracy                           0.76        34
   macro avg       0.76      0.70      0.72        34
weighted avg       0.76      0.76      0.75        34



In [18]:
### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=0.16, random_state=42)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)
rf = accuracy_score(y_test, pred_rf)
cm3 = confusion_matrix(y_test, pred_rf)
print("accuracy :",rf)
print(classification_report(y_test,pred_rf))

accuracy : 0.6470588235294118
              precision    recall  f1-score   support

           0       0.65      1.00      0.79        22
           1       0.00      0.00      0.00        12

    accuracy                           0.65        34
   macro avg       0.32      0.50      0.39        34
weighted avg       0.42      0.65      0.51        34



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
### SVM

In [21]:
import mlflow
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
mlflow.end_run()
mlflow.start_run()
mlflow.end_run()
svm_model = SVC(kernel='linear') 
svm_model.fit(X_train, y_train)
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
report = classification_report(y_test,predictions)
print(classification_report(y_test,predictions))

Accuracy: 0.8235294117647058
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        22
           1       1.00      0.50      0.67        12

    accuracy                           0.82        34
   macro avg       0.89      0.75      0.77        34
weighted avg       0.86      0.82      0.80        34



In [22]:
### Xgboost

In [23]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test,predictions))

Accuracy: 0.8235294117647058
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        22
           1       1.00      0.50      0.67        12

    accuracy                           0.82        34
   macro avg       0.89      0.75      0.77        34
weighted avg       0.86      0.82      0.80        34



### Track models performance

In [54]:
import mlflow
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import numpy as np

models = {
    'SVC_linear': SVC(kernel='linear'),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100),
    'LogisticRegression': LogisticRegression(max_iter=100),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(n_estimators=100)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f'{name} MSE: {mse}')

SVC_linear MSE: 0.23809523809523808
RandomForestClassifier MSE: 0.2857142857142857
LogisticRegression MSE: 0.23809523809523808
DecisionTreeClassifier MSE: 0.2857142857142857
GradientBoostingClassifier MSE: 0.2857142857142857


### Track models accuracy

In [24]:
#mlflow.set_tracking_uri("http://localhost:1234")

In [53]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import sklearn
print("Scikit-Learn: {}".format(sklearn.__version__))
print("MLFlow: {}".format(mlflow.__version__))

mlflow.end_run()

models = [
    SVC(kernel='linear'),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(n_estimators=100)

]


for idx, model in enumerate(models):
    with mlflow.start_run(run_name=f"Model_{idx + 1}"):
        model.fit(X_train, y_train)
        
        predictions = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, predictions)
        print(f"Model_{idx + 1} Accuracy: {accuracy}")
        
        mlflow.log_param('model_name', model.__class__.__name__)
        mlflow.log_metric('accuracy', accuracy)
        
        mlflow.sklearn.log_model(model, f"Model_{idx + 1}_Skl_Model")

# End MLflow run
mlflow.end_run()

Scikit-Learn: 1.3.2
MLFlow: 2.7.1
Model_1 Accuracy: 0.7619047619047619
Model_2 Accuracy: 0.7142857142857143
Model_3 Accuracy: 0.7619047619047619
Model_4 Accuracy: 0.7142857142857143
Model_5 Accuracy: 0.7142857142857143


### Save your best model in ONNX format and its dedicated preprocessing transformations(i.e., using transformers API) in pickle format

In [26]:
#pip install skl2onnx

In [27]:
import skl2onnx
from skl2onnx import to_onnx

best_model = models[0] #SVM
onx = to_onnx(best_model, X[:1])
with open("svm_model.onnx", "wb") as f:
    f.write(onx.SerializeToString())



In [28]:
import pickle

preprocessing_transformations = {
    'gender_mapping': gender_mapping,
    'mapping': mapping,
    'yearmap': yearmap
}

with open('preprocessing_transformations.pkl', 'wb') as file:
    pickle.dump(preprocessing_transformations, file)

## Using FastAPI - this is just a code represenation there is a separate file for this

## Using Flask -  this is just a code represenation there is a separate file for this