In [81]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow

In [82]:
# Load the churn dataset from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="Churn.csv")

In [83]:
# Show the loaded frame as this cell's output so its columns can be inspected.
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,0,42,2.0,0.00,1,1,1,101348.88,1
1,608,2,0,41,1.0,83807.86,1,0,1,112542.58,0
2,502,0,0,42,8.0,159660.80,3,1,0,113931.57,1
3,699,0,0,39,1.0,0.00,2,0,0,93826.63,0
4,850,2,0,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5.0,0.00,2,1,0,96270.64,0
9996,516,0,1,35,10.0,57369.61,1,1,1,101699.77,0
9997,709,0,0,36,7.0,0.00,1,0,1,42085.58,1
9998,772,1,1,42,3.0,75075.31,2,1,0,92888.52,1


In [84]:
# Split the churn data into train/test partitions with a fixed seed.
# "Exited" is the target. "Unnamed: 0" appears in the frame's header and looks
# like a row index persisted into the CSV (TODO confirm); a row identifier
# carries no signal the model should learn from, so it is excluded from the
# features. errors='ignore' keeps the cell working if that column is absent.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=['Exited', 'Unnamed: 0'], errors='ignore'),
    data['Exited'],
    random_state=2023,
)

__MLFLOW__

In [86]:
# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# mlflow.create_experiment() raises when the name is already registered, which
# made this cell fail on every re-run. Look the experiment up first and only
# create it when it does not exist yet, so the cell can be executed repeatedly.
existing_experiment = mlflow.get_experiment_by_name("grid_search_new")
experiment_id = (
    existing_experiment.experiment_id
    if existing_experiment is not None
    else mlflow.create_experiment("grid_search_new")
)

In [90]:
# Base estimator for the grid search. The forest is stochastic, so it is
# seeded (same seed as the train/test split) to make the search results and
# the logged metrics reproducible across runs.
crf = RandomForestClassifier(random_state=2023)

# Hyper-parameter candidates explored by the grid search (3 x 3 x 3 combinations).
param_grid = {
    'n_estimators': [8, 15, 20],
    'max_leaf_nodes': [25, 30, 50],
    'max_depth': [1, 5, 6]
}

In [91]:
# Exhaustive 5-fold cross-validated search over the random-forest grid.
grid = GridSearchCV(
    estimator=crf,
    param_grid=param_grid,
    cv=5,
)

In [92]:
# Run the grid search inside an MLflow run and record the best parameters,
# the held-out metrics and the refitted best model.
with mlflow.start_run(experiment_id=experiment_id):
    grid.fit(x_train, y_train)

    # Record the winning hyper-parameter combination found by the search.
    for param_name in ("n_estimators", "max_leaf_nodes", "max_depth"):
        mlflow.log_param(param_name, grid.best_params_[param_name])

    # Predict once and reuse the result for every metric.
    y_pred = grid.predict(x_test)

    mlflow.log_metric("score", grid.score(x_test, y_test))
    # scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
    # precision and recall are computed for the wrong roles and end up swapped.
    mlflow.log_metric("accuracy_score", accuracy_score(y_test, y_pred))
    mlflow.log_metric("precision_score", precision_score(y_test, y_pred))
    mlflow.log_metric("recall_score", recall_score(y_test, y_pred))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred))

    # Persist the best estimator as a run artifact.
    mlflow.sklearn.log_model(grid.best_estimator_, 'best_rf_model')

__Pipeline__

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

In [None]:
# Restrict the 20 newsgroups corpus to two categories (a binary problem) and
# fetch the train and test subsets, in that order.
cats = ['alt.atheism', 'sci.space']
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name, categories=cats)
    for subset_name in ('train', 'test')
)

In [None]:
# Unpack documents (.data) and labels (.target) of each fetched subset.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [None]:
# Text-classification pipeline: token counts -> tf-idf weighting -> linear SVM.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used when addressing
# step parameters as '<step>__<param>'.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ]
)

In [None]:
# Candidate settings for the text pipeline, as a list of sub-grids.
# LinearSVC with its default squared-hinge loss supports penalty='l1' only in
# the primal formulation (dual=False); with dual=True those candidates fail to
# fit. The l1 candidates therefore live in their own sub-grid that pins
# dual=False, while the l2 candidates keep the estimator's default solver.
param_grid = [
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l1'],
        'clf__dual': [False],
        'clf__C': [0.5, 0.7, 1]
    },
    {
        'vect__max_df': [0.8, 0.9, 1.0],
        'clf__penalty': ['l2'],
        'clf__C': [0.5, 0.7, 1]
    }
]

In [None]:
# 3-fold cross-validated search over the pipeline grid, ranked by F1.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
)
grid.fit(X_train, y_train)

In [None]:
# Display the parameter combination that scored best in the search above.
grid.best_params_

In [None]:
# Configure the pipeline with the combination the grid search actually
# selected, instead of hand-copied literals that go stale as soon as the data,
# the grid or the library version changes. Then refit on the full training
# set and predict the held-out documents.
pipeline.set_params(**grid.best_params_)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)

In [None]:
# F1 of the tuned pipeline on the held-out newsgroups test set.
f1_score(y_true=y_test, y_pred=y_preds)