In [None]:
import mlflow


from data_ingest import get_data

In [None]:
import pandas as pd


train_df, test_df = get_data("parulpandey/emotion-dataset")


In [3]:
train_df = train_df[train_df["label"].isin([1, 0])]
test_df = test_df[test_df["label"].isin([1, 0])]


In [None]:
from preprocess import normalized_sentence,remove_short_sentences

In [None]:


train_df["text"] = train_df["text"].apply(normalized_sentence)
test_df["text"] = test_df["text"].apply(normalized_sentence)


In [5]:
train_data = remove_short_sentences(train_df, "text", 4)
test_data = remove_short_sentences(test_df, "text", 4)


In [6]:
X_train = train_df["text"]
X_test = test_df["text"]


y_train = train_df["label"]
y_test = test_df["label"]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [8]:
pipeline = Pipeline(
    [("feature_transformer", None),
     
      ("model", None)]


)


In [9]:
feature_transformer_settings = {
    "feature_transformer": [CountVectorizer(), TfidfVectorizer()],
    "feature_transformer__max_features": [ 1000],
}

In [10]:
param_grid = [
    {
        **feature_transformer_settings,
        "model": [LogisticRegression(solver="liblinear")],
        "model__penalty": ["l1", "l2"],
    },
    {
        **feature_transformer_settings,
        "model": [DecisionTreeClassifier()],
        "model__max_depth": [5, 10, 15],
        "model__min_samples_split": [2, 5, 10],
    },
    {
        **feature_transformer_settings,
        "model": [RandomForestClassifier()],
        "model__n_estimators": [50, 100,200],
        "model__max_depth": [5, 10, 15],
    },
    {
        **feature_transformer_settings,
        "model": [GradientBoostingClassifier()],
        "model__n_estimators": [50, 100, 200],
        "model__max_depth": [2,3,5],
    },
]


In [11]:
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, average="weighted"),
    "recall": make_scorer(recall_score, average="weighted"),
}


In [12]:
gs = GridSearchCV(
    estimator=pipeline, param_grid=param_grid, scoring=scoring, refit="accuracy"
)


In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.sklearn.autolog(max_tuning_runs=None)
mlflow.set_experiment(experiment_name="exp_second")

with mlflow.start_run():
    gs.fit(
        X_train,
        y_train,
    )


In [14]:
gs.best_params_

{'feature_transformer': TfidfVectorizer(),
 'feature_transformer__max_features': 1000,
 'model': LogisticRegression(solver='liblinear'),
 'model__penalty': 'l1'}