In [6]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Parameters
# Folder for saving models and the fitted preprocessing objects
models_folder = "./models"
dataset = "./data_sample.csv"

# Create the output folder up front so later writes cannot fail on a missing directory
os.makedirs(models_folder, exist_ok=True)

# Loading the dataset
df = pd.read_csv(dataset)

# Label Encoding
# This only maps category names to integer ids, so fitting it on every row
# does not leak anything about the held-out features.
le = LabelEncoder()
y = le.fit_transform(df['main_category'])

# Train test split, done on the raw texts BEFORE vectorising.
# The shuffle depends only on the number of rows and random_state, so the rows
# assigned to each side are the same as when splitting the feature matrix.
names_train, names_test, y_train, y_test = train_test_split(
    df['name'], y, test_size=0.2, random_state=42
)

# TF-IDF vectorizer
# Vocabulary and IDF weights are learned from the training texts only; the
# held-out texts are merely transformed, so the evaluation is not contaminated.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(names_train)
X_test = vectorizer.transform(names_test)

# Full feature matrix, kept under its original name for any cell that uses it
X = vectorizer.transform(df['name'])

# Persist the fitted preprocessing next to the models: a pickled classifier
# cannot be applied to new text without the matching vectorizer and encoder.
with open(f"{models_folder}/vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)
with open(f"{models_folder}/labelencoder.pkl", "wb") as file:
    pickle.dump(le, file)

In [3]:
def model_train(model, model_name:str):
    """Fit a classifier, write its evaluation report and pickle the fitted model.

    The function reads the notebook-level ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``le`` and ``models_folder`` variables.

    Parameters
    ----------
    model
        Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Base name of the output files: ``<model_name>_eval.txt`` receives the
        classification report and ``<model_name>.pkl`` the pickled model, both
        inside ``models_folder``.

    Returns
    -------
    None
    """
    print(f"Started training {model_name}.")

    # Ensure the output folder exists before the (possibly long) fit, so a
    # missing directory cannot throw the training work away at write time.
    os.makedirs(models_folder, exist_ok=True)

    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    # Passing every encoded label explicitly keeps the report aligned with
    # le.classes_ even when a rare category is absent from both y_test and
    # y_pred; without `labels` that situation raises a ValueError.
    report = classification_report(
        y_test,
        y_pred,
        labels=list(range(len(le.classes_))),
        target_names=le.classes_,
        zero_division=0,  # same value as the default, minus the warning spam
    )
    # Explicit encoding: category names may contain non-ASCII characters.
    with open(f"{models_folder}/{model_name}_eval.txt", "w", encoding="utf-8") as file:
        file.write(report)

    # Save the model
    with open(f'{models_folder}/{model_name}.pkl', 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Train the two baseline models of this cell (logistic regression, decision tree)
# max_iter is raised above the default of 100 because the lbfgs solver stopped
# at the iteration limit on these features without converging.
model_train(LogisticRegression(max_iter=1000), "logisticregression")
# Fixed seed so the tree (and its saved report) is reproducible across runs.
model_train(DecisionTreeClassifier(random_state=42), "decisiontree")

Started training logisticregression.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Started training decisiontree.


In [5]:
# Fixed seed (same value as the train/test split) so the forest is reproducible.
model_train(RandomForestClassifier(random_state=42), "randomforest")

Started training randomforest.


In [7]:
# Train one support-vector classifier per kernel, in the order listed.
# NOTE(review): the "svc-polynomal" spelling is kept as-is because it is the
# on-disk file name of the saved model and report.
svc_runs = (
    ("linear", "svc-linear"),
    ("poly", "svc-polynomal"),
    ("rbf", "svc-rbf"),
    ("sigmoid", "svc-sigmoid"),
)
for svc_kernel, svc_name in svc_runs:
    model_train(SVC(kernel=svc_kernel), svc_name)

Started training svc-linear.
Started training svc-polynomal.
Started training svc-rbf.
Started training svc-sigmoid.
