In [1]:
# Importing useful libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Reading the three ' ::: '-delimited text files into data frames

# Options common to every read. A separator longer than one character is
# interpreted by pandas as a regular expression, which is only supported by
# the python parsing engine.
read_options = dict(sep=' ::: ', engine='python')

# Column labels for the files that carry a genre. Because labels are passed
# through names=, the first line of each file is read as data, not a header.
labelled_columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

train_df = pd.read_csv('data/train_data.txt', names=labelled_columns, **read_options)
test_df = pd.read_csv('data/test_data.txt', names=['ID', 'TITLE', 'DESCRIPTION'], **read_options)
test_solution_df = pd.read_csv('data/test_data_solution.txt', names=labelled_columns, **read_options)

In [3]:
# Separating the model inputs (descriptions) from the targets (genres)

# Training split: both columns are taken from the same frame.
x_train, y_train = train_df['DESCRIPTION'], train_df['GENRE']

# Test split: the descriptions and the genres come from two different frames,
# so pairing them relies on both frames listing their rows in the same order.
# NOTE(review): presumably the two test files are row-aligned -- confirm.
x_test, y_test = test_df['DESCRIPTION'], test_solution_df['GENRE']

In [4]:
# Converting the descriptions into TF-IDF feature matrices

# English stop words are removed and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is then transformed with that already-fitted vectorizer.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [5]:
# Candidate classifiers, keyed by the display name used when reporting

classifiers = dict([
    ('Naive Bayes', MultinomialNB()),
    # max_iter raised to 1000 to give the solver more iterations to converge.
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
])

In [6]:
# Hyper-parameter grids for GridSearchCV, one per classifier name

parameters_cv = {}

# Candidate values for the 'alpha' parameter of the Naive Bayes model.
parameters_cv['Naive Bayes'] = {'alpha': [0.001, 0.01, 0.1, 1.0]}

# Candidate values for the 'C' parameter of the Logistic Regression model.
parameters_cv['Logistic Regression'] = {'C': [0.01, 0.1, 1]}

In [7]:
# Tuning each classifier with cross-validation, then scoring it on the test set

results = {}

for name, model in classifiers.items():
    # 5-fold grid search over this classifier's candidate values, ranked by
    # accuracy; n_jobs=-1 runs the fits on all available processors.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=parameters_cv[name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(x_train_tfidf, y_train)

    # Take the estimator refit with the winning parameters and predict the
    # labels for the test matrix.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(x_test_tfidf)

    # Test-set metrics; 'weighted' averages the per-class F1 by class support.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # One summary record per classifier, keyed by its display name.
    results[name] = {
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy,
        'F1 Score': f1,
    }

In [8]:
# Reporting the best parameters and test-set scores of every classifier

for name in results:
    result = results[name]
    # A single print call; sep="\n" puts each field on its own line and the
    # last item is a 30-dash rule separating consecutive classifiers.
    print(
        f"Classifier: {name}",
        f"Best Params: {result['Best Params']}",
        f"Accuracy: {result['Accuracy']:.4f}",
        f"F1 Score: {result['F1 Score']:.4f}",
        "-" * 30,
        sep="\n",
    )

Classifier: Naive Bayes
Best Params: {'alpha': 0.1}
Accuracy: 0.5454
F1 Score: 0.4873
------------------------------
Classifier: Logistic Regression
Best Params: {'C': 1}
Accuracy: 0.5839
F1 Score: 0.5453
------------------------------
