In [8]:
import pandas as pd
import numpy as np
import json

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings; warnings.simplefilter('ignore')  # Ignore warning on model fitting.

# Data preparation

In [2]:
config_file = '../references/config.json'
with open(config_file, "r") as f:
    config = json.load(f)

# Extract values from the configuration file
texts_data_path = config["data_paths"]["texts"]
embeddings_data_path = config["data_paths"]["embeddings"]
full_df_data_path = config["data_paths"]["full_df"]

In [3]:
df = pd.read_csv(full_df_data_path, index_col=0)
features = df.iloc[:, :-1].values
labels = df.iloc[:, -1].values
features.shape, labels.shape

((2237, 1068), (2237,))

# Logistic regression pipeline

In [4]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.3)

In [7]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.6408945686900959


In [10]:
target_names = ['negative', 'positive', 'neutral']  # name of each class
lr_clf = LogisticRegression().fit(train_features, train_labels)
print('-'*15, 'Test classification_report', '-'*15, '\n')
print(classification_report(test_labels, lr_clf.predict(test_features), target_names=target_names))

--------------- Test classification_report --------------- 

              precision    recall  f1-score   support

    negative       0.70      0.66      0.68       228
    positive       0.67      0.71      0.69       268
     neutral       0.45      0.44      0.45       176

    accuracy                           0.62       672
   macro avg       0.61      0.60      0.61       672
weighted avg       0.62      0.62      0.62       672

