In [18]:
from pathlib import Path  # https://realpython.com/python-pathlib/
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Resolve the CSV locations relative to the directory above the current working directory.
parent_path = Path.cwd().parent
train_path = parent_path / "dataset" / "formatted_train.csv"
test_path = parent_path / "dataset" / "formatted_test.csv"

# Read both splits in full; later cells derive smaller working frames from these.
train_df_total, test_df_total = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

In [25]:
# Smaller working dataframes: keep only the first 10000 rows of each split
train_df = train_df_total.iloc[:10000]
test_df = test_df_total.iloc[:10000]

In [26]:
def create_count_vector(df: pd.DataFrame) -> CountVectorizer:
    """
    Fit a CountVectorizer (bag-of-words vocabulary) on the "body" column of a dataframe.
    Missing bodies (NaN, e.g. empty cells in the CSV) are treated as empty documents,
    because CountVectorizer raises on non-string documents.
    :param df: dataframe with a "body" text column
    :return: CountVectorizer fitted on df["body"]
    """
    # Figure out later how we handle titles
    text = df["body"].fillna("")  # NaN -> "" so fitting does not fail on missing text
    cv = CountVectorizer()
    cv.fit(text)
    return cv

In [27]:
def format_df_to_bow(cv: CountVectorizer, df: pd.DataFrame) -> tuple:
    """
    Function to format text to a bag of words (BoW).
    The following code is primarily from https://www.kaggle.com/code/kashnitsky/topic-4-linear-models-part-4-pros-cons
    Missing bodies (NaN, e.g. empty cells in the CSV) are treated as empty documents
    instead of making the vectorizer raise; no rows are dropped, so x and y stay aligned.
    :param cv: already fitted vectorizer; only its transform() method is used here
    :param df: dataframe with data ("body" text column and "sentiment" label column)
    :return: tuple of x (words/feature, i.e. whatever cv.transform returns) and y (label)
    """
    # Figure out later how we handle titles
    text = df["body"].fillna("")  # NaN -> "" so transform does not fail on missing text
    x = cv.transform(text)
    y = df["sentiment"]

    return x, y

In [28]:
# Fit the vocabulary on the training subset; the same fitted vectorizer is reused for both splits.
cv = create_count_vector(df=train_df)

In [29]:
# Vectorise both splits with the one fitted vectorizer so they share the same feature columns.
(x_train, y_train), (x_test, y_test) = (
    format_df_to_bow(cv, split_df) for split_df in (train_df, test_df)
)

In [30]:
# Logistic regression classifier trained on the bag-of-words training features.
bow_log_reg = LogisticRegression(
    solver="lbfgs",
    n_jobs=-1,
    random_state=7,
).fit(x_train, y_train)

In [31]:
# Report mean accuracy on each split, rounded to three decimals (train first, then test).
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Test Accuracy: ", x_test, y_test),
):
    print(split_label, round(bow_log_reg.score(split_x, split_y), 3))

Train accuracy:  0.99
Test Accuracy:  0.816
