# Logistic Regression for Text Classification

**Imports**

In [1]:
import os
import glob
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from typing import Iterator, Tuple, List
from iterator_utils import chain_unique


For this task we can reuse the work we did for Naive Bayes, that dealt with the
construction of features. For Logistic Regression we'll use the implementation provided
by the `sklearn` library and the `SGDClassifier` class, which contains various linear model
training implementations.

Up to this point, we have a feature representation per type of document, however we are
required to have a complete matrix with the vocabulary from all files as columns, and all
the documents as rows. This would result in a 18828 x 80086 matrix, which is a very memory
demanding data structure.

Instead of building one matrix, we can again build a matrix per class, but each matrix with
the complete vocabulary (and the target of each document). We can then train the model using
the `partial_fit` function from `sklearn`, that calculates a batch gradient descent over the
given matrix with partial examples.

The following function build the said set of matrices from the already constructed bag of
words

In [2]:
enc = "utf-8"
newsgroups_path = "../data/20news-18828"
vocab_size_path = f"{newsgroups_path}/20N_vocab_size"
newsgroups_all_files = glob.glob(f"{newsgroups_path}/*")
newsgroups_dir_paths = [
    dir for dir in newsgroups_all_files if os.path.isdir(dir)
]
classes = [dir.split("/")[-1] for dir in newsgroups_dir_paths]
class_ids = [i for i, _ in enumerate(classes)]
dataframe_cols = ["doc", "text", "class"]
print(classes)

vocab_size_file = open(vocab_size_path)
V = int(vocab_size_file.readline())
vocab_size_file.close()
docs_data_filepaths = [
    f"{dir_path}/{dir_name}.csv" for dir_path, dir_name in zip(newsgroups_dir_paths, classes)
]
bow_filepaths = [
    f"{dir_path}/{dir_name}_bow.csv" for dir_path, dir_name in zip(newsgroups_dir_paths, classes)
]
binbow_filepaths = [
    f"{dir_path}/{dir_name}_binbow.csv" for dir_path, dir_name in zip(newsgroups_dir_paths, classes)
]
features_filepaths = [
    f"{dir_path}/{dir_name}_features.csv" for dir_path, dir_name in zip(newsgroups_dir_paths, classes)
]

def get_vocabulary (bows):
    return set(chain_unique(bow.columns for bow in bows))

def save_complete_data (bows: List[pd.DataFrame]):
    vocab = get_vocabulary(bows)
    for bow, file_path, c in zip(bows, features_filepaths, class_ids):
        missing_words = vocab.difference(set(bow.columns))
        missing_data = pd.DataFrame(0, index=bow.index, columns=missing_words)
        class_data = pd.DataFrame(c, index=bow.index, columns=["class_id"])
        print(missing_data)
        complete_data = pd.concat([bow, missing_data, class_data], axis=1)
        print(
            len(vocab),
            len(bow.columns),
            len(missing_words),
            len(bow.columns) + len(missing_words)
        )
        print(complete_data)
        complete_data.to_csv(file_path)

['sci.crypt', 'misc.forsale', 'sci.med', 'rec.sport.hockey', 'alt.atheism', 'comp.sys.mac.hardware', 'comp.os.ms-windows.misc', 'talk.politics.mideast', 'soc.religion.christian', 'talk.politics.misc', 'talk.politics.guns', 'rec.motorcycles', 'comp.windows.x', 'comp.graphics', 'rec.sport.baseball', 'comp.sys.ibm.pc.hardware', 'sci.electronics', 'sci.space', 'rec.autos', 'talk.religion.misc']


As with Naive Bayes, we have the same partition functions

In [3]:
DataSplit = Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]

def partition_docs (docs: np.ndarray):
    n = len(docs)
    train_size = int(round(0.6*n))
    test_size = int(round(0.3*n))
    train_docs = docs[:train_size]
    test_docs = docs[train_size:train_size + test_size]
    dev_docs = docs[train_size + test_size:]
    return train_docs, test_docs, dev_docs

def partition_data (data_set: Iterator[pd.DataFrame]) -> Iterator[DataSplit]:
    for data in data_set:
        docs = data.index.to_numpy()
        train_docs, test_docs, dev_docs = partition_docs(docs)
        train = data.iloc[train_docs]
        test = data.iloc[test_docs]
        dev = data.iloc[dev_docs]
        yield train, test, dev

We then proceed to train the model using `partial_fit`. In order to avoid
repeating the training process, we can save the parameters of the model
using `pickle`. We also save the dataframes that are going to be used
for training and development. Note that since the partition functions
are generators, we don't load everything into memory avoiding overflow
erros. Each of the 20 groups of dataframes is held in memory only once in this
case, in contrast to the Naive Bayes excersice

In [4]:
def train_and_save (
        classifier,
        data_set: Iterator[pd.DataFrame],
        model_filepath: str,
        test_filepaths: List[str],
        dev_filepaths: List[str]
):
    for (train, test, dev), test_filepath, dev_filepath \
            in zip(partition_data(data_set), test_filepaths, dev_filepaths):
        print("Training...")
        t0 = time.time()
        X = train.iloc[:,:-1]
        y = train.class_id
        print(X, y)
        classifier.partial_fit(X, y, classes=class_ids)
        t1 = time.time()
        print(f"Finished batch in {t1 - t0}")
        print("Saving test and dev set to:")
        print(test_filepath)
        print(dev_filepath)
        test.to_csv(test_filepath)
        dev.to_csv(dev_filepath)
    with open(model_filepath, "wb") as model_file:
        pickle.dump(classifier, model_file)

We use the trained model on the test dataframes

In [None]:
newsgroups_processing_path = "../data/20N_processing"
test_dir_path = f"{newsgroups_processing_path}/20N_test"
dev_dir_path = f"{newsgroups_processing_path}/20N_dev"
models_dir_path = f"{newsgroups_processing_path}/20N_models"
test_filepaths = [f"{test_dir_path}/{c}_test.csv" for c in classes]
dev_filepahts = [f"{dev_dir_path}/{c}_dev.csv" for c in classes]
lr_model_path = f"{models_dir_path}/20N_LR_model.pkl"
lr_classifier = SGDClassifier(loss="log")

precision_sum = 0
recall_sum = 0
f1_sum = 0
with open(lr_model_path, "rb") as model_file:
    clf = pickle.load(model_file)
    for test_path in test_filepaths:
        test_data = pd.read_csv(test_path, index_col=0)
        X_test = test_data.iloc[:,:-1]
        print(X_test)
        y_expected = test_data.class_id
        y_predicted = clf.predict(X_test)
        print(y_expected)
        print(y_predicted)
        precision = precision_score(y_expected, y_predicted, average="macro")
        recall = recall_score(y_expected, y_predicted, average="macro")
        f1 = f1_score(y_expected, y_predicted, average="macro")
        precision_sum += precision
        recall_sum += recall
        f1_sum += f1
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1}")

num_tests = len(test_filepaths)
precision_avg = precision_sum / num_tests
recall_avg = recall_sum / num_tests
f1_avg = f1_sum / num_tests

print(f"Average precision: {precision_avg}")
print(f"Average recall: {recall_avg}")
print(f"Average F1: {f1_avg}")

     ef  text  white  hous  announc  q  as  clipper  chip  encrypt  ...  \
373   0     0      0     0        0  0   0        1     0        6  ...   
911   0     0      0     0        0  0   0        1     1        1  ...   
767   0     0      0     0        0  0   0        3     4        0  ...   
621   0     0      0     0        1  0   0        1     0        0  ...   
869   0     0      0     0        0  0   0        1     1        0  ...   
..   ..   ...    ...   ...      ... ..  ..      ...   ...      ...  ...   
287   0     0      0     0        0  0   0        2     1        0  ...   
524   0     0      0     0        0  0   1        0     0        1  ...   
279   0     0      0     0        0  0   0        0     0        0  ...   
875   0     0      0     0        0  0   0        6     9        6  ...   
727   0     1      1     1        2  1   2        3     2        4  ...   

     mlist  monarchian  dorman  compoment  castl  beleif  wsqmc  hmb  \
373      0           0     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
