# Baseline Models

- Using three baseline models for the development of an analysis that detects political language in court documents.
    1. Logistic regression
    2. Naive Bayes
    3. SVM
- Using baselines as a means for testing model performance and building an accurate model for classifying language used in Bulgaria's constitutional court.

## Next steps

- Data needs more sentences labelled as political, as data is imbalanced and models observe few political sentences
- Need to optimize hyperparameters to improve model performance

In [1]:
import glob
import os
import pickle
import re
from pathlib import Path

import nltk
import pandas as pd

nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn import naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/paulj1989/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load every JSON file in the data directory into one DataFrame.
# glob returns paths in filesystem-dependent order, so they are sorted to keep
# the row order (and therefore the seeded train/test split) reproducible.
files = sorted(glob.glob("data/json/*.json"))
data = []

# Read each file and tag its rows with the document they came from.
# (The loop variable is named `path` so it does not shadow the stdlib `json` module.)
for path in files:
    frame = pd.read_json(path)
    # doc_id = file name without its directory and without the .json extension
    frame["doc_id"] = os.path.splitext(os.path.basename(path))[0]
    data.append(frame)

# Stack all per-document frames into a single frame with a fresh 0..n-1 index.
df = pd.concat(data, ignore_index=True)


## Cleaning Text

In [None]:
# Collapse label_id to a binary target: rows whose label_id is 4 (POLITICAL)
# become 1, every other row becomes 0.
is_political = df["label_id"] == 4
df.loc[~is_political, "label_id"] = 0
df.loc[is_political, "label_id"] = 1


In [None]:
# Patterns are compiled once because the function is applied to every row.
_HTML_TAG_RE = re.compile(r"<[^>]*>")
_NON_WORD_RE = re.compile(r"[^\w\s]")
# English stop words, loaded lazily on the first call that needs them and then
# reused, instead of re-reading the corpus list for every sentence.
_DEFAULT_STOP_WORDS = None


def preprocessing(text, stop_words=None):
    """Return a cleaned, lower-cased version of *text* with stop words removed.

    Steps, in order: strip anything that looks like an HTML tag, delete every
    character that is neither a word character nor whitespace, lower-case,
    split on whitespace, drop stop words, and re-join with single spaces.

    Args:
        text: The raw string to clean.
        stop_words: Optional container of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The cleaned string (empty if nothing survives the filtering).
    """
    global _DEFAULT_STOP_WORDS
    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _DEFAULT_STOP_WORDS

    text = _HTML_TAG_RE.sub("", text)
    # NOTE(review): punctuation is removed before the stop-word filter, so a
    # stop word that contains an apostrophe can no longer match its stripped
    # form in the text - confirm whether that is intended.
    text = _NON_WORD_RE.sub("", text)
    return " ".join(
        word for word in text.lower().split() if word not in stop_words
    )

In [None]:
# run the cleaning function over every entry of the text column, in place
df["text"] = df["text"].apply(preprocessing)

In [None]:
# Show the frame as the cell output for a visual check.
# Uncomment the next line to raise pandas' row-display limit to 371 rows:
# pd.set_option('display.max_rows', 371)
df

In [None]:
# single Porter stemmer instance, created once and reused by token_ps
ps = PorterStemmer()


def token_ps(text):
    """Split *text* on whitespace and return the Porter stem of each piece."""
    return list(map(ps.stem, text.split()))

## Logistic Regression

- Computing a logistic regression model based on the values created from a vectorizer algorithm called tf-idf, which stands for term-frequency inverse document frequency.
- tf-idf measures the originality of the word by comparing how often it appears in a doc with the number of docs the word appears in. The frequency of the words in a doc (compared against other docs) measures the importance of that word in the wider corpus.
- The logistic regression below is computed by building a vector of word values based on the importance of each word, before using the word vectors to identify the characteristics of the political label to predict which sentences will be political.

In [None]:
# Convert each sentence into a TF-IDF weighted, L2-normalised term vector.
# lowercase=False: the vectorizer does no case folding of its own.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    use_idf=True,
    norm="l2",
    smooth_idf=True,
)
# Learn the vocabulary/IDF weights from the 'text' column and vectorise it.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the IDF weights also see the held-out rows - confirm this is acceptable.
X = tfidf.fit_transform(df["text"])
# target labels as a plain array, aligned row-for-row with X
y = df["label_id"].values

In [None]:
# Hold out 30% of the rows (shuffled with a fixed seed) so the model is
# scored on sentences it was not trained on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=0
)

# Logistic regression with built-in cross-validation:
# cv=10 folds, accuracy as the selection metric, all CPU cores (n_jobs=-1).
log_reg = LogisticRegressionCV(
    cv=10, scoring="accuracy", n_jobs=-1, verbose=3, max_iter=500
)
log_reg.fit(X_train, y_train)

# predicted labels for the held-out rows
log_predictions = log_reg.predict(X_test)


In [None]:
def model_accuracy(name, preds):
    """Print test-set metrics for one model.

    Compares *preds* against the module-level ``y_test`` and prints a header
    containing *name*, the weighted-average F1 score, and the full
    classification report.
    """
    print("---{} Test Set Results---".format(name))
    # weighted F1: per-class F1 averaged by class support; the single figure
    # used to compare the classifiers against each other
    weighted_f1 = f1_score(y_test, preds, average="weighted")
    print("Weighted F1 Average: {}".format(weighted_f1))
    # per-class breakdown:
    #   precision = share of predictions for a class that were correct
    #   recall    = share of a class's true members that were found
    #   f1-score  = harmonic mean of precision and recall
    report = classification_report(y_test, preds)
    print(report)

In [None]:
# report held-out performance of the logistic regression
model_accuracy(name="Logit", preds=log_predictions)

## Naive Bayes

In [None]:
# Recompute the TF-IDF values for the 'text' column, then convert the result
# to a dense array; the original author added .toarray() to adjust the data
# structure so that the NB model would run without error.
# NOTE(review): scikit-learn documents MultinomialNB as accepting sparse
# input - confirm whether the dense copy is still needed.
X = tfidf.fit_transform(df["text"])
X = X.toarray()
y = df["label_id"].values

In [None]:
# Re-create the 70/30 train/test partition on the dense matrix, using the
# same seed and shuffle settings as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=0
)

In [None]:
# train a multinomial Naive Bayes classifier on the training split
nb = naive_bayes.MultinomialNB()
nb.fit(X_train, y_train)

# predict the held-out rows and print the evaluation report
nb_predictions = nb.predict(X_test)
model_accuracy(name="Naive Bayes", preds=nb_predictions)

## Support Vector Machines (SVM)

In [None]:
# train a linear-kernel support vector classifier on the training split
SVM = svm.SVC(
    C=1.0,
    kernel="linear",
    degree=3,
    gamma="auto",
)
SVM.fit(X_train, y_train)

# predict the held-out rows and print the evaluation report
svm_predictions = SVM.predict(X_test)
model_accuracy(name="SVM", preds=svm_predictions)

## Pickling Models (for Future Use)

In [None]:
# Persist the fitted vectorizer and the three trained models to the current
# working directory. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes, rather than being left
# open for the garbage collector to clean up.

# saving tfidf
with open("tfidf.pickle", "wb") as fh:
    pickle.dump(tfidf, fh)

# saving models
with open("log_reg.pickle", "wb") as fh:
    pickle.dump(log_reg, fh)
with open("nb.pickle", "wb") as fh:
    pickle.dump(nb, fh)
with open("svm.pickle", "wb") as fh:
    pickle.dump(SVM, fh)