### Prepare Data

In [None]:
import os
import random
import re
import sys
import xml.etree.ElementTree
import yaml

In [None]:
def process_posts(input_lines, target_tag, split):
    """
    Split XML post rows into train/test TSV files.

    Each input line is parsed as a single XML element. Its ``Id`` attribute,
    a binary label (1 when ``target_tag`` occurs in the ``Tags`` attribute,
    otherwise 0) and the whitespace-normalised ``Title`` + ``Body`` text are
    written as one tab-separated row to ``data/prepared/train.tsv`` or
    ``data/prepared/test.tsv``. Both files are overwritten on every call.

    Args:
        input_lines: Iterable of strings, one XML element per item.
        target_tag: Substring searched for in the ``Tags`` attribute.
        split: Threshold compared against ``random.random()``; a row goes to
            the train file when the draw is greater than ``split`` and to the
            test file otherwise.

    Lines that raise while being processed are reported on stderr together
    with their 1-based position in ``input_lines`` and are skipped.
    """
    prepared_dir = os.path.join("data", "prepared")
    train_csv_path = os.path.join(prepared_dir, "train.tsv")
    test_csv_path = os.path.join(prepared_dir, "test.tsv")
    os.makedirs(prepared_dir, exist_ok=True)

    # Context managers close both files even when something escapes the loop
    # (for example a KeyboardInterrupt, which `except Exception` does not catch).
    with open(train_csv_path, "w", encoding="utf-8") as train_fd, open(
        test_csv_path, "w", encoding="utf-8"
    ) as test_fd:
        # enumerate ties the reported number to the real position of the line,
        # so earlier skipped lines no longer shift later line numbers.
        for num, line in enumerate(input_lines, start=1):
            try:
                # One random draw per input line, taken before parsing.
                fd_out = train_fd if random.random() > split else test_fd
                attr = xml.etree.ElementTree.fromstring(line).attrib

                pid = attr.get("Id", "")
                label = 1 if target_tag in attr.get("Tags", "") else 0
                # Collapse whitespace runs so that the text cannot contain the
                # tab/newline characters used as TSV separators.
                title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
                body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
                text = title + " " + body

                fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
            except Exception as ex:
                # Best effort: a malformed line must not abort the whole run.
                sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")

In [None]:
# NOTE: `input` shadows the built-in of the same name; the name is kept so
# that any code outside this cell that reads it keeps working.
input = "data/data.xml"
split = 0.20
seed = 47

# Seed the RNG so the train/test assignment made inside process_posts is
# reproducible between runs (the seed was previously defined but never used).
random.seed(seed)

# The prepared files are written as UTF-8, so read the source the same way
# instead of relying on the platform default encoding.
with open(input, encoding="utf-8") as fd_in:
    input_lines = fd_in.readlines()

process_posts(input_lines=input_lines, target_tag="<r>", split=split)


### Feature Engineering

In [None]:
import os
import pickle
import sys

import numpy as np
import pandas as pd
import scipy.sparse as sparse
import yaml
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [None]:
def get_df(data):
    """Load a header-less, tab-separated file into an id/label/text data frame.

    The shape of the loaded frame is reported on stderr before it is returned.
    """
    column_names = ["id", "label", "text"]
    frame = pd.read_csv(
        data,
        sep="\t",
        header=None,
        names=column_names,
        encoding="utf-8",
    )
    sys.stderr.write(f"The input data frame {data} size is {frame.shape}\n")
    return frame

In [None]:
def generate_and_save_train_features(bag_of_words, tfidf):
    """
    Fit the vectorizers on the training text and save the train features.

    Reads ``data/prepared/train.tsv``, lower-cases its text column, fits
    ``bag_of_words`` and ``tfidf`` on it and stores the resulting TF-IDF
    matrix together with the feature names in ``data/features/train.pkl``.

    Args:
        bag_of_words: Vectorizer exposing ``fit_transform`` and
            ``get_feature_names_out`` (the notebook passes a
            ``CountVectorizer``). It is fitted in place.
        tfidf: Transformer exposing ``fit_transform`` (the notebook passes a
            ``TfidfTransformer``). It is fitted in place.
    """
    df_train = get_df("data/prepared/train.tsv")
    train_words = np.array(df_train.text.str.lower().values)

    # fit_transform is documented as equivalent to fit() followed by
    # transform(), without vectorising the corpus a second time.
    train_words_count_matrix = bag_of_words.fit_transform(train_words)
    feature_names = bag_of_words.get_feature_names_out()

    train_words_tfidf_matrix = tfidf.fit_transform(train_words_count_matrix)

    # Build the output file name from the directory that was just created so
    # the two can never drift apart.
    out_path = "data/features"
    os.makedirs(out_path, exist_ok=True)
    train_output = os.path.join(out_path, "train.pkl")
    save_matrix(df_train, train_words_tfidf_matrix, feature_names, train_output)

In [None]:
def generate_and_save_test_features(bag_of_words, tfidf):
    """
    Transform the test text with fitted vectorizers and save the features.

    Reads ``data/prepared/test.tsv``, lower-cases its text column, applies
    ``bag_of_words`` and ``tfidf`` without refitting them and stores the
    resulting TF-IDF matrix together with the feature names in
    ``data/features/test.pkl``.

    Args:
        bag_of_words: Already fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        tfidf: Already fitted transformer exposing ``transform``.
    """
    df_test = get_df("data/prepared/test.tsv")
    test_words = np.array(df_test.text.str.lower().values)

    # Only transform here: the vocabulary and IDF weights come from the
    # objects as they were passed in.
    test_words_count_matrix = bag_of_words.transform(test_words)
    test_words_tfidf_matrix = tfidf.transform(test_words_count_matrix)
    feature_names = bag_of_words.get_feature_names_out()

    # Build the output file name from the directory that was just created so
    # the two can never drift apart.
    out_path = "data/features"
    os.makedirs(out_path, exist_ok=True)
    test_output = os.path.join(out_path, "test.pkl")
    save_matrix(df_test, test_words_tfidf_matrix, feature_names, test_output)

In [None]:
def save_matrix(df, matrix, names, output):
    """
    Pickle a feature matrix prefixed with id and label columns.

    Args:
        df: Data frame providing the ``id`` and ``label`` columns; both are
            cast to int64.
        matrix: Feature matrix that is stacked to the right of the two columns.
        names: Feature names stored next to the matrix.
        output: Path of the pickle file to write.

    The pickle holds the tuple ``(result, names)``, where ``result`` is a CSR
    matrix laid out as ``[id, label, features...]``.
    """

    def as_column(series):
        # The 1-D series becomes a single-row sparse matrix; transposing it
        # yields the column that hstack needs.
        return sparse.csr_matrix(series.astype(np.int64)).T

    blocks = [as_column(df.id), as_column(df.label), matrix]
    result = sparse.hstack(blocks, format="csr")

    sys.stderr.write(
        "The output matrix {} size is {} and data type is {}\n".format(
            output, result.shape, result.dtype
        )
    )

    with open(output, "wb") as fd:
        pickle.dump((result, names), fd)

In [None]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Vectorizer settings: cap on the vocabulary size and the longest n-gram.
max_features = 100
ngrams = 1

tfidf = TfidfTransformer(smooth_idf=False)
bag_of_words = CountVectorizer(
    ngram_range=(1, ngrams),
    max_features=max_features,
    stop_words="english",
)

# The train step fits both objects; the test step only transforms with them,
# so it has to run second.
generate_and_save_train_features(bag_of_words, tfidf)
generate_and_save_test_features(bag_of_words, tfidf)


### Model Training

In [None]:
import os
import pickle
import sys
import numpy as np
import yaml
from sklearn.ensemble import RandomForestClassifier

In [None]:
def train():
    """
    Fit a random forest on the saved training features.

    Loads the matrix stored in ``data/features/train.pkl``, uses column 1 as
    the labels and columns 2 onwards as the features, and reports the matrix
    shapes on stderr.

    Returns:
        sklearn.ensemble.RandomForestClassifier: Trained classifier.
    """
    features_dir = "data/features"

    # Model hyper-parameters.
    n_est = 100
    min_split = 0.01
    seed = 47

    # Load the data; the feature names stored alongside are not needed here.
    train_pkl = os.path.join(features_dir, "train.pkl")
    with open(train_pkl, "rb") as fd:
        matrix, _ = pickle.load(fd)

    x = matrix[:, 2:]
    # Densify the label column and drop the singleton axis.
    labels = np.squeeze(matrix[:, 1].toarray())

    sys.stderr.write("Input matrix size {}\n".format(matrix.shape))
    sys.stderr.write("X matrix size {}\n".format(x.shape))
    sys.stderr.write("Y matrix size {}\n".format(labels.shape))

    forest = RandomForestClassifier(
        random_state=seed,
        n_jobs=2,
        min_samples_split=min_split,
        n_estimators=n_est,
    )
    forest.fit(x, labels)

    return forest

In [None]:
# Destination of the pickled classifier.
output = "model.pkl"

# Train the model, then persist it for the evaluation step.
clf = train()
with open(output, "wb") as fd:
    pickle.dump(clf, fd)


### Evaluate

In [None]:
import json
import math
import os
import pickle
import sys

import pandas as pd
from sklearn import metrics
from sklearn import tree
from dvclive import Live
from matplotlib import pyplot as plt

In [None]:
def evaluate(model, matrix, split, save_path):
    """
    Compute class-1 probabilities for the rows of a feature matrix.

    Column 1 of ``matrix`` is read as the integer labels and columns 2
    onwards are passed to ``model.predict_proba``. The feature matrix shape is
    reported on stderr.

    NOTE(review): as written, the labels and predictions are computed but
    neither returned nor stored, and ``split`` and ``save_path`` are not used.
    Dumping metrics and plots still has to be implemented here.
    """
    features = matrix[:, 2:]
    y_true = matrix[:, 1].toarray().astype(int)
    sys.stderr.write("Input matrix size {}\n".format(features.shape))

    # Second column of predict_proba's output, i.e. the score for class index 1.
    class_probabilities = model.predict_proba(features)
    y_score = class_probabilities[:, 1]

In [None]:
def save_importance_plot(model, feature_names, save_path=None):
    """
    Draw a bar chart of the 30 largest feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Labels for the importances, in the same order.
        save_path: Optional file path. When given, the figure is written
            there with ``Figure.savefig`` (the parent directory is created if
            needed). When omitted, the figure is only drawn, which is the
            previous behaviour.
    """
    fig, axes = plt.subplots(dpi=100)
    # Leave extra room at the bottom for the x tick labels.
    fig.subplots_adjust(bottom=0.2, top=0.95)
    axes.set_ylabel("Mean decrease in impurity")

    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=feature_names).nlargest(n=30)
    forest_importances.plot.bar(ax=axes)

    if save_path is not None:
        parent = os.path.dirname(save_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        fig.savefig(save_path)


In [None]:

EVAL_PATH = "eval"

model_file = "model.pkl"
feature_path = "data/features"

# Load model and data.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts produced by this pipeline.
with open(model_file, "rb") as fd:
    model = pickle.load(fd)

# The matrices get their own names so that they do not overwrite the train()
# function defined in the model training section; rebinding `train` would
# break re-running that cell.
with open(os.path.join(feature_path, "train.pkl"), "rb") as fd:
    train_matrix, feature_names = pickle.load(fd)

with open(os.path.join(feature_path, "test.pkl"), "rb") as fd:
    test_matrix, feature_names = pickle.load(fd)

# Evaluate train and test datasets.
evaluate(model, train_matrix, "train", save_path=EVAL_PATH)
evaluate(model, test_matrix, "test", save_path=EVAL_PATH)

# Dump feature importance plot.
save_importance_plot(model, feature_names)

