In [None]:
# !pip install scikit-multilearn

In [None]:
# !pip install xgboost

In [None]:
# !pip install jsonlines

In [None]:
import pandas as pd
import numpy as np
import sys
import xgboost as xgb
import time
import jsonlines
import json
import regex as re
from bs4 import BeautifulSoup

# ML modules and classes
from skmultilearn.problem_transform import BinaryRelevance, LabelPowerset, ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# NLP modules and classes
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the text-preprocessing step.
# Recent NLTK releases load the 'punkt_tab' tables for word_tokenize while
# older ones load 'punkt', so both are requested to keep a fresh-kernel
# "Run All" working on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Display options
np.set_printoptions(threshold=sys.maxsize)   # print arrays in full, never summarised
pd.set_option('display.max_colwidth', None)  # do not truncate long text cells

In [None]:
# Import data: a tab-separated file read with the column names supplied here
# (they are passed via `names=` rather than taken from the file).
col_names = ['id', 'title', 'body', 'tags']

df_raw = pd.read_csv(
    r'./data/training_data.tsv',
    sep='\t',
    names=col_names,
)

In [None]:
# Work on the first 40,000 rows only. `.copy()` detaches the subset from
# `df_raw`, so the column assignments made in later cells modify an
# independent frame instead of a slice of the raw data (this is what
# triggers pandas' SettingWithCopyWarning).
df = df_raw.head(n = 40000).copy()
df.head()

In [None]:
def remove_html_code_tags(input_string):
    """Strip code snippets and HTML markup from a string.

    Everything from a ``<code>`` tag up to the next ``</code>`` (tags
    included, newlines allowed) is deleted first. The remainder is parsed
    with Beautiful Soup's 'lxml' parser and only its text content is returned.
    """
    without_code = re.sub(r'<code>.*?</code>', '', input_string, flags=re.DOTALL)
    return BeautifulSoup(without_code, 'lxml').get_text()


# Clean every post body (code blocks and HTML tags removed), then discard
# the raw 'body' column.
df['body_no_html'] = df['body'].map(remove_html_code_tags)
df = df.drop(columns=['body'])

df.head()

In [None]:
# Cache for the NLTK objects used by preprocess_text. It is filled on first
# use so the stop-word corpus is read once instead of once per document.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize a raw text string into a space-joined string of lemmas.

    Steps, in order: lowercase; delete every character that is not an ASCII
    letter, digit or whitespace; collapse whitespace runs to one space;
    tokenize with NLTK's ``word_tokenize``; drop English stop words;
    lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Normalization
    text = text.lower()

    # Remove non-alphanumeric characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    # Tokenization, stop-word removal and lemmatization
    tokens = word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Build one document per post ("<title>. <body text>"), run it through the
# preprocessing pipeline and keep only the cleaned text as the feature column.
title_and_body = df['title'] + '. ' + df['body_no_html']
df['preprocessed_text'] = title_and_body.map(preprocess_text)
df = df.drop(columns=['body_no_html', 'title'])

df.head()

In [None]:
# Preprocess target variable: split the comma-separated tag string into a list
df['tags_list'] = df['tags'].apply(lambda x: x.split(','))

# Perform multi-label binarization (one 0/1 column per distinct tag)
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(df['tags_list'])
# Create a new DataFrame with the binary columns for each tag.
# It reuses df's index so the column-wise concat below pairs each row with its
# own tags even when df's index is not the default 0..n-1 range (a mismatch
# would otherwise produce silently misaligned, NaN-padded rows).
tag_df = pd.DataFrame(tag_matrix, columns=mlb.classes_, index=df.index)
df = pd.concat([df, tag_df], axis=1).drop(['id', 'tags', 'tags_list'], axis = 1)
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Separate the text feature from the binary tag columns
X = df['preprocessed_text'].to_numpy()
y = df.drop(['preprocessed_text'], axis = 1).to_numpy()

# Split the raw text first, so the held-out documents stay unseen by every
# fitted step (same test_size and random_state as before, so the same rows
# land in each split).
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit TF-IDF on the training documents only; the test documents are merely
# transformed. Fitting on all rows would let test data shape the vocabulary
# and IDF weights (data leakage) and inflate the evaluation scores.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [None]:
def modified_f1_score(y_test, y_pred, zero_division=0.0):
    """Mean per-sample F1 between true and predicted label sets.

    For every row ``i`` the score is ``2 * |p_i & y_i| / (|p_i| + |y_i|)``,
    where ``y_i`` / ``p_i`` are the sets of non-zero columns of ``y_test[i]``
    and ``y_pred[i]``. The function returns the mean of these row scores.

    Parameters
    ----------
    y_test : 2-D array-like or sparse matrix
        True label indicator matrix, one row per sample.
    y_pred : 2-D array-like or sparse matrix
        Predicted label indicator matrix with the same number of columns.
        It must have at least as many rows as ``y_test``; only the first
        ``len(y_test)`` rows are scored (extra rows are ignored, matching
        the previous row-by-row implementation).
    zero_division : float, default 0.0
        Score given to a row where both label sets are empty (0/0).
        Previously such a row produced NaN and turned the whole mean into NaN.

    Returns
    -------
    float
        The mean row score, or NaN when ``y_test`` has no rows.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, their column counts differ, or ``y_pred``
        has fewer rows than ``y_test``.
    """
    def _as_label_mask(labels):
        # Accept scipy sparse matrices as well as dense arrays
        if hasattr(labels, 'toarray'):
            labels = labels.toarray()
        return np.asarray(labels) != 0

    true_mask = _as_label_mask(y_test)
    pred_mask = _as_label_mask(y_pred)
    if true_mask.ndim != 2 or pred_mask.ndim != 2:
        raise ValueError("y_test and y_pred must be 2-D label indicator matrices")

    nrows = true_mask.shape[0]
    if pred_mask.shape[0] < nrows or pred_mask.shape[1] != true_mask.shape[1]:
        raise ValueError(
            f"incompatible shapes: y_test {true_mask.shape}, y_pred {pred_mask.shape}"
        )
    if nrows == 0:
        return float('nan')
    pred_mask = pred_mask[:nrows]

    # |pi|, |yi| and |pi & yi| for every row at once
    pi = pred_mask.sum(axis=1)
    yi = true_mask.sum(axis=1)
    piyi = (pred_mask & true_mask).sum(axis=1)

    # Rows with an empty denominator keep the zero_division value
    denominator = pi + yi
    f1i = np.full(nrows, float(zero_division))
    np.divide(2.0 * piyi, denominator, out=f1i, where=denominator > 0)

    return float(f1i.mean())


def results_to_file(writer, d):
    """Serialize ``d`` to JSON and write it to ``writer`` as one line."""
    writer.write(f"{json.dumps(d)}\n")

# Path of the file the experiment cells open in append mode to store results
results_file_name = 'results.json'

### Method 1: Transform to multiclass

In [None]:
def _encode_label_combinations(*label_matrices):
    """Map every distinct row (label combination) to an integer class id.

    One mapping is shared by all matrices passed in, so an identical label
    combination receives the same id in every returned vector.
    """
    combination_ids = {}
    encoded = []
    for matrix in label_matrices:
        if hasattr(matrix, 'toarray'):
            matrix = matrix.toarray()
        rows = np.asarray(matrix)
        encoded.append(np.array(
            [
                combination_ids.setdefault(tuple(np.flatnonzero(row)), len(combination_ids))
                for row in rows
            ],
            dtype=int,
        ))
    return encoded


def build_model(model, mlb_estimator, X_train, y_train, X_test, y_test, model_type):
    """Fit one problem-transformation classifier and report scores and timings.

    Parameters
    ----------
    model : estimator
        Base classifier passed to ``mlb_estimator``.
    mlb_estimator : callable
        Problem-transformation class, called as ``mlb_estimator(model)``.
    X_train, y_train : training features and label indicator matrix.
    X_test, y_test : evaluation features and label indicator matrix.
    model_type : str
        Stored verbatim under ``'type'`` in the result.

    Returns
    -------
    dict
        ``{<transformation class name>: {'classifier', 'type',
        'train_points', 'results'}}``. For ``LabelPowerset`` the results
        hold ``micro_f1`` / ``macro_f1`` over label combinations treated as
        classes; for every other transformation they hold ``modified_f1``.
        Both variants include ``train_time`` and ``prediction_time`` in
        seconds.
    """
    # Define classifier
    clf = mlb_estimator(model)

    mlb_name = type(clf).__name__
    model_name = type(clf.classifier).__name__
    n_points = X_train.shape[0]

    # Fit model
    start_time = time.time()
    clf_model = clf.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Get predictions (predict() output is densified with toarray())
    start_time = time.time()
    y_pred = clf_model.predict(X_test).toarray()
    pred_time = time.time() - start_time

    if mlb_name == 'LabelPowerset':
        # Score label combinations as classes. Truth and prediction are encoded
        # with ONE shared mapping: calling clf.transform() on each matrix
        # separately rebuilds the estimator's combination->id table per call,
        # so the two id vectors would not be comparable (and the fitted
        # estimator would be mutated).
        y_test_classes, y_pred_classes = _encode_label_combinations(y_test, y_pred)

        # Calculate micro F1 score
        micro_f1 = f1_score(y_test_classes, y_pred_classes, average='micro')
        # Calculate macro F1 score
        macro_f1 = f1_score(y_test_classes, y_pred_classes, average='macro')
        result = {
            "micro_f1": micro_f1,
            "macro_f1": macro_f1,
            "train_time": train_time,
            "prediction_time": pred_time
            }
    else:
        modified_f1 = modified_f1_score(y_test, y_pred)
        result = {
            "modified_f1": modified_f1,
            "train_time": train_time,
            "prediction_time": pred_time
            }

    # Get complete result dict
    result_dict = {
        mlb_name: {
            'classifier': model_name,
            'type': model_type,
            'train_points': n_points,
            'results': result
        }
    }

    return result_dict

In [None]:
# Define models and problem transformation methods
problem_transormation_methods = [
    BinaryRelevance,
    LabelPowerset,
    ClassifierChain
]

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
multiclass_models = [
    RandomForestClassifier(),
    LogisticRegression(multi_class = 'multinomial')
]
one_vs_rest_models = [
    LogisticRegression(multi_class = 'ovr')
]

binary_models = [
    RandomForestClassifier(),
    LogisticRegression(),
    xgb.XGBClassifier()
]

# Pair every base model with the 'type' label stored in its result.
# LabelPowerset gets the multiclass and one-vs-rest models; the other
# transformations get the binary models.
label_powerset_candidates = (
    [(candidate, 'multiclass') for candidate in multiclass_models]
    + [(candidate, 'one-vs-rest') for candidate in one_vs_rest_models]
)
binary_candidates = [(candidate, 'binary') for candidate in binary_models]

# Train, evaluate models and export results to file.
# The loop variable is deliberately NOT called `mlb`: that name holds the
# fitted MultiLabelBinarizer and must not be overwritten by a class object.
with open(results_file_name,'a') as f:
    for transformation in problem_transormation_methods:
        if transformation is LabelPowerset:
            candidates = label_powerset_candidates
        else:
            candidates = binary_candidates

        for model, model_type in candidates:
            clf_results = build_model(model, transformation, X_train, y_train, X_test, y_test, model_type)
            print(clf_results)
            print('-'*200)
            results_to_file(f, clf_results)
        

### Method 2: Approximate Nearest Neighbours

In [None]:
from sklearn.neighbors import NearestNeighbors

def my_predict(neighbours_mat, y_train, freq_threshold = 0.5):
    """Predict tags for each query from the tags of its neighbours.

    A tag is predicted for a query when the fraction of its neighbours that
    carry the tag is strictly greater than ``freq_threshold``.

    Parameters
    ----------
    neighbours_mat : 2-D integer array, shape (n_queries, k)
        Row ``i`` holds the row indices into ``y_train`` of query ``i``'s
        neighbours.
    y_train : 2-D array, shape (n_train, n_labels)
        Label indicator matrix of the training points.
    freq_threshold : float, default 0.5
        Minimum (exclusive) neighbour frequency for a tag to be predicted.

    Returns
    -------
    numpy.ndarray of float, shape (n_queries, n_labels)
        0/1 prediction matrix with one row per query. The previous version
        allocated ``y_train.shape`` (wrong row count, uninitialized tail
        rows) and hard-coded the number of labels to 4.
    """
    neighbours_mat = np.asarray(neighbours_mat)
    y_train = np.asarray(y_train)

    y_pred = np.zeros((neighbours_mat.shape[0], y_train.shape[1]))
    for i, neighbour_idx in enumerate(neighbours_mat):
        # Share of this query's neighbours carrying each tag
        tag_frequencies = y_train[neighbour_idx].mean(axis=0)
        y_pred[i, tag_frequencies > freq_threshold] = 1
    return y_pred


def build_knn(k, freq_thres, X_train, X_test, y_train, y_test):
    """Run one nearest-neighbour tagging experiment and return its summary.

    A ``NearestNeighbors`` index with ``k`` neighbours is fitted on
    ``X_train``; the neighbours of every ``X_test`` row are turned into tag
    predictions by ``my_predict`` using ``freq_thres``, and the predictions
    are scored with ``modified_f1_score``.

    Returns a dict with the estimator class name, the number of training
    points, ``k``, the threshold, and a ``'results'`` dict holding the score
    plus fit and prediction times in seconds.
    """
    knn = NearestNeighbors(n_neighbors=k)

    # Train model
    fit_started = time.time()
    model = knn.fit(X_train)
    train_time = time.time() - fit_started

    # Get predictions (neighbour lookup and tag voting are timed together)
    predict_started = time.time()
    neighbours_mat = model.kneighbors(X_test, return_distance=False)
    y_pred = my_predict(neighbours_mat, y_train, freq_threshold=freq_thres)
    pred_time = time.time() - predict_started

    return {
        'classifier': type(model).__name__,
        'train_points': X_train.shape[0],
        'k': k,
        'frequncy_threshold': freq_thres,
        'results': {
            "modified_f1": modified_f1_score(y_test, y_pred),
            "train_time": train_time,
            "prediction_time": pred_time,
        },
    }

In [None]:
# Run experiments: every neighbour count k in 2..10 is combined with every
# frequency threshold; each result is printed and appended to the results file.
freq_thres_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
with open(results_file_name, 'a') as f:
    for k in range(2, 11):
        for freq_thres in freq_thres_vals:
            clf_results = build_knn(k, freq_thres, X_train, X_test, y_train, y_test)
            print(clf_results, '-'*200, sep='\n')
            results_to_file(f, clf_results)

### Method 3: Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances

def my_predict(neighbours_mat, y_train, freq_threshold = 0.5):
    """Predict tags for each query from the tags of its neighbours.

    Same contract as the identically named function in the nearest-neighbours
    section, which this definition replaces once the cell runs. A tag is
    predicted for a query when the fraction of its neighbours that carry the
    tag is strictly greater than ``freq_threshold``.

    Parameters
    ----------
    neighbours_mat : 2-D integer array, shape (n_queries, k)
        Row ``i`` holds the row indices into ``y_train`` of query ``i``'s
        neighbours.
    y_train : 2-D array, shape (n_train, n_labels)
        Label indicator matrix of the training points.
    freq_threshold : float, default 0.5
        Minimum (exclusive) neighbour frequency for a tag to be predicted.

    Returns
    -------
    numpy.ndarray of float, shape (n_queries, n_labels)
        0/1 prediction matrix with one row per query. The previous version
        allocated ``y_train.shape`` (wrong row count, uninitialized tail
        rows) and hard-coded the number of labels to 4.
    """
    neighbours_mat = np.asarray(neighbours_mat)
    y_train = np.asarray(y_train)

    y_pred = np.zeros((neighbours_mat.shape[0], y_train.shape[1]))
    for i, neighbour_idx in enumerate(neighbours_mat):
        # Share of this query's neighbours carrying each tag
        tag_frequencies = y_train[neighbour_idx].mean(axis=0)
        y_pred[i, tag_frequencies > freq_threshold] = 1
    return y_pred


def find_closest_centroids(X_test, centroids):
    """Return, for each row of ``X_test``, the index of its nearest centroid.

    Distances are Euclidean; the result is an integer array with one entry
    per row of ``X_test``.
    """
    distance_to_centroids = pairwise_distances(X=X_test, Y=centroids, metric='euclidean')
    return distance_to_centroids.argmin(axis=1).astype(int)


def build_kmeans_freq(k, freq_threshold, X_train, y_train, X_test, y_test):
    """Run one k-means tagging experiment and return its summary.

    ``X_train`` is clustered into ``k`` groups. Each cluster predicts every
    tag whose frequency among the cluster's training points is strictly
    greater than ``freq_threshold``. A test point receives the prediction of
    the cluster whose centroid is closest to it, and the predictions are
    scored with ``modified_f1_score``.

    Parameters
    ----------
    k : int
        Number of clusters.
    freq_threshold : float
        Minimum (exclusive) in-cluster tag frequency for a tag to be predicted.
    X_train, y_train : training features and label indicator matrix.
    X_test, y_test : evaluation features and label indicator matrix.

    Returns
    -------
    dict
        ``'classifier'``, ``'train_points'``, ``'num_clusters'``,
        ``'frequncy_threshold'`` and a ``'results'`` dict holding the score
        plus training and prediction durations in seconds.
    """
    n_points = X_train.shape[0]
    n_labels = y_train.shape[1]

    # Define KMeans model
    kmeans = KMeans(
        n_clusters = k,
        random_state=42
    )

    # Fit KMeans model
    start_time = time.time()
    kmeans.fit(X_train)
    train_time = time.time() - start_time
    centroids  = kmeans.cluster_centers_
    labels_arr = kmeans.labels_

    # Get predictions for each cluster: row c is the prediction of cluster c.
    # The width follows y_train instead of a hard-coded 4 labels.
    start_time = time.time()
    cluster_predictions_arr = np.zeros((k, n_labels))
    for cluster_label in range(k):
        y_cluster = y_train[labels_arr == cluster_label]
        if y_cluster.shape[0] == 0:
            # No training point in this cluster: it predicts no tags
            continue

        # Frequency of each tag in the cluster
        tag_frequencies = np.mean(y_cluster, axis = 0)
        cluster_predictions_arr[cluster_label, tag_frequencies > freq_threshold] = 1

    # Find closest centroid for each point in the test set
    closest_centroid_idx = find_closest_centroids(X_test, centroids)
    # Get predictions for each point in the test set
    y_pred = cluster_predictions_arr[closest_centroid_idx]
    # Elapsed prediction time (previously an absolute timestamp was stored)
    pred_time = time.time() - start_time

    # Get modified f1 score
    modified_f1 = modified_f1_score(y_test, y_pred)

    result = {
        "modified_f1": modified_f1,
        "train_time": train_time,
        "prediction_time": pred_time
        }

    clf_results = {
        'classifier': 'KmeansFreq',
        'train_points': n_points,
        'num_clusters': k,
        # Report the threshold actually used (the parameter), not a global
        'frequncy_threshold': freq_threshold,
        'results': result
    }

    return clf_results

In [None]:
# Run experiments: cluster counts 10, 15 and 20 are combined with every
# frequency threshold; each result is printed and appended to the results file.
import warnings
warnings.filterwarnings("ignore")  # silences every warning from this point on

freq_thres_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
with open(results_file_name, 'a') as f:
    for k in (10, 15, 20):
        for freq_thres in freq_thres_vals:
            clf_results = build_kmeans_freq(k, freq_thres, X_train, y_train, X_test, y_test)
            print(clf_results, '-'*200, sep='\n')
            results_to_file(f, clf_results)