Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

90 bert analysis #106

Merged
merged 13 commits into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
835 changes: 462 additions & 373 deletions coverage.xml

Large diffs are not rendered by default.

55 changes: 0 additions & 55 deletions current_best_multilabel/bert.txt

This file was deleted.

Binary file removed current_best_multilabel/svc_merged.sav
Binary file not shown.
79 changes: 0 additions & 79 deletions current_best_multilabel/svc_merged.txt

This file was deleted.

Binary file removed current_best_multilabel/svc_merged_perf.xlsx
Binary file not shown.
6 changes: 3 additions & 3 deletions pxtextmining/factories/factory_data_load_and_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def bert_data_to_dataset(
tokenizer(
list(X["FFT answer"]),
truncation=True,
padding=True,
padding='max_length',
max_length=max_length,
return_tensors="tf",
)
Expand All @@ -76,7 +76,7 @@ def bert_data_to_dataset(
tokenizer(
list(X),
truncation=True,
padding=True,
padding='max_length',
max_length=max_length,
return_tensors="tf",
)
Expand Down Expand Up @@ -179,7 +179,7 @@ def onehot(df, col_to_onehot):
Returns:
(pd.DataFrame): One-hot encoded data
"""
encoder = OneHotEncoder(sparse=False)
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
col_encoded = encoder.fit_transform(df[[col_to_onehot]])
return col_encoded

Expand Down
43 changes: 43 additions & 0 deletions pxtextmining/factories/factory_predict_unlabelled_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pxtextmining.factories.factory_data_load_and_split import (
bert_data_to_dataset,
remove_punc_and_nums,
clean_empty_features
)
from pxtextmining.params import minor_cats

Expand Down Expand Up @@ -78,6 +79,48 @@ def predict_multilabel_sklearn(
preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
return preds_df

def predict_multilabel_bert(
    data,
    model,
    labels = minor_cats,
    additional_features = False,
    label_fix = True
):
    """Makes multilabel predictions with a trained transformer-based model.

    Conducts basic preprocessing to remove blank text, then utilises the
    pretrained transformer-based machine learning model to make multilabel
    predictions on the cleaned text. Also takes the class with the highest
    predicted probability as the predicted class in cases where no class has
    been predicted, if label_fix = True.

    Args:
        data (pd.Series OR pd.DataFrame): DataFrame or Series containing data to be processed and utilised for predictions. Must be DataFrame with columns 'FFT answer' and 'FFT_q_standardised' if additional_features = True
        model (tf.Model): Trained tensorflow estimator able to perform multilabel classification.
        labels (list, optional): List containing target labels. Defaults to minor_cats.
        additional_features (bool, optional): Whether or not FFT_q_standardised is included in data. Defaults to False.
        label_fix (bool, optional): Whether or not the class with the highest probability is taken as the predicted class in cases where no classes are predicted. Defaults to True.

    Returns:
        (pd.DataFrame): DataFrame containing one hot encoded predictions, and a column with a list of the predicted labels.
    """
    # Pull out the free-text column; when additional_features is set the
    # question-type column is merged back in after cleaning.
    if additional_features:
        text = data['FFT answer']
    else:
        text = pd.Series(data)
    processed_text = clean_empty_features(text)
    if additional_features:
        # NOTE(review): merging on 'Comment ID' assumes both sides carry that
        # key — confirm against clean_empty_features' output.
        final_data = pd.merge(
            processed_text,
            data['FFT_q_standardised'],
            how='left',
            on='Comment ID',
        )
    else:
        final_data = processed_text
    y_probs = predict_with_bert(
        final_data,
        model,
        additional_features=additional_features,
        already_encoded=False,
    )
    # Threshold the raw probabilities into a one-hot matrix.
    y_binary = turn_probs_into_binary(y_probs)
    if label_fix:
        # Guarantee every row has at least one predicted label.
        predictions = fix_no_labels(y_binary, y_probs, model_type='bert')
    else:
        predictions = y_binary
    preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
    preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
    return preds_df

def predict_multiclass_bert(x, model, additional_features, already_encoded):
"""Makes multiclass predictions using a transformer-based model. Can encode the data if not already encoded.

Expand Down
87 changes: 64 additions & 23 deletions pxtextmining/factories/factory_write_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@
import numpy as np
import pandas as pd

from pxtextmining.factories.factory_predict_unlabelled_text import get_labels, predict_multilabel_sklearn, predict_with_probs, get_probabilities
from tensorflow.keras import Model, Sequential
from pxtextmining.factories.factory_predict_unlabelled_text import (
get_labels,
predict_multilabel_sklearn,
predict_multilabel_bert,
get_probabilities,
predict_with_bert
)
from pxtextmining.factories.factory_model_performance import parse_metrics_file


Expand All @@ -24,15 +30,18 @@ def write_multilabel_models_and_metrics(models, model_metrics, path):
if isinstance(models[i], (Sequential, Model)):
models[i].save(fullpath)
else:
modelpath = os.path.join(path, model_name + '.sav')
modelpath = os.path.join(path, model_name + ".sav")
pickle.dump(models[i], open(modelpath, "wb"))
# Write performance metrics file
txtpath = os.path.join(path, model_name + '.txt')
txtpath = os.path.join(path, model_name + ".txt")
with open(txtpath, "w") as file:
file.write(model_metrics[i])
print(f"{len(models)} models have been written to {path}")

def write_model_preds(x, y, model, labels, additional_features = True, path = 'labels.xlsx'):

def write_model_preds(
x, y, model, labels, additional_features=True, path="labels.xlsx"
):
"""Writes an Excel file to enable easier analysis of model outputs using the test set. Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs.

Currently only works with sklearn models.
Expand All @@ -45,27 +54,59 @@ def write_model_preds(x, y, model, labels, additional_features = True, path = 'l
additional_features (bool, optional): Whether or not FFT_q_standardised is included in data. Defaults to True.
path (str, optional): Filename for the outputted file. Defaults to 'labels.xlsx'.
"""
actual_labels = pd.DataFrame(y, columns = labels).apply(get_labels, args=(labels,), axis=1)
actual_labels.name = 'actual_labels'
predicted_labels = predict_multilabel_sklearn(x,
model,
labels=labels,
additional_features = additional_features,
label_fix = True,
enhance_with_probs=True
).reset_index()['labels']
predicted_labels.name = 'predicted_labels'
actual_labels = pd.DataFrame(y, columns=labels).apply(
get_labels, args=(labels,), axis=1
)
actual_labels.name = "actual_labels"
if isinstance(model, Model) == True:
predicted_labels = predict_multilabel_bert(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
).reset_index()["labels"]
else:
predicted_labels = predict_multilabel_sklearn(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
enhance_with_probs=True,
).reset_index()["labels"]
predicted_labels.name = "predicted_labels"
df = x.reset_index()
if isinstance(model, Model) == True:
probabilities = predict_with_bert(
x,
model,
max_length=150,
additional_features=additional_features,
already_encoded=False,
)
else:
probabilities = np.array(model.predict_proba(x))
if isinstance(model, Model) == True:
model_type = 'bert'
else:
model_type = 'sklearn'
probs_actual = get_probabilities(
actual_labels, labels, probabilities, model_type=model_type
)
probs_predicted = get_probabilities(
predicted_labels, labels, probabilities, model_type=model_type
)
df = df.merge(actual_labels, left_index=True, right_index=True)
df = df.merge(predicted_labels, left_index=True, right_index=True)
df = df.merge(probs_actual, left_index=True, right_index=True)
df = df.merge(probs_predicted, left_index=True, right_index=True)
# Deal with any rogue characters
df.applymap(lambda x: x.encode('unicode_escape').
decode('utf-8') if isinstance(x, str) else x)
df.to_excel(path, index=False)
print(f"Successfully completed, written to {path}")

probabilities = np.array(model.predict_proba(x))
probs_actual = get_probabilities(actual_labels, labels, probabilities, model_type = 'sklearn')
probs_predicted = get_probabilities(predicted_labels, labels, probabilities, model_type = 'sklearn')
df = df.merge(actual_labels, left_index = True, right_index = True)
df = df.merge(predicted_labels, left_index = True, right_index = True)
df = df.merge(probs_actual, left_index = True, right_index = True)
df = df.merge(probs_predicted, left_index = True, right_index = True)
df.to_excel(path, index = False)
print(f'Successfully completed, written to {path}')

def write_model_analysis(model_name, labels, dataset, path):
"""Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label.
Expand Down
Loading