Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

90 bert analysis #106

Merged
merged 13 commits into from
Jun 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
835 changes: 462 additions & 373 deletions coverage.xml

Large diffs are not rendered by default.

55 changes: 0 additions & 55 deletions current_best_multilabel/bert.txt

This file was deleted.

Binary file removed current_best_multilabel/svc_merged.sav
Binary file not shown.
79 changes: 0 additions & 79 deletions current_best_multilabel/svc_merged.txt

This file was deleted.

Binary file removed current_best_multilabel/svc_merged_perf.xlsx
Binary file not shown.
6 changes: 3 additions & 3 deletions pxtextmining/factories/factory_data_load_and_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def bert_data_to_dataset(
tokenizer(
list(X["FFT answer"]),
truncation=True,
padding=True,
padding='max_length',
max_length=max_length,
return_tensors="tf",
)
Expand All @@ -76,7 +76,7 @@ def bert_data_to_dataset(
tokenizer(
list(X),
truncation=True,
padding=True,
padding='max_length',
max_length=max_length,
return_tensors="tf",
)
Expand Down Expand Up @@ -179,7 +179,7 @@ def onehot(df, col_to_onehot):
Returns:
(pd.DataFrame): One-hot encoded data
"""
encoder = OneHotEncoder(sparse=False)
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
col_encoded = encoder.fit_transform(df[[col_to_onehot]])
return col_encoded

Expand Down
43 changes: 43 additions & 0 deletions pxtextmining/factories/factory_predict_unlabelled_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pxtextmining.factories.factory_data_load_and_split import (
bert_data_to_dataset,
remove_punc_and_nums,
clean_empty_features
)
from pxtextmining.params import minor_cats

Expand Down Expand Up @@ -78,6 +79,48 @@ def predict_multilabel_sklearn(
preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
return preds_df

def predict_multilabel_bert(
    data,
    model,
    labels = minor_cats,
    additional_features = False,
    label_fix = True
):
    """Makes multilabel predictions with a trained transformer-based model.

    Conducts basic preprocessing to remove blank text, then utilises the
    pretrained transformer-based machine learning model to make multilabel
    predictions on the cleaned text. Also takes the class with the highest
    predicted probability as the predicted class in cases where no class has
    been predicted, if label_fix = True.

    Args:
        data (pd.Series OR pd.DataFrame): DataFrame or Series containing data to be processed and utilised for predictions. Must be DataFrame with columns 'FFT answer' and 'FFT_q_standardised' if additional_features = True
        model (tf.Model): Trained tensorflow estimator able to perform multilabel classification.
        labels (list, optional): List containing target labels. Defaults to minor_cats.
        additional_features (bool, optional): Whether or not FFT_q_standardised is included in data. Defaults to False.
        label_fix (bool, optional): Whether or not the class with the highest probability is taken as the predicted class in cases where no classes are predicted. Defaults to True.

    Returns:
        (pd.DataFrame): DataFrame containing one hot encoded predictions, and a column with a list of the predicted labels.
    """
    # Pull out the free-text column; when additional_features is set the
    # question-type column is merged back in after cleaning.
    if additional_features:
        text = data['FFT answer']
    else:
        text = pd.Series(data)
    processed_text = clean_empty_features(text)
    if additional_features:
        # NOTE(review): merging on 'Comment ID' assumes both sides carry that
        # key — confirm against clean_empty_features' output.
        final_data = pd.merge(
            processed_text,
            data['FFT_q_standardised'],
            how='left',
            on='Comment ID',
        )
    else:
        final_data = processed_text
    y_probs = predict_with_bert(
        final_data,
        model,
        additional_features=additional_features,
        already_encoded=False,
    )
    # Threshold the raw probabilities into a one-hot matrix.
    y_binary = turn_probs_into_binary(y_probs)
    if label_fix:
        # Guarantee every row has at least one predicted label.
        predictions = fix_no_labels(y_binary, y_probs, model_type='bert')
    else:
        predictions = y_binary
    preds_df = pd.DataFrame(predictions, index=processed_text.index, columns=labels)
    preds_df["labels"] = preds_df.apply(get_labels, args=(labels,), axis=1)
    return preds_df

def predict_multiclass_bert(x, model, additional_features, already_encoded):
"""Makes multiclass predictions using a transformer-based model. Can encode the data if not already encoded.

Expand Down
87 changes: 64 additions & 23 deletions pxtextmining/factories/factory_write_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,14 @@
import numpy as np
import pandas as pd

from pxtextmining.factories.factory_predict_unlabelled_text import get_labels, predict_multilabel_sklearn, predict_with_probs, get_probabilities
from tensorflow.keras import Model, Sequential
from pxtextmining.factories.factory_predict_unlabelled_text import (
get_labels,
predict_multilabel_sklearn,
predict_multilabel_bert,
get_probabilities,
predict_with_bert
)
from pxtextmining.factories.factory_model_performance import parse_metrics_file


Expand All @@ -24,15 +30,18 @@ def write_multilabel_models_and_metrics(models, model_metrics, path):
if isinstance(models[i], (Sequential, Model)):
models[i].save(fullpath)
else:
modelpath = os.path.join(path, model_name + '.sav')
modelpath = os.path.join(path, model_name + ".sav")
pickle.dump(models[i], open(modelpath, "wb"))
# Write performance metrics file
txtpath = os.path.join(path, model_name + '.txt')
txtpath = os.path.join(path, model_name + ".txt")
with open(txtpath, "w") as file:
file.write(model_metrics[i])
print(f"{len(models)} models have been written to {path}")

def write_model_preds(x, y, model, labels, additional_features = True, path = 'labels.xlsx'):

def write_model_preds(
x, y, model, labels, additional_features=True, path="labels.xlsx"
):
"""Writes an Excel file to enable easier analysis of model outputs using the test set. Columns of the Excel file are: comment_id, actual_labels, predicted_labels, actual_label_probs, and predicted_label_probs.

Currently only works with sklearn models.
Expand All @@ -45,27 +54,59 @@ def write_model_preds(x, y, model, labels, additional_features = True, path = 'l
additional_features (bool, optional): Whether or not FFT_q_standardised is included in data. Defaults to True.
path (str, optional): Filename for the outputted file. Defaults to 'labels.xlsx'.
"""
actual_labels = pd.DataFrame(y, columns = labels).apply(get_labels, args=(labels,), axis=1)
actual_labels.name = 'actual_labels'
predicted_labels = predict_multilabel_sklearn(x,
model,
labels=labels,
additional_features = additional_features,
label_fix = True,
enhance_with_probs=True
).reset_index()['labels']
predicted_labels.name = 'predicted_labels'
actual_labels = pd.DataFrame(y, columns=labels).apply(
get_labels, args=(labels,), axis=1
)
actual_labels.name = "actual_labels"
if isinstance(model, Model) == True:
predicted_labels = predict_multilabel_bert(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
).reset_index()["labels"]
else:
predicted_labels = predict_multilabel_sklearn(
x,
model,
labels=labels,
additional_features=additional_features,
label_fix=True,
enhance_with_probs=True,
).reset_index()["labels"]
predicted_labels.name = "predicted_labels"
df = x.reset_index()
if isinstance(model, Model) == True:
probabilities = predict_with_bert(
x,
model,
max_length=150,
additional_features=additional_features,
already_encoded=False,
)
else:
probabilities = np.array(model.predict_proba(x))
if isinstance(model, Model) == True:
model_type = 'bert'
else:
model_type = 'sklearn'
probs_actual = get_probabilities(
actual_labels, labels, probabilities, model_type=model_type
)
probs_predicted = get_probabilities(
predicted_labels, labels, probabilities, model_type=model_type
)
df = df.merge(actual_labels, left_index=True, right_index=True)
df = df.merge(predicted_labels, left_index=True, right_index=True)
df = df.merge(probs_actual, left_index=True, right_index=True)
df = df.merge(probs_predicted, left_index=True, right_index=True)
# Deal with any rogue characters
df.applymap(lambda x: x.encode('unicode_escape').
decode('utf-8') if isinstance(x, str) else x)
df.to_excel(path, index=False)
print(f"Successfully completed, written to {path}")

probabilities = np.array(model.predict_proba(x))
probs_actual = get_probabilities(actual_labels, labels, probabilities, model_type = 'sklearn')
probs_predicted = get_probabilities(predicted_labels, labels, probabilities, model_type = 'sklearn')
df = df.merge(actual_labels, left_index = True, right_index = True)
df = df.merge(predicted_labels, left_index = True, right_index = True)
df = df.merge(probs_actual, left_index = True, right_index = True)
df = df.merge(probs_predicted, left_index = True, right_index = True)
df.to_excel(path, index = False)
print(f'Successfully completed, written to {path}')

def write_model_analysis(model_name, labels, dataset, path):
"""Writes an Excel file with the performance metrics of each label, as well as the counts of samples for each label.
Expand Down
Loading