# This Notebook...
...loads data from "data_raw.csv", which was assembled in "query.ipynb", and then preprocesses the data to prepare it for training the models. The preprocessing involves filtering, reformatting, construction of the tfidf embedding and the DistilBERT embedding, and splitting into train, valid, and test datasets. The final data are saved as three dictionaries: "data_y.pkl", "data_tfidf.pkl", and "data_DistilBERT.pkl".

# Dependencies

In [5]:
# Important
#import utils
import numpy as np
import pandas as pd
import math

# Helpers
from itertools import islice
from tqdm import tqdm
import time
from itertools import islice, product
import pickle

# Load Raw Data

In [6]:
# Read the raw table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="data_raw.csv", index_col=0)

# Filter and modify the data

In [7]:
# Some reviews are float('nan') instead of text. Every entry that is not a
# string is turned into an empty review (""). isinstance() is used rather than
# math.isnan() because isnan raises TypeError on non-numeric values, which
# would defeat the "just in case" fallback.
df["review"] = df["review"].map(lambda review: review if isinstance(review, str) else "")

# Remove the empty reviews
df = df[df["review"] != ""]

# Set aside the labels: a review is labelled True when helpfulness >= 0.5.
# NOTE(review): a missing helpfulness value compares as False here - confirm
# that treating those rows as "not helpful" is intended.
y = df["helpfulness"] >= 0.5
df = df.drop(columns=["helpfulness"])

# Restart both indices at 0 so that df and y stay aligned row by row
df = df.reset_index(drop=True)
y = y.reset_index(drop=True)

# Construct tfidf embedding

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the tf-idf vectorizer on every review and transform them in one pass.
# NOTE(review): the vectorizer is fitted on all rows, i.e. before the
# train/valid/test split performed further down.
vectorizer = TfidfVectorizer()
# Wrap the scipy sparse result in a DataFrame (the columns stay sparse-backed)
matrix_tfidf = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(df["review"]))

# Place the df columns and the tf-idf columns side by side on a fresh 0..n-1
# index, then discard the raw review text.
X_tfidf = pd.concat(
    [frame.reset_index(drop=True) for frame in (df, matrix_tfidf)],
    axis=1,
).drop(columns=["review"])

# Construct DistilBERT embedding

In [12]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf

# Load the pretrained tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

# Embed the reviews one at a time and collect one sentence vector per review.
# The vectors are gathered in a list and turned into a DataFrame once at the
# end: writing each result into a fixed single-row frame would keep only the
# last review's embedding and leave every other row NaN after the concat.
sentence_embeddings = []
for review in tqdm(df["review"]):
    # Tokenize and embed
    inputs = tokenizer(review, padding=True, truncation=True, return_tensors="tf")
    outputs = model(inputs)

    # Aggregate token embeddings (mean over axis 1) to get the sentence embedding
    sentence_embedding = tf.reduce_mean(input_tensor=outputs.last_hidden_state, axis=1)

    # The batch holds a single review, so keep its only row
    sentence_embeddings.append(sentence_embedding.numpy()[0])

# One row per review, sharing df's index so the concat below aligns row by row
embeddings_DistilBERT = pd.DataFrame(sentence_embeddings, index=df.index)
assert embeddings_DistilBERT.shape[0] == df.shape[0]
assert not embeddings_DistilBERT.isna().any().any()

# Combine df (without the raw review text) and the DistilBERT embeddings
X_DistilBERT = pd.concat([df.drop(["review"], axis=1), embeddings_DistilBERT], axis=1)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
100%|██████████| 35567/35567 [3:39:23<00:00,  2.70it/s]


In [18]:
# Checkpoint the combined DistilBERT feature table to disk (the embedding loop
# is slow to recompute).
with open("X_DistilBERT.pkl", mode="wb") as pickle_file:
    pickle.dump(X_DistilBERT, pickle_file)

In [None]:
# from transformers import DistilBertTokenizer, TFDistilBertModel
# import tensorflow as tf

# # Load tokenizer and tokenize
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# inputs = tokenizer(list(df["review"]), padding=True, truncation=True, return_tensors="tf")

# # Load model and embed
# model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
# outputs = model(inputs)

# # Aggregate token embeddings to get sentence embeddings
# sentence_embeddings = tf.reduce_mean(input_tensor=outputs.last_hidden_state, axis=1)

# # Combine df and the DistilBERT embeddings
# X_DistilBERT = pd.concat([df, pd.DataFrame(sentence_embeddings)], axis=1)
# X_DistilBERT = X_DistilBERT.drop(["review"], axis=1)

In [15]:
# Sanity check: both feature tables must have exactly as many rows as df
assert len(X_tfidf) == len(X_DistilBERT)
assert len(X_tfidf) == len(df)

# Train, validation and test split

In [16]:
# Randomise the row positions. A fixed seed makes the train/valid/test split
# reproducible across kernel restarts.
SEED = 42
n_observations = df.shape[0]
indices = np.random.default_rng(SEED).permutation(n_observations)

# Split boundaries: first 70% train, next 15% valid, remaining 15% test
train_end = round(0.7 * n_observations)
valid_end = round(0.85 * n_observations)

# Extract train, valid, and test indices
train_indices = indices[:train_end]
valid_indices = indices[train_end:valid_end]
test_indices = indices[valid_end:]

# The three parts together must cover every row exactly once
assert len(train_indices) + len(valid_indices) + len(test_indices) == n_observations

In [20]:
# Save the labels (note: labels are the same for tfidf data and DistilBERT).
# The index arrays hold row positions, so rows are always selected with .iloc.
data_y = {
    "y_train": y.iloc[train_indices],
    "y_valid": y.iloc[valid_indices],
    "y_test": y.iloc[test_indices],
}
with open("data_y.pkl", "wb") as f:
    pickle.dump(data_y, f)

# Save tfidf predictors. Plain X_tfidf[indices] would look the integers up as
# column labels instead of selecting rows, hence .iloc.
data_tfidf = {
    "X_train": X_tfidf.iloc[train_indices],
    "X_valid": X_tfidf.iloc[valid_indices],
    "X_test": X_tfidf.iloc[test_indices],
}
with open("data_tfidf.pkl", "wb") as f:
    pickle.dump(data_tfidf, f)

# Save DistilBERT predictors
data_distilBERT = {
    "X_train": X_DistilBERT.iloc[train_indices],
    "X_valid": X_DistilBERT.iloc[valid_indices],
    "X_test": X_DistilBERT.iloc[test_indices],
}
with open("data_DistilBERT.pkl", "wb") as f:
    pickle.dump(data_distilBERT, f)