In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.decomposition import PCA
import sklearn as skl

# For loading huggingface stuff
from sentence_transformers import SentenceTransformer
from extractors.chartok import CharacterTok
from extractors.berttok import BertTokenizer

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Feature Extraction

In [3]:
# Selects which feature extractor the cells below instantiate and apply.
# Possible values: 
# labse - makes use of LaBSE for multilingual sentence embeddings
# char - makes use of a character-level tokenization scheme.
# bert - makes use of the BertTokenizer extractor from extractors.berttok
# The value is also used as the suffix of the output CSV file names.
FEATURE_EXTRACTOR = "bert" 

In [4]:
# Path stems (without file extension) of the dataset splits to process.
# ".csv" is appended when each split is read, and the augmented output is
# written next to it as "<stem>_<FEATURE_EXTRACTOR>.csv".

DATASET_PATHS = ["data/test", "data/validation", "data/training"]


In [5]:
# Instantiate the feature extractor selected by FEATURE_EXTRACTOR.
#
# Each supported key maps to a zero-argument factory, so only the chosen
# extractor is actually constructed. An unknown key raises immediately instead
# of silently leaving `feature_extractor` as None and failing in a later cell.
feature_extractor = None

_EXTRACTOR_FACTORIES = {
    # LaBSE found here: https://huggingface.co/sentence-transformers/LaBSE
    "labse": lambda: SentenceTransformer('sentence-transformers/LaBSE'),
    "char": CharacterTok,
    "bert": BertTokenizer,
}

if FEATURE_EXTRACTOR not in _EXTRACTOR_FACTORIES:
    raise ValueError(
        "Unknown FEATURE_EXTRACTOR " + repr(FEATURE_EXTRACTOR)
        + "; expected one of " + str(sorted(_EXTRACTOR_FACTORIES))
    )

feature_extractor = _EXTRACTOR_FACTORIES[FEATURE_EXTRACTOR]()

In [8]:
# For every dataset split: read "<stem>.csv", encode its "text" column with the
# selected feature extractor, append one "_e<i>" column per embedding dimension,
# and write the result to "<stem>_<FEATURE_EXTRACTOR>.csv".

# Fail early with a clear message if no extractor was created (e.g. the
# FEATURE_EXTRACTOR value is not one of the supported keys).
if feature_extractor is None:
    raise ValueError(
        "No feature extractor loaded for FEATURE_EXTRACTOR="
        + repr(FEATURE_EXTRACTOR)
    )

for db_path in DATASET_PATHS:
    loaded_df = pd.read_csv(db_path + ".csv")

    # Prepare sentence list
    sentences = loaded_df["text"]

    # Apply the feature extractor to a target dataset.
    # Every supported extractor is driven through the same encode(sentences)
    # call, so no per-extractor branching is needed here.
    encodings = np.asarray(feature_extractor.encode(sentences))

    # The column layout below needs one row of features per input row.
    if encodings.ndim != 2 or encodings.shape[0] != len(loaded_df):
        raise ValueError(
            "Expected encodings of shape (" + str(len(loaded_df))
            + ", n_features) for " + db_path + ".csv, got "
            + str(encodings.shape)
        )

    # Store individual floats in the vector in their own columns.
    # Building all columns at once avoids the DataFrame fragmentation (and the
    # accompanying PerformanceWarning) caused by inserting them one by one.
    embedding_df = pd.DataFrame(
        encodings,
        columns=['_e' + str(i) for i in range(encodings.shape[1])],
        index=loaded_df.index,
    )
    new_df = pd.concat([loaded_df, embedding_df], axis=1)

    # Save the feature list to a new dataset
    NEW_FILE_PATH = db_path + "_" + FEATURE_EXTRACTOR
    new_df.to_csv(NEW_FILE_PATH + ".csv", index=False)

[[-13.26652 ]
 [-13.20869 ]
 [-12.887813]
 ...
 [-11.577044]
 [-11.79921 ]
 [-10.265955]]
