In [2]:
import pickle
from data.dataprocessor import DataProcessor
from sklearn.metrics import confusion_matrix
from feature_extractors.normal_feature_extractor import NormalFeatureExtractor
from feature_extractors.transformer_feature_extractor import TransformerFeatureExtractor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os
import numpy as np
import matplotlib.pyplot as plt

# Single random seed, passed as `random_state` to every LogisticRegression fit in this notebook.
seed = 42

### Create `europe` Data

In [3]:
# Project-local data loader; the argument is presumably the tokenizer/model
# identifier it should use — TODO confirm against DataProcessor.
dataprocessor = DataProcessor('google/bigbird-roberta-base')

# Register the europe chunks stored under the seed-42 "balanced" directory.
dataprocessor.discover_chunks('data/balanced/seed_42/europe_data')

# Only the first returned dataset is kept; the second one is discarded.
# NOTE(review): train_size=1 is presumably a fraction (i.e. "everything is train"),
# not a sample count — confirm against DataProcessor.get_train_test_datasets.
europe_data, _ = dataprocessor.get_train_test_datasets(train_size=1)

# DataFrame view of the europe data; later cells read its `text` and `label` columns.
europe_df = europe_data.get_dataframe()

## Train on `non_europe` Data and Evaluate on `europe` Data

### Extract Features from `europe` Data

In [7]:
# Load the previously pickled feature extractor. The context manager closes
# the file handle deterministically instead of leaving it to the garbage
# collector, which is what a bare `pickle.load(open(...))` does.
# NOTE: unpickling executes arbitrary code — only load files you trust.
with open('pickles/normal_feature_extractor_seed_42_chunks.pkl', 'rb') as extractor_file:
    normal_feature_extractor = pickle.load(extractor_file)

# Embedding extractor backed by the checkpoint stored under fine_tuned_models/.
# The second argument (2048) is presumably the maximum sequence length — TODO confirm.
big_bird_fine_tuned_non_europe_feature_extractor = TransformerFeatureExtractor('fine_tuned_models/out_of_domain_bigbird_roberta_base_clean_chunks', 2048)

def insert_grammar_features(all_features: np.ndarray, grammar_features: np.ndarray, position: int = 2300) -> np.ndarray:
    """Splice grammar feature columns into a 2-D feature matrix.

    Args:
        all_features: Feature matrix of shape (n_samples, n_features) that
            does not yet contain the grammar columns.
        grammar_features: Matrix of shape (n_samples, n_grammar) to insert.
        position: Column index of `all_features` before which the grammar
            columns are inserted. Defaults to 2300, the offset this notebook
            has always used.

    Returns:
        A new array of shape (n_samples, n_features + n_grammar) laid out as
        `all_features[:, :position] | grammar_features | all_features[:, position:]`.

    Raises:
        ValueError: If `position` is negative or larger than the number of
            columns in `all_features`. Without this check a too-narrow matrix
            would silently get the grammar columns appended at the end, which
            misaligns the features with what the classifier was trained on.
    """
    n_columns = all_features.shape[1]
    if not 0 <= position <= n_columns:
        raise ValueError(
            f'position {position} is outside the valid column range 0..{n_columns}'
        )
    left_block = all_features[:, :position]
    right_block = all_features[:, position:]
    return np.concatenate((left_block, grammar_features, right_block), axis=1)

# Raw europe texts, shared by all three extraction calls below.
europe_texts = europe_df.text.to_list()

# Hand-crafted features without the grammar-mistake columns ...
X_normal_europe_without_grammar = normal_feature_extractor.transform(
    europe_texts,
    grammar_mistakes=False,
).to_numpy()
# ... which are computed separately and then spliced back in.
grammar_features = normal_feature_extractor.get_grammar_features(europe_texts)
X_normal_europe = insert_grammar_features(X_normal_europe_without_grammar, grammar_features)

# Embeddings of the same texts from the non_europe fine-tuned Big Bird extractor.
X_bigbird = big_bird_fine_tuned_non_europe_feature_extractor.transform(europe_texts)



Some weights of the model checkpoint at fine_tuned_models/out_of_domain_bigbird_roberta_base_clean_chunks were not used when initializing BigBirdModel: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating word ngram features...
Creating char ngram features...


Calculating edit distance: 100%|██████████| 5254/5254 [01:28<00:00, 59.35it/s]
Extracting Substitution Features:  13%|█▎        | 678/5254 [00:13<01:27, 52.10it/s]

### Load Pre-extracted `non_europe` Features and Labels

In [5]:
# Load the pickled non_europe feature matrices and labels. Each file is opened
# in a context manager so its handle is closed as soon as the object is loaded
# (a bare `pickle.load(open(...))` leaves the handle open until garbage collection).
# NOTE: unpickling executes arbitrary code — only load files you trust.
with open('pickles/pickled_datasets/seed_42/out_of_domain_X_normal_chunks.pkl', 'rb') as normal_file:
    X_normal_non_europe = pickle.load(normal_file).to_numpy()

with open('pickles/pickled_datasets/seed_42/out_of_domain_X_bigbird_fine_tuned_chunks.pkl', 'rb') as bigbird_file:
    X_bigbird_fine_tuned_non_europe = pickle.load(bigbird_file)

with open('pickles/pickled_datasets/seed_42/out_of_domain_y_chunks.pkl', 'rb') as labels_file:
    y_non_europe = pickle.load(labels_file)

### Train Classifiers on `non_europe` Data

In [6]:
# Both non_europe classifiers use identical LogisticRegression settings.
# NOTE(review): in the recorded run the first fit stopped at the iteration
# limit ("TOTAL NO. of ITERATIONS REACHED LIMIT") after ~22 minutes — consider
# scaling the features or raising max_iter.
non_europe_logreg_params = {'random_state': seed, 'max_iter': 5000, 'verbose': 1}

clf_normal_non_europe = LogisticRegression(**non_europe_logreg_params).fit(
    X_normal_non_europe, y_non_europe
)
clf_bigbird_non_europe = LogisticRegression(**non_europe_logreg_params).fit(
    X_bigbird_fine_tuned_non_europe, y_non_europe
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =       119301     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.16568D+05    |proj g|=  6.11613D+04


 This problem is unconstrained.



At iterate   50    f=  7.78330D+04    |proj g|=  5.76286D+04

At iterate  100    f=  6.33450D+04    |proj g|=  1.71410D+04

At iterate  150    f=  5.59018D+04    |proj g|=  1.29523D+04

At iterate  200    f=  4.99981D+04    |proj g|=  2.92420D+04

At iterate  250    f=  4.64432D+04    |proj g|=  1.84267D+04

At iterate  300    f=  4.32104D+04    |proj g|=  1.63115D+04

At iterate  350    f=  3.89927D+04    |proj g|=  1.38743D+04

At iterate  400    f=  3.52083D+04    |proj g|=  1.22159D+04

At iterate  450    f=  3.26309D+04    |proj g|=  7.90397D+03

At iterate  500    f=  3.04920D+04    |proj g|=  9.82527D+03

At iterate  550    f=  2.82607D+04    |proj g|=  5.46332D+03

At iterate  600    f=  2.62642D+04    |proj g|=  1.22055D+04

At iterate  650    f=  2.45569D+04    |proj g|=  8.67473D+03

At iterate  700    f=  2.25384D+04    |proj g|=  5.04658D+03

At iterate  750    f=  2.14864D+04    |proj g|=  7.92410D+03

At iterate  800    f=  2.03114D+04    |proj g|=  5.32900D+03

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 22.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  3.60509D+04    |proj g|=  1.88118D+02

At iterate  100    f=  3.32251D+04    |proj g|=  8.77817D+01

At iterate  150    f=  3.23217D+04    |proj g|=  4.02327D+01

At iterate  200    f=  3.20197D+04    |proj g|=  6.21525D+01

At iterate  250    f=  3.19074D+04    |proj g|=  2.38797D+01

At iterate  300    f=  3.18625D+04    |proj g|=  1.94149D+01

At iterate  350    f=  3.18436D+04    |proj g|=  2.98009D+01

At iterate  400    f=  3.18351D+04    |proj g|=  1.75526D+01

At iterate  450    f=  3.18299D+04    |proj g|=  1.61658D+01

At iterate  500    f=  3.18268D+04    |proj g|=  1.02521D+01

At iterate  550    f=  3.18246D+04    |proj g|=  3.30755D+00

At iterate  600    f=  3.18230D+04    |proj g|=  2.24186D+00

At iterate  650    f=  3.18216D+04    |proj g|=  1.14586D+01

At iterate  700    f=  3.18204D+04    |proj g|=  2.79345D+00

At iterate  750    f=  3.18193D+04    |proj g|=  3.60076D+00

At iterate  800    f=  3.18183D+04    |proj g|=  1.76230D+01

At iter

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min finished


### Evaluate on `europe` Data with `non_europe` Classifiers

In [7]:
# Score the classifiers trained on non_europe data against the europe features
# extracted earlier in this section.
pred_normal_europe = clf_normal_non_europe.predict(X_normal_europe)
# Use `X_bigbird`, the europe embeddings computed in the feature-extraction
# cell above. `X_bigbird_fine_tuned_europe` is not defined until a later
# section, so referencing it here fails on a fresh top-to-bottom run.
pred_bigbird_europe = clf_bigbird_non_europe.predict(X_bigbird)

europe_labels = europe_df['label']
accuracy_normal_europe = accuracy_score(europe_labels, pred_normal_europe)
accuracy_bigbird_europe = accuracy_score(europe_labels, pred_bigbird_europe)

# NOTE(review): the recorded run reports accuracies below 0.05 for both models;
# confirm that `y_non_europe` and `europe_df['label']` share the same label
# set and encoding before interpreting these numbers.
print(f'Accuracy normal Europe: {accuracy_normal_europe}')
print(f'Accuracy Big Bird fine-tuned Europe: {accuracy_bigbird_europe}')

Accuracy normal Europe: 0.04701180053292729
Accuracy Big Bird fine-tuned Europe: 0.04225352112676056


## Train on `europe` Data and Evaluate on `europe` Data

#### Create Input

Fit a normal feature extractor on the `europe` data

In [6]:
# Rebinds `normal_feature_extractor`: from here on the name refers to a fresh
# extractor fitted on the europe texts, not the pickled one loaded earlier.
normal_feature_extractor = NormalFeatureExtractor()
# Expensive step — the recorded run spent over two hours in "Discovering Grammar Mistakes".
normal_feature_extractor.fit(europe_df.text.to_list())

Fitting Vectorizers


Discovering Substitutions: 100%|██████████| 5254/5254 [01:24<00:00, 62.05it/s]
Discovering Grammar Mistakes: 100%|██████████| 5254/5254 [2:15:49<00:00,  1.55s/it]  
Discovering POS Ngrams: 100%|██████████| 5254/5254 [06:51<00:00, 12.76it/s]


In [9]:
# Persist the europe-fitted extractor so the multi-hour fit need not be repeated.
europe_extractor_path = 'pickles/normal_feature_extractor_europe.pkl'
with open(europe_extractor_path, 'wb') as extractor_file:
    pickle.dump(normal_feature_extractor, extractor_file)

Extract features from text

In [10]:
# Rebinds `X_normal_europe` with features from the europe-fitted extractor
# (all feature groups, default arguments). Unlike the earlier extraction, the
# result is not converted with `.to_numpy()` here.
X_normal_europe = normal_feature_extractor.transform(europe_df.text.to_list())

Creating word ngram features...
Creating char ngram features...


Calculating edit distance: 100%|██████████| 5254/5254 [01:20<00:00, 65.36it/s]
Extracting Substitution Features: 100%|██████████| 5254/5254 [01:32<00:00, 56.90it/s]
Extracting Grammar Features: 100%|██████████| 5254/5254 [00:22<00:00, 231.40it/s]
Extracting Function Word Features: 5254it [00:54, 97.04it/s] 
Extracting POS Features: 5254it [07:39, 11.44it/s]
Extracting Average Sentence Length: 100%|██████████| 5254/5254 [00:01<00:00, 3461.62it/s]


In [12]:
# Cache the europe hand-crafted feature matrix on disk.
normal_features_path = 'pickles/pickled_datasets/seed_42/X_normal_europe.pkl'
with open(normal_features_path, 'wb') as features_file:
    pickle.dump(X_normal_europe, features_file)

Create embeddings from text

In [26]:
# Extractor built from the base 'google/bigbird-roberta-base' checkpoint
# (as opposed to the checkpoint under fine_tuned_models/ used elsewhere).
# The second argument (2048) is presumably the maximum sequence length — TODO confirm.
bigbird_feature_extractor = TransformerFeatureExtractor('google/bigbird-roberta-base', 2048)
# NOTE(review): the recorded output warns about a 4531-token input exceeding the
# model maximum; presumably the extractor truncates — confirm.
X_bigbird_europe = bigbird_feature_extractor.transform(europe_df.text.to_list())

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (4531 

In [28]:
# Cache the base Big Bird europe embeddings on disk.
bigbird_features_path = 'pickles/pickled_datasets/seed_42/X_bigbird_europe.pkl'
with open(bigbird_features_path, 'wb') as embeddings_file:
    pickle.dump(X_bigbird_europe, embeddings_file)

In [31]:
# NOTE(review): same checkpoint path and second argument as
# `big_bird_fine_tuned_non_europe_feature_extractor`, whose output on the same
# europe texts is already stored in `X_bigbird`. This looks like a recomputation
# (~23 minutes in the recorded run) — confirm whether `X_bigbird` can be reused.
bigbird_non_europe_fine_tuned_feature_extractor = TransformerFeatureExtractor('fine_tuned_models/out_of_domain_bigbird_roberta_base_clean_chunks', 2048)
X_bigbird_fine_tuned_europe = bigbird_non_europe_fine_tuned_feature_extractor.transform(europe_df.text.to_list())

Some weights of the model checkpoint at fine_tuned_models/out_of_domain_bigbird_roberta_base_clean_chunks were not used when initializing BigBirdModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (4531 > 4096). Running this sequence through the model will result in indexing errors
Extracting features: 100%|██████████| 5254/5254 [22:59<00:00,  3.81it/s]


In [32]:
# Cache the fine-tuned Big Bird europe embeddings on disk.
fine_tuned_features_path = 'pickles/pickled_datasets/seed_42/X_bigbird_fine_tuned_europe.pkl'
with open(fine_tuned_features_path, 'wb') as embeddings_file:
    pickle.dump(X_bigbird_fine_tuned_europe, embeddings_file)

#### Train Classifiers

In [33]:
def _head_tail_split(rows):
    """Return the first 80 % of `rows` and the remaining 20 %, in original order."""
    cut = int(len(rows) * 0.8)
    return rows[:cut], rows[cut:]


def _fit_europe_logreg(features, labels):
    """Fit a LogisticRegression with the settings shared by all europe classifiers."""
    return LogisticRegression(random_state=seed, max_iter=10000).fit(features, labels)


# Positional 80/20 split, applied identically to every feature set and to the labels.
# NOTE(review): the split is neither shuffled nor stratified; if `europe_df` is
# ordered (e.g. by label or source), the test portion may not be representative.
X_normal_europe_train, X_normal_europe_test = _head_tail_split(X_normal_europe)
X_bigbird_europe_train, X_bigbird_europe_test = _head_tail_split(X_bigbird_europe)
X_bigbird_fine_tuned_europe_train, X_bigbird_fine_tuned_europe_test = _head_tail_split(X_bigbird_fine_tuned_europe)
y_europe_train, y_europe_test = _head_tail_split(europe_df.label)

# One classifier per feature representation, all trained on the same labels.
clf_normal_europe = _fit_europe_logreg(X_normal_europe_train, y_europe_train)
clf_bigbird_europe = _fit_europe_logreg(X_bigbird_europe_train, y_europe_train)
clf_bigbird_fine_tuned_europe = _fit_europe_logreg(X_bigbird_fine_tuned_europe_train, y_europe_train)


#### Evaluate on `europe` Data 

In [34]:
# Hand-crafted features.
pred_normal_europe = clf_normal_europe.predict(X_normal_europe_test)
accuracy_normal = accuracy_score(y_europe_test, pred_normal_europe)

# Base Big Bird embeddings.
pred_bigbird_europe = clf_bigbird_europe.predict(X_bigbird_europe_test)
accuracy_bigbird = accuracy_score(y_europe_test, pred_bigbird_europe)

# Big Bird embeddings from the non_europe fine-tuned checkpoint.
pred_bigbird_fine_tuned_europe = clf_bigbird_fine_tuned_europe.predict(X_bigbird_fine_tuned_europe_test)
accuracy_bigbird_fine_tuned = accuracy_score(y_europe_test, pred_bigbird_fine_tuned_europe)

# Displayed as the cell result: (normal, Big Bird, Big Bird fine-tuned).
(accuracy_normal, accuracy_bigbird, accuracy_bigbird_fine_tuned)

(0.21027592768791628, 0.7678401522359658, 0.857278782112274)