In [9]:
import pickle
import numpy as np
import pandas as pd
from data.dataprocessor import DataProcessor
from feature_extractors.normal_feature_extractor import NormalFeatureExtractor
from feature_extractors.transformer_feature_extractor import TransformerFeatureExtractor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
seed = 42

### Load `non-europe` Data

In [130]:
X_normal_non_europe = pickle.load(open('pickles/pickled_datasets/seed_42/out_of_domain_X_normal_chunks.pkl', 'rb')).to_numpy()

X_bigbird_non_europe = pickle.load(open('pickles/pickled_datasets/seed_42/out_of_domain_X_bigbird_chunks.pkl', 'rb'))
X_bigbird_fine_tuned_non_europe = pickle.load(open('pickles/pickled_datasets/seed_42/out_of_domain_X_bigbird_fine_tuned_chunks.pkl', 'rb'))

y_non_europe = pickle.load(open('pickles/pickled_datasets/seed_42/out_of_domain_y_chunks.pkl', 'rb'))

In [132]:
X_normal_non_europe_train = X_normal_non_europe[:int(len(X_normal_non_europe)*0.8)]
X_normal_non_europe_test = X_normal_non_europe[int(len(X_normal_non_europe)*0.8):]

X_bigbird_non_europe_train = X_bigbird_non_europe[:int(len(X_bigbird_non_europe)*0.8)]
X_bigbird_non_europe_test = X_bigbird_non_europe[int(len(X_bigbird_non_europe)*0.8):]

X_bigbird_fine_tuned_non_europe_train = X_bigbird_fine_tuned_non_europe[:int(len(X_bigbird_fine_tuned_non_europe)*0.8)]
X_bigbird_fine_tuned_non_europe_test = X_bigbird_fine_tuned_non_europe[int(len(X_bigbird_fine_tuned_non_europe)*0.8):]

y_non_europe_train = y_non_europe[:int(len(y_non_europe)*0.8)]
y_non_europe_test = y_non_europe[int(len(y_non_europe)*0.8):]

### Train Classifiers on `non_europe` Data

In [136]:
clf_normal_non_europe = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_normal_non_europe_train, y_non_europe_train)
clf_bigbird_non_europe = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_bigbird_non_europe_train, y_non_europe_train)
clf_bigbird_fine_tuned_non_europe = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_bigbird_fine_tuned_non_europe_train, y_non_europe_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
with open('pickles/new_clf_normal_non_europe_seed_42_chunks.pkl', 'wb') as f:
    pickle.dump(clf_normal_non_europe, f)
    
with open('pickles/new_clf_bigbird_non_europe_seed_42_chunks.pkl', 'wb') as f:
    pickle.dump(clf_bigbird_non_europe, f)
    
with open('pickles/new_clf_bigbird_fine_tuned_non_europe_seed_42_chunks.pkl', 'wb') as f:
    pickle.dump(clf_bigbird_fine_tuned_non_europe, f)

In [137]:
with open('pickles/new_clf_normal_non_europe_seed_42_chunks.pkl', 'rb') as f:
    clf_normal_non_europe = pickle.load(f)

with open('pickles/new_clf_bigbird_non_europe_seed_42_chunks.pkl', 'rb') as f:
    clf_bigbird_non_europe = pickle.load(f)
    
with open('pickles/new_clf_bigbird_fine_tuned_non_europe_seed_42_chunks.pkl', 'rb') as f:
    clf_bigbird_fine_tuned_non_europe = pickle.load(f)

### Load `toefl` Data

In [100]:
df_toefl = pd.read_csv('./data/toefl/ETS_Corpus_of_Non-Native_Written_English/data/text/index.csv')

df_toefl_train = pd.read_csv('./data/toefl/ETS_Corpus_of_Non-Native_Written_English/data/text/index-training.csv', header=None, names=['Filename', 'Prompt', 'Language', 'Score Level'])
df_toefl_train['Source'] = ['train'] * len(df_toefl_train)

df_toefl_test = pd.read_csv('./data/toefl/ETS_Corpus_of_Non-Native_Written_English/data/text/index-test.csv', header=None, names=['Filename', 'Prompt', 'Score Level'])
df_toefl_test = df_toefl[df_toefl['Filename'].isin(df_toefl_test['Filename'])]
df_toefl_test['Source'] = ['test'] * len(df_toefl_test)

df_toefl = pd.merge(df_toefl_train, df_toefl_test, how='outer')
df_toefl.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Filename,Prompt,Language,Score Level,Source
0,278.txt,P6,DEU,medium,train
1,348.txt,P1,TUR,high,train
2,666.txt,P2,ZHO,medium,train
3,733.txt,P6,TEL,medium,train
4,976.txt,P2,ARA,low,train


In [101]:
df_toefl['Language'].unique()

array(['DEU', 'TUR', 'ZHO', 'TEL', 'ARA', 'SPA', 'HIN', 'JPN', 'KOR',
       'FRA', 'ITA'], dtype=object)

In [102]:
def read_file_contents(filename):
    try:
        with open(f'./data/toefl/ETS_Corpus_of_Non-Native_Written_English/data/text/responses/original/{filename}', 'r') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        return None
    
def assign_label(iso_lang):
    language2label = {
        'JPN': 0,
        'ARA': 0,
        'KOR': 0,
        'HIN': 0,
        'TEL': 0,
        'ZHO': 0,
        'DEU': 1,
        'FRA': 7,
        'ITA': 10,
        'SPA': 20,
        'TUR': 22
    }
    return language2label[iso_lang]

df_toefl['Text'] = df_toefl['Filename'].apply(read_file_contents)
df_toefl['Label'] = df_toefl['Language'].apply(assign_label)
df_toefl = df_toefl[df_toefl['Label'] != 0]
df_toefl.head()

Unnamed: 0,Filename,Prompt,Language,Score Level,Source,Text,Label
0,278.txt,P6,DEU,medium,train,IThe importance and popularity of travelling i...,1
1,348.txt,P1,TUR,high,train,"It is an important decision, how to plan your ...",22
5,1612.txt,P6,SPA,medium,train,"In my opinion, travel in group with a tour gui...",20
6,2024.txt,P3,DEU,medium,train,I thing the statement ''Young people nowadays ...,1
7,2664.txt,P2,DEU,high,train,Whether or not young people enjoy life more th...,1


In [103]:
df_toefl.Text.apply(lambda x: len(x.split(' '))).mean()

322.124

### Extract Features from `toefl` Data

In [104]:
normal_feature_extractor = pickle.load(open('pickles/normal_feature_extractor_seed_42_chunks.pkl', 'rb'))
big_bird_non_europe_feature_extractor = TransformerFeatureExtractor('google/bigbird-roberta-base', 2048)
big_bird_fine_tuned_non_europe_feature_extractor = TransformerFeatureExtractor('fine_tuned_models/out_of_domain_bigbird_roberta_base_clean_chunks', 2048)

def insert_grammar_features(all_features: np.array, grammar_features: np.array) -> np.array:
    return np.concatenate((all_features[:, :2300], grammar_features, all_features[:, 2300:]), axis=1)

X_normal_toefl = normal_feature_extractor.transform(df_toefl.Text.to_list(), grammar_mistakes=False).to_numpy()
grammar_features = normal_feature_extractor.get_grammar_features(df_toefl.Text.to_list())

X_normal_toefl = insert_grammar_features(X_normal_toefl, grammar_features)

X_bigbird = big_bird_non_europe_feature_extractor.transform(df_toefl.Text.to_list())
X_bigbird_fine = big_bird_fine_tuned_non_europe_feature_extractor.transform(df_toefl.Text.to_list())

Trying to unpickle estimator CountVectorizer from version 1.0.2 when using version 1.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
normalizer.cc(50) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BigBirdModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassi

Creating word ngram features...
Creating char ngram features...


Calculating edit distance: 100%|███████████| 5000/5000 [00:11<00:00, 422.58it/s]
Extracting Substitution Features: 100%|████| 5000/5000 [00:15<00:00, 319.34it/s]
Extracting Function Word Features: 5000it [00:07, 652.80it/s]
Extracting POS Features: 5000it [01:09, 72.28it/s]
Extracting Average Sentence Length: 100%|█| 5000/5000 [00:00<00:00, 18608.85it/s


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Extracting Grammar Features: 100%|██████████| 5000/5000 [07:58<00:00, 10.44it/s]
Extracting features: 100%|██████████████████| 5000/5000 [11:02<00:00,  7.54it/s]
Extracting features: 100%|██████████████████| 5000/5000 [11:24<00:00,  7.31it/s]


In [106]:
# with open('pickles/new_normal_toefl_seed_42_chunks.pkl', 'wb') as f:
#     pickle.dump(X_normal_toefl, f)
    
# with open('pickles/new_bigbird_toefl_seed_42_chunks.pkl', 'wb') as f:
#     pickle.dump(X_bigbird, f)
    
# with open('pickles/new_bigbird_fine_tuned_toefl_seed_42_chunks.pkl', 'wb') as f:
#     pickle.dump(X_bigbird_fine, f)

In [107]:
with open('pickles/new_normal_toefl_seed_42_chunks.pkl', 'rb') as f:
    X_normal_toefl = pickle.load(f)
    
with open('pickles/new_bigbird_toefl_seed_42_chunks.pkl', 'rb') as f:
    X_bigbird_toefl = pickle.load(f)
    
with open('pickles/new_bigbird_fine_tuned_toefl_seed_42_chunks.pkl', 'rb') as f:
    X_bigbird_fine_toefl = pickle.load(f)

In [120]:
toefl_train_ix = df_toefl.loc[df_toefl['Source'] == 'train'].index.to_list()
toefl_test_ix = df_toefl.loc[df_toefl['Source'] == 'test'].index.to_list()

X_normal_toefl_train = X_normal_non_europe[toefl_train_ix]
X_normal_toefl_test = X_normal_non_europe[toefl_test_ix]

X_bigbird_toefl_train = X_bigbird_non_europe[toefl_train_ix]
X_bigbird_toefl_test = X_bigbird_non_europe[toefl_test_ix]

X_bigbird_fine_tuned_toefl_train = X_bigbird_fine_tuned_non_europe[toefl_train_ix]
X_bigbird_fine_tuned_toefl_test = X_bigbird_fine_tuned_non_europe[toefl_test_ix]

y_toefl_train = df_toefl[df_toefl['Source'] == 'train']['Label'].to_list()
y_toefl_test = df_toefl[df_toefl['Source'] == 'test']['Label'].to_list()

In [121]:
clf_normal_toefl = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_normal_toefl_train, y_toefl_train)
clf_bigbird_toefl = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_bigbird_toefl_train, y_toefl_train)
clf_bigbird_fine_tuned_toefl = LogisticRegression(random_state=seed, max_iter=5000, verbose=1).fit(X_bigbird_fine_tuned_toefl_train, y_toefl_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


  F =   54889.456320398887     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        17687     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  9.32527D+04    |proj g|=  4.15286D+03

At iterate   50    f=  2.77199D+04    |proj g|=  1.43165D+02

At iterate  100    f=  2.50174D+04    |proj g|=  4.17377D+02

At iterate  150    f=  2.42141D+04    |proj g|=  2.79269D+02

At iterate  200    f=  2.39562D+04    |proj g|=  6.26058D+01

At iterate  250    f=  2.38600D+04    |proj g|=  1.72157D+01

At iterate  300    f=  2.38233D+04    |proj g|=  6.10771D+01

At iterate  350    f=  2.38076D+04    |proj g|=  9.85891D+00

At iterate  400    f=  2.38009D+04    |proj g|=  1.15009D+01

At iterate  450    f=  2.37973D+04    |proj g|=  6.37316D+00

At iterate  500    f=  2.37951D+04    |proj g|=  6.11802D+00

At iterate  550    f=  2.37935D+04    |proj g|=  2.23273

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s finished


In [163]:
df_scores = pd.DataFrame([], columns=['model', 'train', 'test', 'accuracy'])
i = 0
for outer_data in ['non_europe', 'toefl']:
    for inner_data in ['non_europe', 'toefl']:
        for clf in ['normal', 'bigbird', 'bigbird_fine_tuned']:
            eval(f"exec('pred_{clf}_{outer_data}_{inner_data} = clf_{clf}_{outer_data}.predict(X_{clf}_{inner_data}_test)')")
            if inner_data == 'non_europe':
                y_true = y_non_europe_test
            else:
                y_true = y_toefl_test
            acc = accuracy_score(y_true=y_true, y_pred=eval(f"pred_{clf}_{outer_data}_{inner_data}"))
#             print(f"Accuracy: {clf} \t" + ("\t" if 'fine' not in clf else "") + 
#                   f"({outer_data}) -> ({inner_data}) \t" + 
#                   ("\t" if inner_data == outer_data and inner_data == "toefl" else "") + 
#                   f" = {round(acc, 3):.3f}")
            df_scores.loc[i] = [clf, outer_data, inner_data, acc]
            i += 1

df_scores

Unnamed: 0,model,train,test,accuracy
0,normal,non_europe,non_europe,0.461807
1,bigbird,non_europe,non_europe,0.492335
2,bigbird_fine_tuned,non_europe,non_europe,0.644567
3,normal,non_europe,toefl,0.054
4,bigbird,non_europe,toefl,0.044
5,bigbird_fine_tuned,non_europe,toefl,0.052
6,normal,toefl,non_europe,0.042765
7,bigbird,toefl,non_europe,0.045186
8,bigbird_fine_tuned,toefl,non_europe,0.039672
9,normal,toefl,toefl,0.172


In [168]:
print(df_scores.pivot(index=["model", "train"], columns=["test"], values="accuracy").round(3).to_latex())

\begin{tabular}{llrr}
\toprule
       & test &  non\_europe &  toefl \\
model & train &             &        \\
\midrule
bigbird & non\_europe &       0.492 &  0.044 \\
       & toefl &       0.045 &  0.190 \\
bigbird\_fine\_tuned & non\_europe &       0.645 &  0.052 \\
       & toefl &       0.040 &  0.170 \\
normal & non\_europe &       0.462 &  0.054 \\
       & toefl &       0.043 &  0.172 \\
\bottomrule
\end{tabular}

