@context: UCU2023

## Preliminaries

In [2]:
import os
import numpy as np
import pandas as pd
from collections import Counter

import ast
from datasets import load_dataset

In [3]:
np.random.seed(42)

## Loading the dataset

In [10]:
# pathes
path_data = '/mnt/green-efs/andrii.liubonko/may/ucu/data'
path_data_save= '/mnt/green-efs/andrii.liubonko/may/ucu/data/final/'
os.listdir(path_data)

['ds_train__words_pos.csv',
 'final',
 'ds_sample_230202.cvs',
 'ds_words_pos',
 'openwebtext_10k_sent_with_attributes_clean.csv',
 'ds_test__words_pos.csv',
 'ds_val__words_pos.csv',
 'openwebtext_10k_sent_clean_3.csv']

In [11]:
ds_path = os.path.join(path_data, 'openwebtext_10k_sent_with_attributes_clean.csv')
df_data = pd.read_csv(ds_path)
df_data

Unnamed: 0,text,Number of symbols,Number of words,Parts of speech,Sentiment analysis,Tense
0,A magazine supplement with an image of Adolf H...,113,22,"['DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', ...",1,{'present'}
1,"No law bans “ Mein Kampf ” in Germany , but th...",119,24,"['DET', 'NOUN', 'VERB', 'PUNCT', 'PROPN', 'PRO...",1,{'present'}
2,The city that was the center of Adolf Hitler ’...,236,43,"['DET', 'NOUN', 'PRON', 'AUX', 'DET', 'NOUN', ...",0,"{'present', 'past'}"
3,"What it does n’t have , nor has it since 1945 ...",148,31,"['PRON', 'PRON', 'AUX', 'PROPN', 'AUX', 'PUNCT...",0,{'present'}
4,The latest attempt to publish excerpts fizzled...,189,30,"['DET', 'ADJ', 'NOUN', 'PART', 'VERB', 'NOUN',...",0,{'past'}
...,...,...,...,...,...,...
369837,"After having bought some Imperial officers , h...",132,25,"['ADP', 'AUX', 'VERB', 'DET', 'ADJ', 'NOUN', '...",1,{'past'}
369838,He came back to a depopulated and devastated R...,72,14,"['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'VERB', ...",0,{'past'}
369839,Subsequently the Pope followed a policy of sub...,233,43,"['ADV', 'DET', 'PROPN', 'VERB', 'DET', 'NOUN',...",1,{'past'}
369840,Pope Clement VII is remembered for having orde...,156,29,"['PROPN', 'PROPN', 'PROPN', 'AUX', 'VERB', 'AD...",1,"{'present', 'past'}"


## Transformations

In [16]:
df_data = df_data[df_data.apply(lambda row: row['Tense'] != 'set()', axis=1)]
df_data.loc[:, 'Tense'] = df_data['Tense'].apply(ast.literal_eval)

In [17]:
df_data = df_data[df_data.apply(lambda row: row['Tense'] != 'Not detected', axis=1)]
df_data = df_data[df_data.apply(lambda row: len(row['Tense']) == 1, axis=1)]
df_data['Tense'] = df_data['Tense'].apply(lambda x: list(x)[0])

In [18]:
df_data

Unnamed: 0,text,Number of symbols,Number of words,Parts of speech,Sentiment analysis,Tense
0,A magazine supplement with an image of Adolf H...,113,22,"['DET', 'NOUN', 'NOUN', 'ADP', 'DET', 'NOUN', ...",1,present
1,"No law bans “ Mein Kampf ” in Germany , but th...",119,24,"['DET', 'NOUN', 'VERB', 'PUNCT', 'PROPN', 'PRO...",1,present
3,"What it does n’t have , nor has it since 1945 ...",148,31,"['PRON', 'PRON', 'AUX', 'PROPN', 'AUX', 'PUNCT...",0,present
4,The latest attempt to publish excerpts fizzled...,189,30,"['DET', 'ADJ', 'NOUN', 'PART', 'VERB', 'NOUN',...",0,past
5,But in Germany — where keeping a tight lid on ...,223,41,"['CCONJ', 'ADP', 'PROPN', 'PUNCT', 'SCONJ', 'V...",1,present
...,...,...,...,...,...,...
369835,"The innumerable series of murders , rapes and ...",124,20,"['DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'PUNCT',...",0,past
369836,Clement was kept as a prisoner in Castel Sant’...,69,13,"['PROPN', 'AUX', 'VERB', 'ADP', 'DET', 'NOUN',...",1,past
369837,"After having bought some Imperial officers , h...",132,25,"['ADP', 'AUX', 'VERB', 'DET', 'ADJ', 'NOUN', '...",1,past
369838,He came back to a depopulated and devastated R...,72,14,"['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'VERB', ...",0,past


In [19]:
print(Counter(df_data['Tense']))
print(Counter(df_data['Sentiment analysis']))

Counter({'past': 100743, 'present': 87010, 'future': 29650, 'Not detected': 13418})
Counter({1: 149917, 0: 48833, 2: 32071})


In [24]:
df_data = df_data[df_data.apply(lambda row: row['Tense'] != 'Not detected', axis=1)]

In [25]:
indexes = (df_data['Sentiment analysis'] == 2) | (df_data['Tense'] == 'future')
indexes_not = (df_data['Sentiment analysis'] != 2) & (df_data['Tense'] != 'future')

df_part = df_data[indexes]

n_final = 110000
n_rest = n_final - len(df_part)

df_rest = df_data[indexes_not]

print(len(df_rest))
print(len(df_part))

161371
56032


In [26]:
print(Counter(df_part['Sentiment analysis']))
print(Counter(df_part['Tense']))

Counter({2: 31029, 1: 19511, 0: 5492})
Counter({'future': 29650, 'present': 15163, 'past': 11219})


In [27]:
print(Counter(df_rest['Sentiment analysis']))
print(Counter(df_rest['Tense']))

Counter({1: 119105, 0: 42266})
Counter({'past': 89524, 'present': 71847})


In [28]:
tense_samples = {'present': 70000, 'past': 70000}
sentiment_samples = {1: 20000, 0: 35000}

# sample based on 'tense'
tense_sampled = pd.concat(
    [group.sample(n=tense_samples[tense], random_state=42) for tense, group in df_rest.groupby('Tense')]
).reset_index(drop=True)

# sample based on 'sentiment'
sampled_df = pd.concat(
    [group.sample(n=sentiment_samples[sentiment], random_state=42) for sentiment, group in tense_sampled.groupby('Sentiment analysis')]
).reset_index(drop=True)

In [29]:
df_new = pd.concat([df_part, sampled_df], axis=0).reset_index(drop=True)
df_new

Unnamed: 0,text,Number of symbols,Number of words,Parts of speech,Sentiment analysis,Tense
0,You can find it so easily . ”,29,8,"['PRON', 'AUX', 'VERB', 'PRON', 'ADV', 'ADV', ...",2,future
1,"not only in Germany , this should be equal in ...",73,15,"['PART', 'ADV', 'ADP', 'PROPN', 'PUNCT', 'PRON...",1,future
2,"The debate will soon be over , whether or not ...",89,18,"['DET', 'NOUN', 'AUX', 'ADV', 'AUX', 'ADV', 'P...",1,future
3,“ I do very well without any publishing of ‘ M...,137,30,"['PUNCT', 'PRON', 'VERB', 'ADV', 'ADV', 'ADP',...",2,past
4,"In a few years , it will be free , and I have ...",113,27,"['ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'PRON',...",2,future
...,...,...,...,...,...,...
111027,Keep in mind that such a force is already larg...,203,41,"['VERB', 'ADP', 'NOUN', 'SCONJ', 'DET', 'DET',...",1,present
111028,The Second Empire was very unusual in that it ...,98,19,"['DET', 'PROPN', 'PROPN', 'AUX', 'ADV', 'ADJ',...",1,past
111029,But I get a kick every time I see something li...,133,27,"['CCONJ', 'PRON', 'VERB', 'DET', 'NOUN', 'DET'...",1,past
111030,Since leading the Broncos to a snowy Sunday ni...,135,25,"['SCONJ', 'VERB', 'DET', 'PROPN', 'ADP', 'DET'...",1,present


In [30]:
print(Counter(df_new['Sentiment analysis']))
print(Counter(df_new['Tense']))

Counter({0: 40492, 1: 39511, 2: 31029})
Counter({'present': 42939, 'past': 38443, 'future': 29650})


## Create train & val partitions

In [31]:
ds_size = len(df_new)
n_val = 11032

# to ensure that we generate the same data 
rng = np.random.default_rng(42)

# create indexes
indexes_perm = rng.permutation(ds_size)
indexes = {'val': indexes_perm[:n_val],
           'train': indexes_perm[n_val:]}

In [20]:
keep_columns = ['text', 'Number of words', 'Parts of speech', 'Sentiment analysis', 'Tense']
keep_columns_new = ['text', 'words', 'pos', 'sentiment', 'tense']

In [41]:
# save all subsets
for ds_partition in ['train', 'val']:
  # create partition
  df_current = df_new.loc[indexes[ds_partition]][keep_columns].reset_index(drop=True)
  df_current.rename(columns={o:n for o,n in zip(keep_columns, keep_columns_new)}, inplace=True)

  # join into one string all pos
  df_current['pos'] = df_current['pos'].apply(lambda x: ' '.join(x[2:-2].split("', '")))

  # these what we expect
  expected_data_indexes = df_current.apply(lambda row: (len(row['text'].split(' ')) == len(row['pos'].split(' ')) == row['words']), axis=1)
  df_current = df_current[expected_data_indexes].reset_index(drop=True)

  # Replace the values in the 'sentiment' column
  sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
  df_current['sentiment'] = df_current['sentiment'].replace(sentiment_mapping)

  # # save
  # path_save = f'{path_data_save}/{ds_partition}.csv'
  # df_current.to_csv(path_save, index=False)  

In [39]:
df_current

Unnamed: 0,text,words,pos,sentiment,tense
0,"Earlier in his career , Ellison apologized for...",66,ADV ADP PRON NOUN PUNCT PROPN VERB ADP CCONJ V...,negative,past
1,The race has featured months of rancor between...,26,DET NOUN AUX VERB NOUN ADP NOUN ADP PROPN PUNC...,neutral,present
2,This is the wall that keeps many qualified peo...,15,PRON AUX DET NOUN PRON VERB ADJ ADJ NOUN ADP V...,negative,present
3,Ignore all the rest of the stuff for The Lost ...,23,VERB DET DET NOUN ADP DET NOUN ADP DET PROPN P...,negative,present
4,” 18 The move away from damaging authoritarian...,32,PUNCT NUM DET NOUN ADV ADP VERB NOUN ADP DET A...,neutral,present
...,...,...,...,...,...
99995,Rio may not be the best environment for the Ol...,17,PROPN AUX PART AUX DET ADJ NOUN ADP DET PROPN ...,negative,future
99996,While the independent unions were certainly re...,28,SCONJ DET ADJ NOUN AUX ADV VERB ADP DET PROPN ...,negative,past
99997,He naturally felt very passionate and took to ...,10,PRON ADV VERB ADV ADJ CCONJ VERB ADP NOUN PUNCT,positive,past
99998,“ With over $ 155 trillion of cross - border p...,47,PUNCT ADP ADP SYM NUM NUM ADP ADJ PUNCT NOUN N...,positive,present


In [40]:
print(Counter(df_current['sentiment']))
print(Counter(df_current['tense']))

Counter({'negative': 36488, 'neutral': 35635, 'positive': 27877})
Counter({'present': 38537, 'past': 34713, 'future': 26750})


In [42]:
print(Counter(df_current['sentiment']))
print(Counter(df_current['tense']))

Counter({'negative': 4004, 'neutral': 3876, 'positive': 3152})
Counter({'present': 4402, 'past': 3730, 'future': 2900})


## Load partitioned data

In [33]:
dataset = load_dataset("csv", 
                       data_files={"train": f'{path_data_save}/train.csv', 
                                   "val": f'{path_data_save}/val.csv'});

Found cached dataset csv (/home/andrii.liubonko/.cache/huggingface/datasets/csv/default-457e66cc73b296f1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'words', 'pos', 'sentiment', 'tense'],
        num_rows: 100000
    })
    val: Dataset({
        features: ['text', 'words', 'pos', 'sentiment', 'tense'],
        num_rows: 11032
    })
})

In [27]:
df = pd.DataFrame(dataset['val'])

In [29]:
df[df['words'] == 10]

Unnamed: 0,text,words,pos,sentiment,tense
60,Being a real good tester requires a specific m...,10,AUX DET ADJ ADJ NOUN VERB DET ADJ NOUN PUNCT,2,present
65,All of the trees were over 30 years old .,10,PRON ADP DET NOUN AUX ADP NUM NOUN ADJ PUNCT,1,past
79,and you can also stop reading shortly after th...,10,CCONJ PRON AUX ADV VERB VERB ADV ADP PRON PUNCT,0,future
93,But the Blues struggled after acquiring The Gr...,10,CCONJ DET PROPN VERB ADP VERB DET PROPN NUM PUNCT,0,past
236,Bogaard wants salmon restoration based on the ...,10,PROPN VERB NOUN NOUN VERB ADP DET ADJ NOUN PUNCT,1,present
...,...,...,...,...,...
10955,"During the delay , Batman gets on the horn .",10,ADP DET NOUN PUNCT PROPN VERB ADP DET NOUN PUNCT,1,present
10956,I said ' I do n't feel comfortable . ',10,PRON VERB PUNCT PRON AUX PART VERB ADJ PUNCT P...,0,past
11005,Retailers : This issue will ship with two cove...,10,NOUN PUNCT DET NOUN AUX VERB ADP NUM NOUN PUNCT,1,future
11011,Some of these asRNAs may afterward gain a func...,10,PRON ADP DET PROPN AUX ADV VERB DET NOUN PUNCT,1,future


In [112]:
samples = Counter(df[df['words'] == 12]['pos']).most_common()
#samples