In [1]:
!pip install transformers
!pip install simpletransformers
!pip install scikit-learn
!pip install wandb

!echo "--- Disk ---"
!df -h
!echo ""
!echo "--- CPU ---"
!cat /proc/cpuinfo
!echo ""
!echo "--- Memory ---"
!cat /proc/meminfo
!echo ""
!echo "--- GPU ---"
!nvidia-smi -L

# Mount Google Drive with input data
from google.colab import drive
drive.mount('/content/drive')

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->simpletransformers)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->

In [7]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics as metrics
import pandas as pd
import random as rand
import wandb

# initialize for deterministic results
seed = 0
rand.seed(seed)

# load data
path = '/content/drive/My Drive/Colab Notebooks/Liter/correlations/correlationdata.csv'
data = pd.read_csv(path, sep = ',')
data = data.sample(frac=1, random_state=seed)
data.columns = ['dataid', 'datapath', 'nrrows', 'nrvals1', 'nrvals2',
                'type1', 'type2', 'column1', 'column2', 'method',
                'coefficient', 'pvalue', 'time']

# divide data into subsets
pearson = data[data['method']=='pearson']
spearman = data[data['method']=='spearman']
theilsu = data[data['method']=='theilsu']

# generate and print data statistics
nr_ps = len(pearson.index)
nr_sm = len(spearman.index)
nr_tu = len(theilsu.index)
print(f'#Samples for Pearson: {nr_ps}')
print(f'#Samples for Spearman: {nr_sm}')
print(f'#Samples for Theil\'s u: {nr_tu}')

# |coefficient>0.5| -> label 1
def coefficient_label(row):
  if abs(row['coefficient']) > 0.5:
    return 1
  else:
    return 0
pearson['label'] = pearson.apply(coefficient_label, axis=1)
spearman['label'] = spearman.apply(coefficient_label, axis=1)
theilsu['label'] = theilsu.apply(coefficient_label, axis=1)

rc_p = len(pearson[pearson['label']==1].index)/nr_ps
rc_s = len(spearman[spearman['label']==1].index)/nr_sm
rc_u = len(theilsu[theilsu['label']==1].index)/nr_tu
print(f'Ratio correlated - Pearson: {rc_p}')
print(f'Ratio correlated - Spearman: {rc_s}')
print(f'Ratio correlated - Theil\s u: {rc_u}')

# split data into training and test set
def def_split(data):
  x_train, x_test, y_train, y_test = train_test_split(
      pearson[['column1', 'column2']], pearson['label'],
      test_size=0.2, random_state=seed)
  train = pd.concat([x_train, y_train], axis=1)
  test = pd.concat([x_test, y_test], axis=1)
  return train, test

def ds_split(data):
  counts = data['dataid'].value_counts()
  print(f'Counts: {counts}')
  print(f'Count.index: {counts.index}')
  print(f'Count.index.values: {counts.index.values}')
  print(f'counts.shape: {counts.shape}')
  print(f'counts.iloc[0]: {counts.iloc[0]}')
  nr_vals = len(counts)
  nr_test_ds = int(nr_vals * 0.2)
  print(f'Nr. test data sets: {nr_test_ds}')
  ds_ids = counts.index.values.tolist()
  print(type(ds_ids))
  print(ds_ids)
  test_ds = rand.sample(ds_ids, nr_test_ds)
  print(f'TestDS: {test_ds}')
  def is_test(row):
    if row['dataid'] in test_ds:
      return True
    else:
      return False
  data['istest'] = data.apply(is_test, axis=1)
  train = data[data['istest'] == False]
  test = data[data['istest'] == True]
  print(f'train.shape: {train.shape}')
  print(f'test.shape: {test.shape}')
  print(train)
  print(test)
  return train[['column1', 'column2', 'label']], test[['column1', 'column2', 'label']]

train, test = ds_split(pearson)
train.columns = ['text_a', 'text_b', 'labels']
test.columns = ['text_a', 'text_b', 'labels']
print(train.head())
print(test.head())

output_dir='/content/drive/My Drive/Colab Notebooks/Liter/correlations/models'

model_args = ClassificationArgs(num_train_epochs=1, train_batch_size=40,
                                overwrite_output_dir=True, manual_seed=seed,
                                evaluate_during_training=True, no_save=False,
                                wandb_project='CorrelationPredictionv2',
                                output_dir=output_dir)
model = ClassificationModel("roberta", "roberta-base", weight=[1, 2],
                            use_cuda = True, args=model_args)
model.train_model(train_df=train, eval_df=test, acc=metrics.accuracy_score,
    rec=metrics.recall_score, pre=metrics.precision_score, f1=metrics.f1_score)
wandb.join()

#Samples for Pearson: 59935
#Samples for Spearman: 59935
#Samples for Theil's u: 119383


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pearson['label'] = pearson.apply(coefficient_label, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spearman['label'] = spearman.apply(coefficient_label, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  theilsu['label'] = theilsu.apply(coefficient_label, axis=1)


Ratio correlated - Pearson: 0.3370985234003504
Ratio correlated - Spearman: 0.35636940018353214
Ratio correlated - Theil\s u: 0.5734484809394973
Counts: dataid
3357    45
624     45
3034    45
4276    45
3430    45
        ..
1137     1
2839     1
2491     1
2421     1
864      1
Name: count, Length: 2764, dtype: int64
Count.index: Index([3357,  624, 3034, 4276, 3430,  466, 3602, 3355,   81,  605,
       ...
        309, 2840,  196, 1604, 1047, 1137, 2839, 2491, 2421,  864],
      dtype='int64', name='dataid', length=2764)
Count.index.values: [3357  624 3034 ... 2491 2421  864]
counts.shape: (2764,)
counts.iloc[0]: 45
Nr. test data sets: 552
<class 'list'>
[3357, 624, 3034, 4276, 3430, 466, 3602, 3355, 81, 605, 3567, 2201, 3604, 3469, 504, 3477, 2271, 3588, 3465, 3409, 2598, 468, 3525, 3394, 3563, 3504, 460, 2195, 1347, 505, 475, 614, 3375, 562, 3565, 470, 583, 3382, 2604, 3496, 618, 3489, 3368, 2751, 592, 1138, 464, 502, 3310, 582, 3423, 3361, 3581, 3490, 3315, 575, 424, 436, 2720, 39

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['istest'] = data.apply(is_test, axis=1)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train.shape: (47748, 15)
test.shape: (12187, 15)
        dataid                                           datapath  nrrows  \
231947    4120                   ../data/DS175/column_2C_weka.csv     310   
16651      418                 ../data/DS1654/station_vitoria.csv      59   
40486      611   ../data/DS1064/2020/10/2020.10.19/2020.10.19.csv  988965   
136744    2700                   ../data/DS1652/telecom_churn.csv    3333   
236252    4207  ../data/DS718/Forbes Richest Atheletes (Forbes...     277   
...        ...                                                ...     ...   
108631    2089  ../data/DS1144/MoviesOnStreamingPlatforms_upda...    3371   
17089      421                  ../data/DS1654/station_macapa.csv      53   
52620      832                   ../data/DS1635/UniversalBank.csv    5000   
112420    2165                             ../data/DS204/data.csv     569   
122579    2403  ../data/DS1598/election-2020-01-11/tweet_activ...      31   

        nrvals1  nrvals2  

  0%|          | 0/95 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mvincent-konstantin-kapp[0m ([33mvincent-konstantin-kapp-university-of-hamburg[0m). Use [1m`wandb login --relogin`[0m to force relogin


  scaler = amp.GradScaler()


Running Epoch 1 of 1:   0%|          | 0/1194 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/24 [00:00<?, ?it/s]

  with amp.autocast():


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,▇▅█▆▃▇▅▆▂▃▁▆▃▂▅▃▂▃▃▇▂▆▅
acc,▁
accuracy,▁
auprc,▁
auroc,▁
eval_loss,▁
f1,▁
f1_score,▁
fn,▁
fp,▁

0,1
Training loss,0.48007
acc,0.7764
accuracy,0.7764
auprc,0.78939
auroc,0.86385
eval_loss,0.47385
f1,0.70377
f1_score,0.7621
fn,929.0
fp,1796.0


In [9]:
import torch
from simpletransformers.classification import ClassificationModel
import sklearn.metrics as metrics

model = ClassificationModel('roberta', '/content/drive/My Drive/Colab Notebooks/Liter/correlations/models/checkpoint-1194-epoch-1')
result, outputs, failures = model.eval_model(
    test, acc=metrics.accuracy_score, rec=metrics.recall_score,
    pre=metrics.precision_score, f1=metrics.f1_score)
print(result)
test_samples = []
for idx, r in test.iterrows():
  test_samples.append([r['text_a'], r['text_b']])
pred = model.predict(test_samples)
test['pred'] = pred[0]
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
print(test)

  0%|          | 0/24 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/122 [00:00<?, ?it/s]

  with amp.autocast():


{'mcc': 0.5328235424439259, 'accuracy': 0.7764010831213588, 'f1_score': 0.7621002426128618, 'tp': 3237, 'tn': 6225, 'fp': 1796, 'fn': 929, 'auroc': 0.8638494738637049, 'auprc': 0.7893880077443259, 'acc': 0.7764010831213588, 'rec': 0.7770043206913106, 'pre': 0.6431551758394596, 'f1': 0.7037721491466463, 'eval_loss': 0.45708126538112515}


  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

              text_a       text_b  labels  pred
227113             4            2       1     1
227179             7            5       1     1
186517           0.8          0.5       0     0
238308         Close          Low       1     1
211359  all_suicides   Unnamed: 0       0     0
...              ...          ...     ...   ...
122144   engagements  impressions       1     1
179026           0.6          0.5       0     1
168691           0.9          0.4       0     0
170584           0.9          0.5       0     0
211543   vet_males_p    vet_pop_p       0     0

[12187 rows x 4 columns]


In [11]:
print(result)

{'mcc': 0.5328235424439259, 'accuracy': 0.7764010831213588, 'f1_score': 0.7621002426128618, 'tp': 3237, 'tn': 6225, 'fp': 1796, 'fn': 929, 'auroc': 0.8638494738637049, 'auprc': 0.7893880077443259, 'acc': 0.7764010831213588, 'rec': 0.7770043206913106, 'pre': 0.6431551758394596, 'f1': 0.7037721491466463, 'eval_loss': 0.45708126538112515}


In [12]:
print(test)

              text_a       text_b  labels  pred
227113             4            2       1     1
227179             7            5       1     1
186517           0.8          0.5       0     0
238308         Close          Low       1     1
211359  all_suicides   Unnamed: 0       0     0
...              ...          ...     ...   ...
122144   engagements  impressions       1     1
179026           0.6          0.5       0     1
168691           0.9          0.4       0     0
170584           0.9          0.5       0     0
211543   vet_males_p    vet_pop_p       0     0

[12187 rows x 4 columns]
