In [1]:
from simpletransformers.classification import ClassificationModel
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy import stats
import wandb


In [2]:
#subscribers data
daryo_n = [597660, 635395, 876932, 944683, 970976, 974959, 1001620, 987818, 955630, 930500]
kun_n = [866120, 897421, 1234502, 1327874, 1400304, 1419048, 1468400, 1467269, 1487835, 1533702]
qalampir_n = [84189, 91990, 132252, 141007, 162540, 167763, 182913, 186740, 185759, 183327]

view_period=7 # days 

In [3]:
def create_fields(data, source):
    dates = pd.DatetimeIndex(data['date'])
    n_views = data['num_views'].values

    labels = []
    day_of_week = []
    for i in range(0, len(dates)):

        daily_rate = n_views[i]/view_period
        average_daily_rate = int(source[dates.month[i]-1]/100) # scaling down it a bit

        labels.append(daily_rate/average_daily_rate)
        day_of_week.append(dates[i].dayofweek)
    
    return labels, day_of_week

In [30]:
daryo = pd.read_json("daryo_articles.jsonl", orient='columns', lines=False)
kun = pd.read_json("kun_articles.jsonl", orient='columns', lines=True)
qalampir = pd.read_json("qalampir_articles.jsonl", orient='columns', lines=True)

labels, day_of_week = create_fields(daryo, daryo_n)
    
daryo.insert(len(daryo.columns), "labels", labels, True) 
daryo.insert(len(daryo.columns)-1, "day_of_week", day_of_week, True)
daryo.insert(2, "source", len(daryo)*[1], True) 

labels, day_of_week = create_fields(kun, kun_n)
    
kun.insert(len(kun.columns), "labels", labels, True) 
kun.insert(len(kun.columns)-1, "day_of_week", day_of_week, True)
kun.insert(2, "source", len(kun)*[2], True) 

labels, day_of_week = create_fields(qalampir, qalampir_n)
    
qalampir.insert(len(qalampir.columns), "labels", labels, True) 
qalampir.insert(len(qalampir.columns)-1, "day_of_week", day_of_week, True)
qalampir.insert(2, "source", len(qalampir)*[3], True) 


In [31]:
# output[0:100].to_json('output.jsonl', orient='records', lines=True)

In [32]:
def _format(data):
    return pd.DataFrame({
        'text_a': '[CLS] ' + data['content'],
        'text_b': data['title'],
        'text_c': data['num_links'],
        'text_d': data['num_images'],
        'text_e': data['day_of_week'],
        'text_f': data['num_quotes'],
        'labels': data['labels']
    })

In [33]:
def scale(train_df, column):
    return (train_df[column]-train_df[column].min())/(train_df[column].max()-train_df[column].min())


In [34]:
# scale it
def scale_fields(train_df):
    train_df['text_c'] = scale(train_df, 'text_c')
    train_df['text_d'] = scale(train_df, 'text_d')
    train_df['text_e'] = scale(train_df, 'text_e')
    train_df['text_f'] = scale(train_df, 'text_f')
    train_df['labels'] = scale(train_df, 'labels')
    return train_df



In [35]:
daryo = scale_fields(_format(daryo))
kun = scale_fields(_format(kun))
qalampir = scale_fields(_format(qalampir))

train_df = [daryo, kun, qalampir]
train_df = pd.concat(train_df)

In [36]:
# train_df.sort_values(by=['labels'], ascending=False)
# train_df['labels'].mean()

In [37]:
from sklearn.model_selection import train_test_split
train_df = train_df.sample(frac=1)
train, test = train_test_split(train_df, test_size=0.2)

In [39]:
hyperparameter_defaults = dict(
    bs = 64,
    lr = 4e-5,
    epochs = 1,
    sequence_length = 256,
)

wandb.init(config= hyperparameter_defaults,project="duvduvgap")
config = wandb.config
wandb.config.experiment = "xlmroberta"

train_args={
    "output_dir": "outputs/xlm",
    "cache_dir": "cache/",
    "best_model_dir": "outputs/best_model/xlm",
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'num_train_epochs': config.epochs,
    'regression': True,
    'max_seq_length': config.sequence_length,
    'n_gpu': 3,
    "learning_rate": config.lr,
    "train_batch_size": config.bs,
    "eval_batch_size": config.bs,
}

# Create a ClassificationModel
model = ClassificationModel("xlmroberta", "xlm-roberta-large", num_labels=1, use_cuda=True, cuda_device=0, args=train_args)

# Train the model
model.train_model(train)

[34m[1mwandb[0m: Wandb version 0.10.7 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model chec

RuntimeError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 31.75 GiB total capacity; 30.38 GiB already allocated; 81.19 MiB free; 30.50 GiB reserved in total by PyTorch)

In [26]:
result, model_outputs, wrong_predictions = model.eval_model(test)
test.insert(len(test.columns), "prediction", model_outputs, True) 
test = test.sort_values(by=['prediction'], ascending=False)

HBox(children=(FloatProgress(value=0.0, max=4942.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=155.0, style=ProgressStyle(descr…




In [27]:
print(result)

{'eval_loss': 0.004843581437824234}


In [28]:
def analysis(data):
    print(f"Correlation between labels and predictions: {stats.pearsonr(data['labels'], data['prediction'])}\n")
    print(f"Number of links and predictions: {stats.pearsonr(data['text_c'], data['prediction'])}\n")
    print(f"Number of images and predictions: {stats.pearsonr(data['text_d'], data['prediction'])}\n")
    print(f"Day of the week and predictions: {stats.pearsonr(data['text_e'], data['prediction'])}\n")
    print(f"Number of quotes and predictions: {stats.pearsonr(data['text_f'], data['prediction'])}\n")


In [29]:
analysis(test)

Correlation between labels and predictions: (0.06737550198291116, 2.1304025773193423e-06)

Number of links and predictions: (-0.05341946828949709, 0.00017190268995630225)

Number of images and predictions: (-0.0702293740201778, 7.735154678387084e-07)

Day of the week and predictions: (-0.0031420365413360324, 0.8252264954097338)

Number of quotes and predictions: (-0.12748373386952003, 2.3305734976435114e-19)

