# Predictions - subject model ⚠ Use Python 3.9.13 64-bit kernel

In [1]:
from transformers import pipeline
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from alive_progress import alive_bar
import pandas as pd
from time import sleep

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "dccuchile/bert-base-spanish-wwm-cased"
model_tunned = AutoModelForSequenceClassification.from_pretrained("./model2_subject/bert_wwm/", num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
pipe = pipeline("text-classification", model=model_tunned, tokenizer=tokenizer)

pipe("veneco")

[{'label': 'migrants', 'score': 0.9996597766876221}]

In [4]:
y_in = pd.read_csv('./../data/texts/colombian_valid_tweets.csv')
print(len(y_in))

1664903


In [5]:
predictions = []

### Checking previous predictions ⚠

In [5]:
y_prev_predicted = pd.read_csv('./../data/texts/colombian_valid_tweets_subject_predictions_bert_wwm.csv')
print(len(y_prev_predicted))

1500002


In [None]:
#checking previous predictions
with alive_bar(len(y_prev_predicted), force_tty = True) as bar:
  print("Checking tweets already predicted")
  y_prev_predicted.reset_index(drop=True, inplace=True)
  for index,serie in y_prev_predicted.iterrows():
    if y_in.loc[index]['Id'] != y_prev_predicted.loc[index]['Id'] :
      y_prev_predicted.drop(range(index), inplace=True)
      y_prev_predicted.reset_index(drop=True, inplace=True)
      print("Error found on:",index, "deleted, please run again")
      break
    bar()


In [6]:
y_in.drop(range(len(y_prev_predicted)), inplace=True)
predictions = y_prev_predicted.to_dict('records')

### Predict ✔

In [7]:
with alive_bar(len(y_in), force_tty = True) as bar:
  print("Starting to predict")

  for index,serie in y_in.iterrows():
    try:
      referred_to = pipe(serie['text'])[0]['label']
    except Exception as r:
      referred_to = "ERROR"
    predictions.append({'Id':serie['Id'], 'text':serie['text'], 'date':serie['date'],
                        'referred_to':referred_to})

    if bar.current() % 20000 == 0 and bar.current() != 0: #export every n tweets → checkpoint
      pd.DataFrame.from_dict(predictions).to_csv('./../data/texts/colombian_v alid_tweets_subject_predictions_bert_wwm.csv', index=False)
      print("Checkpoint saved, sleeping for 1 minute")
      sleep(60)# sleep for 60 seconds to avoid my pc exploding
      
    bar()
print("Finished the predictions")
pd.DataFrame.from_dict(predictions).to_csv('./../data/texts/colombian_valid_tweets_subject_predictions_bert_wwm.csv', index=False)

on 0: Starting to predict                                                                                               
on 20000: Checkpoint saved, sleeping for 1 minute                                                                       
on 40000: Checkpoint saved, sleeping for 1 minute                                                                       
on 60000: Checkpoint saved, sleeping for 1 minute                                                                       
on 80000: Checkpoint saved, sleeping for 1 minute                                                                       
on 100000: Checkpoint saved, sleeping for 1 minute                                                                      
on 120000: Checkpoint saved, sleeping for 1 minute                                                                      
on 140000: Checkpoint saved, sleeping for 1 minute                                                                      
on 160000: Checkpoint saved, sle

# Predictions - tone model

In [None]:
from transformers import pipeline
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from alive_progress import alive_bar
import pandas as pd
from time import sleep

In [None]:
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
model_tunned = AutoModelForSequenceClassification.from_pretrained("test_trainer_tone/checkpoint-500/", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
pipe = pipeline("text-classification", model=model_tunned, tokenizer=tokenizer)

pipe("veneco")

In [None]:
y_in = pd.read_csv('./../data/colombian_valid_tweets_predictions.csv')
print(len(y_in))

In [None]:
predictions = []

### Checking previous predictions ⚠

In [None]:
y_prev_predicted = pd.read_csv('./../data/colombian_valid_tweets_tone_predictions.csv')
print(len(y_prev_predicted))

In [None]:
#checking previous predictions
with alive_bar(len(y_prev_predicted), force_tty = True) as bar:
  print("Checking tweets already predicted")
  y_prev_predicted.reset_index(drop=True, inplace=True)
  for index,serie in y_prev_predicted.iterrows():
    if y_in.loc[index]['Id'] != y_prev_predicted.loc[index]['Id'] :
      y_prev_predicted.drop(range(index), inplace=True)
      y_prev_predicted.reset_index(drop=True, inplace=True)
      print("Error found on:",index, "deleted, please run again")
      break
    bar()


In [None]:
y_in.drop(range(len(y_prev_predicted)), inplace=True)
predictions = y_prev_predicted.to_dict('records')

### Predict ✔

In [None]:
with alive_bar(len(y_in), force_tty = True) as bar:
  print("Starting to predict")

  for index,serie in y_in.iterrows():
    try:
      tone_scale = pipe(serie['text'])[0]['label']
    except Exception as r:
      tone_scale = "ERROR"
    predictions.append({'Id':serie['Id'], 'text':serie['text'], 'date':serie['date'],
                        'referred_to':serie['referred_to'], 'tone_str':tone_scale})

    if bar.current() % 20000 == 0 and bar.current() != 0: #export every n tweets → checkpoint
      pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_tone_predictions.csv', index=False)
      print("Checkpoint saved, sleeping for 1 minute")
      sleep(60)# sleep for 60 seconds to avoid my pc exploding
      
    bar()
print("Finished the predictions")
pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_tone_predictions.csv', index=False)

# Predictions - negativity model

In [1]:
from transformers import pipeline
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
from alive_progress import alive_bar
import pandas as pd
from time import sleep

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "cardiffnlp/twitter-roberta-base-2021-124m"
model_tunned = AutoModelForSequenceClassification.from_pretrained("test_trainer_negativity/", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
pipe = pipeline("text-classification", model=model_tunned, tokenizer=tokenizer)

pipe("veneco")

[{'label': 'LABEL_2', 'score': 0.8630983233451843}]

In [4]:
labels_dict = {'LABEL_0':-3, 'LABEL_1':-2, 'LABEL_2':-1, 'ERROR':'ERROR'}
tones_dict = {'LABEL_0':'negative', 'LABEL_1':'positive', 'LABEL_2':'neutral', 'ERROR':'ERROR'}

In [5]:
y_in = pd.read_csv('./../data/colombian_valid_tweets_tone_predictions.csv')
print(len(y_in))

1485921


In [6]:
predictions = []

### Checking previous predictions ⚠

In [7]:
y_prev_predicted = pd.read_csv('./../data/colombian_valid_tweets_negativity_predictions.csv')
print(len(y_prev_predicted))

1485921


In [None]:
#checking previous predictions
with alive_bar(len(y_prev_predicted), force_tty = True) as bar:
  print("Checking tweets already predicted")
  y_prev_predicted.reset_index(drop=True, inplace=True)
  for index,serie in y_prev_predicted.iterrows():
    if y_in.loc[index]['Id'] != y_prev_predicted.loc[index]['Id'] :
      y_prev_predicted.drop(range(index), inplace=True)
      y_prev_predicted.reset_index(drop=True, inplace=True)
      print("Error found on:",index, "deleted, please run again")
      break
    bar()


In [8]:
y_in.drop(range(len(y_prev_predicted)), inplace=True)
predictions = y_prev_predicted.to_dict('records')

### Predict ✔

In [9]:
with alive_bar(len(y_in)-1, force_tty = True) as bar:
  print("Starting to predict")

  for index,serie in y_in.iterrows():
    if serie['tone_str'] == 'LABEL_0':
      try:
        tone_scale = labels_dict[pipe(serie['text'])[0]['label']]
      except:
        tone_scale = "ERROR"
    else:
      tone_scale = "not_negative"
      
    predictions.append({'Id':serie['Id'], 'text':serie['text'], 'date':serie['date'],
                          'referred_to':serie['referred_to'], 'tone_str':tones_dict[serie['tone_str']],
                          'negativity':tone_scale})


    if bar.current() % 20000 == 0 and bar.current() != 0: #export every n tweets → checkpoint
      pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_negativity_predictions.csv', index=False)
      print("Checkpoint saved, sleeping for 1 minute")
      sleep(60)# sleep for 60 seconds to avoid my pc exploding
      
    bar()
print("Finished the predictions")
pd.DataFrame.from_dict(predictions).to_csv('./../data/colombian_valid_tweets_negativity_predictions.csv', index=False)

on 0: Starting to predict                                                                                               
on 20000: Checkpoint saved, sleeping for 1 minute                                                                       
on 40000: Checkpoint saved, sleeping for 1 minute                                                                       
on 60000: Checkpoint saved, sleeping for 1 minute                                                                       
on 80000: Checkpoint saved, sleeping for 1 minute                                                                       
on 100000: Checkpoint saved, sleeping for 1 minute                                                                      
|████████████████████████████████████████✗︎ (!) 115537/115536 [100%] in 1:46:30.3 (18.08/s)                              
Finished the predictions
