In [1]:
# .\venv\Scripts\activate
# pip list: lists every installed Python package together with its version
# | (pipe): feeds the output of the command on its left to the command on its right as input
# grep searches text for a pattern; findstr is the Windows equivalent of grep
# pip list | grep tweetnlp therefore lists all installed Python packages and keeps only the lines that mention "tweetnlp"

# pip install --upgrade google-cloud-storage
# pip install tweetnlp
# pip list | findstr tweetnlp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Python does not load installed packages automatically (doing so would waste memory and slow the script down), so every package the script uses must be imported explicitly.
import tweetnlp

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Topic classification: tag a text with the topics its content relates to.

## Multi-label variant (one text may receive several topics)

# Alternative spelling: `model = tweetnlp.TopicClassification()`.
model = tweetnlp.load_model("topic_classification")

# `model.predict` can be used in place of `model.topic`.
esports_headline = "SK Telecom and Comcast Spectacor Announce Global Esports Joint Venture."
model.topic(esports_headline)




{'label': ['business_&_entrepreneurs', 'science_&_technology']}

In [4]:
# With return_probability=True the result also carries a probability per topic.
# For the multi-label model each probability is a sigmoid over that topic's own
# positive/negative decision, so the values are independent and need not sum to 1.
roster_news = "T1 removes Polt from League head coach position, promotes Bengi to interim ahead of Worlds 2022."
model.topic(roster_news, return_probability=True)

{'label': ['sports'],
 'probability': {'arts_&_culture': 0.003120152512565255,
  'business_&_entrepreneurs': 0.004956244956701994,
  'celebrity_&_pop_culture': 0.02309464104473591,
  'diaries_&_daily_life': 0.01539651770144701,
  'family': 0.004469731356948614,
  'fashion_&_style': 0.002894326113164425,
  'film_tv_&_video': 0.013332835398614407,
  'fitness_&_health': 0.007021516561508179,
  'food_&_dining': 0.0032630572095513344,
  'gaming': 0.014031173661351204,
  'learning_&_educational': 0.0028917910531163216,
  'music': 0.005384225398302078,
  'news_&_social_concern': 0.07663474231958389,
  'other_hobbies': 0.003516028169542551,
  'relationships': 0.003337266854941845,
  'science_&_technology': 0.0038626682944595814,
  'sports': 0.9817806482315063,
  'travel_&_adventure': 0.0034862724132835865,
  'youth_&_student_life': 0.0031163133680820465}}

In [7]:
## Single-label model (the prediction is a single topic label)

# Alternative spelling: `model = tweetnlp.TopicClassification(multi_label=False)`.
model = tweetnlp.load_model("topic_classification", multi_label=False)

artist_bio = "Jacob Collier is a Grammy-awarded English artist from London."
model.topic(artist_bio)

{'label': 'pop_culture'}

In [9]:
# NOTE: for the single-label model the probabilities are a softmax over the labels.
lucky_draw_post = "Want to meet T1 in person this year? Here’s how you can enter the special lucky draw."
model.topic(lucky_draw_post, return_probability=True)

{'label': 'sports_&_gaming',
 'probability': {'arts_&_culture': 0.00021540890156757087,
  'business_&_entrepreneurs': 7.853414717828855e-05,
  'pop_culture': 0.001070138649083674,
  'daily_life': 0.0002443432167638093,
  'sports_&_gaming': 0.9983493089675903,
  'science_&_technology': 4.225688462611288e-05}}

In [11]:
# Load the topic-classification datasets (multi-label and single-label variants),
# each returned together with a second value stored here as a label2id mapping.
# NOTE(review): nothing in these two calls looks wrong. The failure recorded for this
# cell is a ReadTimeout from huggingface.co, i.e. the download timed out. Presumably
# re-running on a stable connection succeeds; TODO confirm.
dataset_multi_label, label2id_multi_label = tweetnlp.load_dataset('topic_classification')
dataset_single_label, label2id_single_label = tweetnlp.load_dataset('topic_classification', multi_label=False)

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=None)"), '(Request ID: 142ba3c4-7832-4419-94b2-8ce7603e4565)')

In [16]:
# Sentiment analysis

## English model

# Alternative spelling: `model = tweetnlp.Sentiment()`.
model = tweetnlp.load_model("sentiment")

# `model.predict` can be used in place of `model.sentiment`.
award_news = "Faker lost last year, but finally reclaims this esteemed title at The Game Awards 2023"
model.sentiment(award_news)



Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'label': 'neutral'}

In [18]:
# Load the sentiment dataset together with its label-name -> integer-id mapping.
dataset, label2id = tweetnlp.load_dataset('sentiment')




In [22]:
# Display the dataset summary: its splits with their features and row counts.
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [24]:
# Display the mapping from sentiment label names to integer ids.
label2id

{'negative': 0, 'neutral': 1, 'positive': 2}

In [30]:
# Emotion detection

# Alternative spelling: `model = tweetnlp.Emotion()`.
model = tweetnlp.load_model("emotion")

# `model.predict` can be used in place of `model.emotion`.
t1_record = "T1 lost domestic titles three times but won Worlds five times."
model.emotion(t1_record)

{'label': 'optimism'}

In [31]:
# Fine-tuning Language Model with TweetNLP

import logging
import tweetnlp
from pprint import pprint

# Configure root logging: INFO level, timestamped records with a padded level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Example texts for model prediction.
# Each entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single list item.
sample = [
    "How many more days until opening day? 😩",
    "All two of them taste like ass.",
    "If you wanna look like a badass, have drama on social media",
    "Whoever just unfollowed me you a bitch",
    "I love swimming for the same reason I love meditating...the feeling of weightlessness.",
    "Beautiful sunset last night from the pontoon @ Tupper Lake, New York",
    'Jacob Collier is a Grammy-awarded English artist from London.',
]

# Language model to fine-tune and the task whose dataset and trainer are loaded.
language_model = 'cardiffnlp/twitter-roberta-base-2021-124m'
task = "irony"

# Load the task's dataset together with its label-to-id mapping.
dataset, label_to_id = tweetnlp.load_dataset(task)

# Load the trainer class for the task.
# NOTE(review): the recorded run of this cell failed on the next line with
# "AttributeError: module 'tweetnlp' has no attribute 'load_trainer'".
# Presumably the installed tweetnlp release does not export `load_trainer`.
# TODO: confirm against the installed version and the upstream documentation.
trainer_class = tweetnlp.load_trainer(task)

# Build the trainer from the model name, the dataset and the split names.
# output_dir is presumably where checkpoints are written; verify against the trainer's docs.
trainer = trainer_class(
    language_model=language_model,
    dataset=dataset,
    label_to_id=label_to_id,
    max_length=128,
    split_train='train',
    split_test='test',
    output_dir='model_ckpt/test'
)

Downloading data: 100%|██████████| 183k/183k [00:03<00:00, 60.2kB/s]
Downloading data: 100%|██████████| 54.0k/54.0k [00:02<00:00, 26.3kB/s]
Downloading data: 100%|██████████| 61.1k/61.1k [00:02<00:00, 25.1kB/s]
Generating train split: 100%|██████████| 2862/2862 [00:00<00:00, 42045.73 examples/s]
Generating test split: 100%|██████████| 784/784 [00:00<00:00, 165976.90 examples/s]
Generating validation split: 100%|██████████| 955/955 [00:00<00:00, 258857.46 examples/s]


AttributeError: module 'tweetnlp' has no attribute 'load_trainer'