In [20]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
#first we install important libraries
!pip install huggingface_hub transformers datasets gradio pipreqs TextBlob emot xformers
!pip install accelerate>=0.20.1
!pip install transformers[torch] accelerate -U



In [32]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root

In [33]:
# Import libraries
import os
import uuid
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from scipy.special import softmax
import gradio as gr

from google.colab import drive
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import IntervalStrategy
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
from transformers import pipeline
from transformers import TrainingArguments
from transformers import Trainer
from torch import nn




In [34]:
# Disabe W&B
os.environ["WANDB_DISABLED"] = "true"

In [35]:
# Load the dataset and display some values

# Load the CSV file into a DataFrame

url = "https://github.com/Azubi-Africa/Career_Accelerator_P5-NLP/raw/master/zindi_challenge/data/Train.csv"

df = pd.read_csv(url)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [37]:
# Select rows with missing values
df.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [38]:
# Select rows with missing values
df[df.isnull().any(axis=1)]

Unnamed: 0,tweet_id,safe_text,label,agreement
4798,RQMQ0L2A,#lawandorderSVU,,
4799,I cannot believe in this day and age some pare...,1,0.666667,


In [39]:
# Extract complete text from 'safe_text' column
complete_text = df.iloc[4798]['safe_text']
complete_text

'#lawandorderSVU '

In [40]:
# Select row by index and assign values to columns
df.loc[4798, 'label'] = 0
df.loc[4798, 'agreement'] = 0.666667

# Use .iloc[] and .iat[] to select and update safe_text column
df.iloc[4798, df.columns.get_loc('safe_text')] = complete_text


In [41]:
# Generate random UUID string for tweet_id
'''UUIDs are often used in software applications for various purposes such as generating unique IDs for entities,
tracking unique user sessions, or creating unique file names'''
rand_tweet_id = str(uuid.uuid4())

# Select row by index and assign values to columns
row_index = 4799
df.loc[row_index, 'tweet_id'] = rand_tweet_id
df.loc[row_index, 'label'] = 1
df.loc[row_index, 'agreement'] = 0.666667

# Use .iloc[] and .iat[] to select and update safe_text column
df.iloc[row_index, df.columns.get_loc('safe_text')] = df.iloc[row_index, 1]


In [42]:
df[df.duplicated()].sum()

tweet_id     0.0
safe_text    0.0
label        0.0
agreement    0.0
dtype: float64

### Handling Imbalance Data

In [43]:
df['label'].value_counts()

 0.0    4909
 1.0    4054
-1.0    1038
Name: label, dtype: int64

In [44]:
# Find the maximum count among all classes
max_class_count = df['label'].value_counts().max()

# Group the dataframe by 'label'
grouped = df.groupby('label')

# Sample each group to match the max_class_count
balanced_df = grouped.apply(lambda x: x.sample(max_class_count, replace=True)).reset_index(drop=True)

# The 'balanced_df' now contains an equal number of instances for each class
print(balanced_df['label'].value_counts())


-1.0    4909
 0.0    4909
 1.0    4909
Name: label, dtype: int64


In [45]:
# Split the train data => {train, eval}
train, eval = train_test_split(balanced_df, test_size=0.2, random_state=42)

In [46]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11781 entries, 2435 to 7270
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   11781 non-null  object 
 1   safe_text  11781 non-null  object 
 2   label      11781 non-null  float64
 3   agreement  11781 non-null  float64
dtypes: float64(2), object(2)
memory usage: 460.2+ KB


In [47]:
eval.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2946 entries, 13730 to 9863
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   2946 non-null   object 
 1   safe_text  2946 non-null   object 
 2   label      2946 non-null   float64
 3   agreement  2946 non-null   float64
dtypes: float64(2), object(2)
memory usage: 115.1+ KB


In [48]:
# Save splitted subsets

# Define file path

file_path = '/content/drive/MyDrive/NLP/NLP 2/M2'

#"/content/drive/MyDrive/NLP/NLP_2/M2"

train.to_csv(os.path.join(file_path, "train_subset.csv"), index=False)
eval.to_csv(os.path.join(file_path, "eval_subset.csv"), index=False)

In [49]:
# Load the CSV files into a dataset

dataset = load_dataset('csv', data_files={
    'train': '/content/drive/MyDrive/NLP/NLP 2/M2/train_subset.csv',
    'eval': '/content/drive/MyDrive/NLP/NLP 2/M2/eval_subset.csv'
}, encoding='ISO-8859-1')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-19065bca40c5a05f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-19065bca40c5a05f/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [50]:
#use hub_model_id="finetuned-Sentiment-classfication-ROBERTA-model
#use hub_model_id="finetuned-Sentiment-classfication-BERT-model
#use hub_model_id="finetuned-Sentiment-classfication-DISTILBERT-model
#use hub_model_id="finetuned-Sentiment-classfication-XLM-model

In [52]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',                          # Directory where the model checkpoints and evaluation results will be stored
    evaluation_strategy=IntervalStrategy.STEPS,      # Interval for evaluating the model during training (every specified number of steps)
    save_strategy=IntervalStrategy.STEPS,            # Interval for saving the model during training (every specified number of steps)
    save_steps=500,                                  # Number of steps between two saves
    load_best_model_at_end=True,                     # Whether to load the best model at the end of training
    num_train_epochs=4,                              # Number of training epochs
    per_device_train_batch_size=4,                   # Batch size per GPU for training
    per_device_eval_batch_size=4,                    # Batch size per GPU for evaluation
    learning_rate=3e-5,                              # Learning rate
    weight_decay=0.01,                               # Weight decay
    warmup_steps=500,                                # Number of warmup steps
    logging_steps=500,                               # Number of steps between two logs
    fp16=True,                                       # Whether to use 16-bit precision
    gradient_accumulation_steps=4,                  # Number of steps to accumulate gradients before performing an optimizer step
    dataloader_num_workers=2,                        # Number of workers to use for loading data
    push_to_hub=True,                                # Whether to push the model checkpoints to the Hugging Face hub
    hub_model_id="wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model",  # Model ID to use when pushing the model to the Hugging Face hub
)



# Define the early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,                       # Number of epochs with no improvement before stopping training
    early_stopping_threshold=0.01,                   # Minimum improvement in the metric for considering an improvement
)

# Combine the training arguments and the early stopping callback
training_args.callbacks = [early_stopping]


ImportError: ignored

In [None]:
tokenizer_ROBERTA = AutoTokenizer.from_pretrained('roberta-base')
'''
This code instantiates a tokenizer for the BERT (Bidirectional Encoder Representations from Transformers)
pre-trained model with the bert-base-cased configuration.

'''


In [None]:
# Define a function to transform the label values
def transform_labels(label):
    # Extract the label value
    label = label['label']
    # Map the label value to an integer value
    num = 0
    if label == -1: #'Negative'
        num = 0
    elif label == 0: #'Neutral'
        num = 1
    elif label == 1: #'Positive'
        num = 2
    # Return a dictionary with a single key-value pair
    return {'labels': num}

# Define a function to tokenize the text data
def tokenize_data(example):
    # Extract the 'safe_text' value from the input example and tokenize it
    return tokenizer_ROBERTA(example['safe_text'], padding='max_length')

# Apply the transformation functions to the dataset using the 'map' method
# This transforms the label values and tokenizes the text data
dataset_out = dataset.map(transform_labels)

dataset_ROBERTA = dataset_out.map(tokenize_data, batched=True)

# Define a list of column names to remove from the dataset
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']

# Apply the 'transform_labels' function to the dataset to transform the label values
# Also remove the columns specified in 'remove_columns'

dataset_ROBERTA = dataset_ROBERTA.map(transform_labels, remove_columns=remove_columns)

In [None]:
# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model_ROBERTA = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

In [None]:
train_dataset_ROBERTA = dataset_ROBERTA['train'].shuffle(seed=10) #.select(range(40000)) # to select a part


In [None]:
eval_dataset_ROBERTA = dataset_ROBERTA['eval'].shuffle(seed=10)


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    rmse = np.sqrt(np.mean((predictions - labels)**2))
    return {"rmse": rmse}


In [None]:
trainer_ROBERTA = Trainer(
    model=model_ROBERTA,
    args=training_args,
    train_dataset=train_dataset_ROBERTA,
    eval_dataset=eval_dataset_ROBERTA,
    compute_metrics=compute_metrics    # Add this line to define the compute_metrics function
)

In [None]:
trainer_ROBERTA.train()

In [None]:
# Evaluate the model
eval_results = trainer_ROBERTA.evaluate()

# Create a dictionary of the evaluation results
results_dict = {
    "Model": "roberta-base",
    "Loss": eval_results["eval_loss"],
    "RMSE": eval_results["eval_rmse"],
    "Runtime": eval_results["eval_runtime"],
    "Samples Per Second": eval_results["eval_samples_per_second"],
    "Steps Per Second": eval_results["eval_steps_per_second"],
    "Epoch": eval_results["epoch"]
}

# Create a pandas DataFrame from the dictionary
results_df = pd.DataFrame([results_dict])

# Sort the results by "eval_rmse" in ascending order and get the name and state dict of the best model
best_model = results_df.loc[results_df['RMSE'].idxmin()]

print(best_model)


In [None]:
 # Push the final fine-tuned model to the Hugging Face model hub

trainer_ROBERTA.push_to_hub("wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model")

In [None]:
tokenizer_ROBERTA.push_to_hub("wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model")

In [None]:
model_ROBERTA.push_to_hub("wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model")

In [None]:
# Load the tokenizer
tokenizer = tokenizer_ROBERTA.from_pretrained("wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model")

# Load the fine-tuned model
model = pipeline("text-classification", model="wachirachris4/finetuned-Sentiment-classfication-ROBERTA-model", tokenizer=tokenizer)


In [None]:
label_map = {0: "negative", 1: "neutral", 2: "positive"}

# Make predictions on some example text
result = model("I love these covid vaccines.")

# Map the numerical label to the corresponding class name
result[0]["label"] = label_map[int(result[0]["label"].split("_")[1])]

# Print the predicted label and score
print(result)

In [None]:
!pip freeze > /content/drive/MyDrive/NLP/NLP_2/M2/requirements.txt

In [None]:
!ls {file_path}