<a href="https://colab.research.google.com/github/OziomaEunice/Sentiment_GPT/blob/develop2/LLaMA_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LLaMA For Sentiment Analysis**

In [None]:
! nvcc --version

## **Huggingface Login**

In [None]:
!huggingface-cli login

## **Installing Bitsandbytes, Transformers, Accelerate, and Peft Libraries for LLaMA**

In [None]:
# install bitsandbytes library for its low usage of memory and prevent the system from crashing.
# install other libraries


#! pip install --upgrade bitsandbytes
#!pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"
#! pip install -i https://test.pypi.org/simple/ bitsandbytes --upgrade
! pip uninstall bitsandbytes
! pip install bitsandbytes==0.38.1
! pip install -q -U "torch==2.0.0" tensorboard
! pip install git+https://github.com/TimDettmers/bitsandbytes.git
! pip install --upgrade transformers accelerate
! pip install dataset trl
! pip install -q -U git+https://github.com/huggingface/peft.git

In [None]:
! pip show bitsandbytes

In [None]:
! pip uninstall bitsandbytes
! pip install bitsandbytes==0.38.1

## **Importing Libraries**

In [6]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!apt-get update
!apt-get install cuda-toolkit-11-8
import os
os.environ["LD_LIBRARY_PATH"] += ":" + "/usr/local/cuda-11/lib64"
os.environ["LD_LIBRARY_PATH"] += ":" + "/usr/local/cuda-11.8/lib64"

In [9]:
# import os
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-11.8/lib64:' + os.environ.get('LD_LIBRARY_PATH', '')

In [10]:
# !rm -r /usr/local/cuda
# !rm -r /usr/local/cuda-12.2
# ! rm -r /usr/local/cuda-12
# !rm -r /usr/local/cuda-11

In [11]:
# ! make CUDA_VERSION=DETECTED_CUDA_VERSION
# ! make CUDA_VERSION=118

In [None]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import torch
import torch.nn as nn
import transformers
import bitsandbytes as bnb
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer, setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [21]:
import nltk

In [None]:
# Download the stopwords dataset
nltk.download('stopwords')

# Download wordnet dataset
nltk.download('wordnet')

# Download punkt dataset
nltk.download('punkt')

# Get the set of English stop words
stop_words = set(stopwords.words('english'))

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [23]:
print(f"pytorch version {torch.__version__}")

pytorch version 2.0.0+cu117


In [24]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


## **Loading IMDb Dataset**

In [28]:
# since dataset is imported to my Google Colab (which will remain in here temporary)
# read excel file
df = pd.read_excel('/content/Tweets.xlsx') # for Twitter dataset
df2 = pd.read_excel('/content/IMDB_Dataset.xlsx') # for IMDb Movie Review dataset
# df = pd.read_excel('/content/Tweets.xlsx', names = ["text", "airline_sentiment"]) # for Twitter dataset
# df2 = pd.read_excel('/content/IMDB_Dataset.xlsx', names = ["review", "sentiment"]) # for IMDb Movie Review dataset

## **Preprocessing Dataset**

In [29]:
# drop columns that are not needed for processing data
# In this case, for the Twitter dataset
df = df.drop(columns=['tweet_id', 'airline_sentiment_gold', 'negativereason', 'negativereason_confidence', 'negativereason_gold', 'retweet_count', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone'])

### **Cleaning Dataset**

In [30]:
# clean (preprocess) the Twitter dataset
def cleanData1(text, min_word_length = 3):
  text = text.lower()
  text = re.sub(r'@[A-Za-z0-9]+', "", text) # this informs Python the the mentions in text must be substituted with an empty string
  text = re.sub(r'#', "", text) # removing #
  text = re.sub(r'RT[\s]+', "", text) # removing retweets
  text = re.sub(r'https?:\/\/\S+', "", text) # removing links
  text = ' '.join(word for word in text.split() if len(word) >= min_word_length and word not in stop_words) # Remove short words

  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  for p in punctuations:
      text = text.replace(p,'') #Removing punctuations

  # Lemmatize the words
  words = nltk.word_tokenize(text)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  text = ' '.join(lemmatized_words)

  return text

In [31]:
# clean (preprocess) the IMDb Movie Review dataset
def cleanData2(review, min_word_length = 3):
  review = review.lower()
  review = re.sub(r'@[A-Za-z0-9]+', "", review) # this informs Python the the mentions in text must be substituted with an empty string
  review = re.sub(r'#', "", review) # removing #
  review = re.sub(r'RT[\s]+', "", review) # removing retweets
  review = re.sub(r'https?:\/\/\S+', "", review) # removing links
  review = ' '.join(word for word in review.split() if len(word) >= min_word_length and word not in stop_words) # Remove short words

  punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
  for p in punctuations:
      review = review.replace(p,'') #Removing punctuations

  # Lemmatize the words
  words = nltk.word_tokenize(review)
  lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
  review= ' '.join(lemmatized_words)

  return review

In [32]:
# df['text'] = df['text'].apply(cleanData)

# Apply the cleanData function to all text columns in the dataframe
df = df.applymap(lambda x: cleanData1(x) if isinstance(x, str) else x)

In [33]:
df2 = df2.applymap(lambda x: cleanData2(x) if isinstance(x, str) else x)

## **Splitting into Training and Testing**

In [34]:
# split the dataset into training and testing sets,
# with 80% for training and 20% for testing

x_train = list()
x_test = list()

for sentiment in ["positive", "negative", "neutral"]:
    train, test = train_test_split(df[df.airline_sentiment==sentiment],
                                   train_size = 0.8,
                                   test_size = 0.2,
                                   random_state = 42)

    x_train.append(train)
    x_test.append(test)

In [35]:
# shuffle the training data in a replicable order => random_state=10
x_train = pd.concat(x_train).sample(frac=1, random_state=10)
x_test = pd.concat(x_test)

x_train = x_train.reset_index(drop=True)

In [36]:
# generate prompt for LLaMA
def generate_prompt(data_point, text_column, sentiment_column):
    return f"""
            Analyze the sentiment of the reviews enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text_column"]}] = {data_point["sentiment_column"]}
            """.strip()

def generate_test_prompt(data_point, text_column):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text_column"]}] = """.strip()

In [39]:
# For Twitter
x_train = pd.DataFrame(x_train.apply(lambda x: generate_prompt(x, "text", "airline_sentiment"), axis=1),
                       columns=["text"])

# For testing Twitter & IMDb
y_true = x_test.sentiment
x_test = pd.DataFrame(x_test.apply(lambda x: generate_test_prompt(x, "text" if "text" in x else "review"), axis=1),
                      columns=["text"])

KeyError: 'text_column'

In [None]:
# wrap the train data by the class from Hugging Face (https://huggingface.co/docs/datasets/index)
train_data = Dataset.from_pandas(x_train)

### **Creating a function to evaluate the results from the fine-tuned sentiment model**

In [None]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 1, 'neutral': 0, 'negative': -1}
    def map_func(x):
        return mapping.get(x, 2)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    accuracy_perc = accuracy * 100

    print(f"Accuracy: {accuracy_perc:.2f}%")
    print('---------------------------\n')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        accuracy_perc = accuracy * 100
        print(f'Accuracy for label {label}: {accuracy:.2f}%')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[-1, 0, 1])
    print('\nConfusion Matrix:')
    print(conf_matrix)

## **Downloading LLaMA model**

In [None]:
# use standard model and tokeniser from the huggingface transformer

model_name = "meta-llama/Llama-2-7b-hf" # llama model (using 7b parameter)

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)

### **Non Fine-tuning**

In [None]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(x_test))):
        prompt = x_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 1,
                        temperature = 0.0,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        else:
            y_pred.append("neutral")

    return y_pred

In [None]:
y_pred = predict(test, model, tokenizer)

In [None]:
evaluate(y_true, y_pred)