In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"codextagc","key":"2502ec6c46b25dd95d23971561295364"}'}

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d andrewmvd/trip-advisor-hotel-reviews
!unzip trip-advisor-hotel-reviews.zip

Dataset URL: https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading trip-advisor-hotel-reviews.zip to /content
  0% 0.00/5.14M [00:00<?, ?B/s]
100% 5.14M/5.14M [00:00<00:00, 647MB/s]
Archive:  trip-advisor-hotel-reviews.zip
  inflating: tripadvisor_hotel_reviews.csv  


In [None]:
import pandas as pd
df = pd.read_csv('/content/tripadvisor_hotel_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [None]:
df.shape

(20491, 2)

In [None]:
df.isna().sum()

Unnamed: 0,0
Review,0
Rating,0


In [None]:
df.duplicated().sum()

np.int64(0)

## **Data Preprocessing**

In [5]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
def text_cleaning_for_sentiment(text):
  text = re.sub('[^a-zA-Z]',' ',text)
  text = text.lower()
  text = text.split()
  text = [ps.stem(word) for word in text if not word in stop_words]
  text = ' '.join(text)
  return text

In [None]:
df['Review_cleaned_sen'] = df['Review'].apply(text_cleaning_for_sentiment)
df.sample(1)

Unnamed: 0,Review,Rating,Review_cleaned_sen
7620,ideal base la hotel properties consider centra...,5,ideal base la hotel properti consid central ho...


In [7]:
def clean_text_for_summary(text):
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
df['Review_cleaned_sum'] = df['Review'].apply(clean_text_for_summary)
df.head()

Unnamed: 0,Review,Rating,Review_cleaned_sen,Review_cleaned_sum
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,nice hotel expensive parking got good deal sta...
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...,ok nothing special charge diamond member hilto...
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experi hotel monaco seattl good hote...,nice rooms not 4* experience hotel monaco seat...
3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...,"unique, great stay, wonderful time hotel monac..."
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,"great stay great stay, went seahawk game aweso..."


In [8]:
def transform_rating(rating):
  if rating > 3:
    return 'POSITIVE'
  else:
    return 'NEGATIVE'

In [None]:
df['Sentiment'] = df['Rating'].apply(transform_rating)
df.head(1)

Unnamed: 0,Review,Rating,Review_cleaned_sen,Review_cleaned_sum,Sentiment
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,nice hotel expensive parking got good deal sta...,POSITIVE


## **Machine Learning Algorithms**

In [None]:
from sklearn.model_selection import train_test_split
X_sen = df['Review_cleaned_sen']
y_sen = df['Sentiment']

X_train_sen, X_test_sen, y_train_sen, y_test_sen = \
train_test_split(X_sen, y_sen, test_size=0.2, random_state=42, stratify=y_sen)

X_train_sen.shape, X_test_sen.shape, y_train_sen.shape, y_test_sen.shape

((16392,), (4099,), (16392,), (4099,))

In [None]:
vectorizer = TfidfVectorizer()
X_train_sen = vectorizer.fit_transform(X_train_sen)
X_test_sen = vectorizer.transform(X_test_sen)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_sen = le.fit_transform(y_train_sen)
y_test_sen = le.transform(y_test_sen)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_sen, y_train_sen)
rf_pred = rf_model.predict(X_test_sen)

print(accuracy_score(y_test_sen, rf_pred))
print(classification_report(y_test_sen, rf_pred))
print(confusion_matrix(y_test_sen, rf_pred))

0.8402049280312271
              precision    recall  f1-score   support

    NEGATIVE       0.96      0.41      0.58      1080
    POSITIVE       0.83      0.99      0.90      3019

    accuracy                           0.84      4099
   macro avg       0.89      0.70      0.74      4099
weighted avg       0.86      0.84      0.82      4099

[[ 446  634]
 [  21 2998]]


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_sen = le.fit_transform(y_train_sen)
y_test_sen = le.transform(y_test_sen)

xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train_sen, y_train_sen)
xgb_pred = xgb_model.predict(X_test_sen)

print(accuracy_score(y_test_sen, xgb_pred))
print(classification_report(y_test_sen, xgb_pred))
print(confusion_matrix(y_test_sen, xgb_pred))

0.8728958282507929
              precision    recall  f1-score   support

           0       0.81      0.68      0.74      1080
           1       0.89      0.94      0.92      3019

    accuracy                           0.87      4099
   macro avg       0.85      0.81      0.83      4099
weighted avg       0.87      0.87      0.87      4099

[[ 732  348]
 [ 173 2846]]


In [None]:
!pip install -q catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations=100, random_state=42, verbose=0)
cat_model.fit(X_train_sen, y_train_sen)
cat_pred = cat_model.predict(X_test_sen)

print(accuracy_score(y_test_sen, cat_pred))
print(classification_report(y_test_sen, cat_pred))

0.8753354476701635
              precision    recall  f1-score   support

    NEGATIVE       0.83      0.67      0.74      1080
    POSITIVE       0.89      0.95      0.92      3019

    accuracy                           0.88      4099
   macro avg       0.86      0.81      0.83      4099
weighted avg       0.87      0.88      0.87      4099



## **Evaluating by Pre-trained Transformer**

In [None]:
from transformers import pipeline

sentiment_clf = pipeline("sentiment-analysis",
                         model="nlptown/bert-base-multilingual-uncased-sentiment",
                         truncation=True,
                        padding=True)

print(sentiment_clf("The hotel staff were amazing and the room was spotless."))

Device set to use cuda:0


[{'label': '5 stars', 'score': 0.49145591259002686}]


In [None]:
sentiment_clf.model.config.id2label

{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}

In [None]:
df['sentiment_pred'] = df['Review_cleaned_sen'].apply(lambda x: sentiment_clf(x)[0]['label'])
df.head()

Unnamed: 0,Review,Rating,Review_cleaned_sen,Review_cleaned_sum,Sentiment,sentiment_pred
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,nice hotel expensive parking got good deal sta...,POSITIVE,3 stars
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...,ok nothing special charge diamond member hilto...,NEGATIVE,3 stars
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experi hotel monaco seattl good hote...,nice rooms not 4* experience hotel monaco seat...,NEGATIVE,3 stars
3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...,"unique, great stay, wonderful time hotel monac...",POSITIVE,5 stars
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,"great stay great stay, went seahawk game aweso...",POSITIVE,5 stars


In [None]:
df.sentiment_pred = df.sentiment_pred.apply(lambda x: int(x.split()[0]))
df.head()

Unnamed: 0,Review,Rating,Review_cleaned_sen,Review_cleaned_sum,Sentiment,sentiment_pred
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,nice hotel expensive parking got good deal sta...,POSITIVE,3
1,ok nothing special charge diamond member hilto...,2,ok noth special charg diamond member hilton de...,ok nothing special charge diamond member hilto...,NEGATIVE,3
2,nice rooms not 4* experience hotel monaco seat...,3,nice room experi hotel monaco seattl good hote...,nice rooms not 4* experience hotel monaco seat...,NEGATIVE,3
3,"unique, great stay, wonderful time hotel monac...",5,uniqu great stay wonder time hotel monaco loca...,"unique, great stay, wonderful time hotel monac...",POSITIVE,5
4,"great stay great stay, went seahawk game aweso...",5,great stay great stay went seahawk game awesom...,"great stay great stay, went seahawk game aweso...",POSITIVE,5


In [None]:
df['sentiment_pred'] = df['sentiment_pred'].apply(lambda x: 'POSITIVE' if x > 3 else 'NEGATIVE')
df.head(1)

Unnamed: 0,Review,Rating,Review_cleaned_sen,Review_cleaned_sum,Sentiment,sentiment_pred
0,nice hotel expensive parking got good deal sta...,4,nice hotel expens park got good deal stay hote...,nice hotel expensive parking got good deal sta...,POSITIVE,NEGATIVE


In [None]:
df.sentiment_pred.value_counts(), df.Sentiment.value_counts()

(sentiment_pred
 POSITIVE    12220
 NEGATIVE     8271
 Name: count, dtype: int64,
 Sentiment
 POSITIVE    15093
 NEGATIVE     5398
 Name: count, dtype: int64)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
y_true = df['Sentiment']
y_pred = df['sentiment_pred']
print(accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

0.7557464252598702
              precision    recall  f1-score   support

    NEGATIVE       0.52      0.80      0.63      5398
    POSITIVE       0.91      0.74      0.82     15093

    accuracy                           0.76     20491
   macro avg       0.72      0.77      0.73     20491
weighted avg       0.81      0.76      0.77     20491

[[ 4332  1066]
 [ 3939 11154]]


## **Fine Tuning with Meta-Llama**

In [9]:
import pandas as pd
tuning_df = pd.read_csv('/content/tripadvisor_hotel_reviews.csv')

In [10]:
tuning_df['Sentiment'] = tuning_df['Rating'].apply(transform_rating)
tuning_df.head(1)

Unnamed: 0,Review,Rating,Sentiment
0,nice hotel expensive parking got good deal sta...,4,POSITIVE


In [11]:
tuning_df['Cleaned_text'] = tuning_df['Review'].apply(text_cleaning_for_sentiment)
tuning_df.head(1)

Unnamed: 0,Review,Rating,Sentiment,Cleaned_text
0,nice hotel expensive parking got good deal sta...,4,POSITIVE,nice hotel expens park got good deal stay hote...


In [12]:
!pip install -q transformers datasets accelerate peft trl

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/511.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
df_inst = pd.DataFrame({
    "instruction": "Classify the sentiment of the hotel review.",
    "input": tuning_df["Cleaned_text"],
    "output": tuning_df["Sentiment"]
})

In [14]:
df_inst.head(1)

Unnamed: 0,instruction,input,output
0,Classify the sentiment of the hotel review.,nice hotel expens park got good deal stay hote...,POSITIVE


In [15]:
from datasets import Dataset

dataset = Dataset.from_pandas(df_inst)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset['train'], dataset['test']

In [16]:
!pip install -q huggingface_hub

from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
!pip install -q -U bitsandbytes
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

print("Model loaded with 4-bit quantization!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded with 4-bit quantization!


In [18]:
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

In [19]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

In [None]:
token_lengths = tuning_df["Cleaned_text"].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))
max_length = token_lengths.max()
print(f"Maximum token length: {max_length}")

Maximum token length: 2643


In [None]:
token_lengths.quantile(0.95)

np.float64(366.0)

In [None]:
train_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 16392
})

In [20]:
model.print_trainable_parameters()

trainable params: 16,777,216 || all params: 6,755,192,832 || trainable%: 0.2484


In [21]:
def tokenize(batch):
    sources = [
        f"Instruction: {instr}\nInput: {inp}\nOutput:"
        for instr, inp in zip(batch["instruction"], batch["input"])
    ]
    targets = batch["output"]

    texts = [src + " " + tgt for src, tgt in zip(sources, targets)]

    model_inputs = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512
    )

    model_inputs["labels"] = model_inputs["input_ids"].copy()

    return model_inputs


In [22]:
tokenized_train = train_dataset.map(tokenize, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval = eval_dataset.map(tokenize, batched=True, remove_columns=eval_dataset.column_names)

Map:   0%|          | 0/16392 [00:00<?, ? examples/s]

Map:   0%|          | 0/4099 [00:00<?, ? examples/s]

In [26]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./llama-sentiment-finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=500,
    eval_steps=500,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

In [28]:
small_train_dataset = tokenized_train.select(range(500))
small_eval_dataset = tokenized_eval.select(range(100))

In [29]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=63, training_loss=4.016976099165659, metrics={'train_runtime': 808.0073, 'train_samples_per_second': 0.619, 'train_steps_per_second': 0.078, 'total_flos': 1.0174649597952e+16, 'train_loss': 4.016976099165659, 'epoch': 1.0})

In [30]:
trainer.save_model()
tokenizer.save_pretrained("./llama-sentiment-finetuned")

('./llama-sentiment-finetuned/tokenizer_config.json',
 './llama-sentiment-finetuned/special_tokens_map.json',
 './llama-sentiment-finetuned/chat_template.jinja',
 './llama-sentiment-finetuned/tokenizer.model',
 './llama-sentiment-finetuned/added_tokens.json',
 './llama-sentiment-finetuned/tokenizer.json')

In [32]:
def predict_sentiment(review_text):
    prompt = f"### Instruction:\nClassify the sentiment of this hotel review.\n\n### Input:\n{review_text}\n\n### Response:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = full_response.split("### Response:")[-1].strip()

    return response



In [33]:
def quick_test():
    test_samples = [
        "The hotel was amazing with great service",
        "Terrible experience with dirty rooms",
        "Average hotel nothing special"
    ]

    for sample in test_samples:
        result = predict_sentiment(sample)
        print(f"Review: {sample}")
        print(f"Prediction: {result}")
        print("-" * 50)

quick_test()

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incomp

Review: The hotel was amazing with great service
Prediction: POS Inst Inst Inst Inst Inst Inst Inst Inst Inst
--------------------------------------------------


Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in 

Review: Terrible experience with dirty rooms
Prediction: N Inst Inst Inst Inst Inst Inst Inst Inst Inst
--------------------------------------------------


Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in LlamaDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in 

Review: Average hotel nothing special
Prediction: N Inst Inst Inst Inst Inst Inst Inst Inst Inst
--------------------------------------------------


## **Fine Tuning with Distilbert**

In [17]:
!pip install -q transformers datasets evaluate scikit-learn accelerate

import numpy as np, pandas as pd, torch, evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer, set_seed)

set_seed(42)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [18]:
df = pd.read_csv("/content/tripadvisor_hotel_reviews.csv")
df = df.dropna(subset=["Review","Rating"]).reset_index(drop=True)

def map_sent(r):
  return 1 if int(r) >= 3 else 0

df['Review_cleaned'] = df.Review.apply(text_cleaning_for_sentiment)

df = df.rename(columns={"Review_cleaned":"text"})
df["label"] = df["Rating"].apply(map_sent)

df = df[["text","label"]]

In [19]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
})

In [20]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
MAX_LEN = 512

def tok(b):
  return tokenizer(b["text"], truncation=True, max_length=MAX_LEN)

ds_tok = ds.map(tok, batched=True, remove_columns=["text"])
collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16392 [00:00<?, ? examples/s]

Map:   0%|          | 0/4099 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def compute_metrics(p):
    logits, labels = p
    preds = np.argmax(logits, axis=1)
    pr, re, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": pr, "recall": re, "f1": f1}

In [22]:
args = TrainingArguments(
    output_dir="./distilbert-trip-sentiment",
    per_device_train_batch_size=16 if torch.cuda.is_available() else 8,
    per_device_eval_batch_size=16 if torch.cuda.is_available() else 8,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    report_to="none",
    logging_steps=50,
)

trainer = Trainer(
    model=model, args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tokenizer, data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1581,0.19531,0.918273,0.930713,0.918273,0.922143
2,0.1639,0.185001,0.938034,0.938698,0.938034,0.938343
3,0.0935,0.213766,0.938766,0.937114,0.938766,0.937639


TrainOutput(global_step=3075, training_loss=0.16519501065820213, metrics={'train_runtime': 578.2578, 'train_samples_per_second': 85.042, 'train_steps_per_second': 5.318, 'total_flos': 4881918323415456.0, 'train_loss': 0.16519501065820213, 'epoch': 3.0})

In [23]:
preds = trainer.predict(ds_tok["validation"])
y_true = preds.label_ids
y_pred = preds.predictions.argmax(axis=1)
print("Eval:", compute_metrics((preds.predictions, y_true)))
print("\nReport:\n", classification_report(y_true, y_pred, target_names=["NEGATIVE","POSITIVE"]))
print("\nConfusion matrix:\n", confusion_matrix(y_true, y_pred))


Eval: {'accuracy': 0.9380336667479873, 'precision': 0.9386982691151181, 'recall': 0.9380336667479873, 'f1': 0.9383425255696928}

Report:
               precision    recall  f1-score   support

    NEGATIVE       0.80      0.81      0.80       643
    POSITIVE       0.97      0.96      0.96      3456

    accuracy                           0.94      4099
   macro avg       0.88      0.89      0.88      4099
weighted avg       0.94      0.94      0.94      4099


Confusion matrix:
 [[ 524  119]
 [ 135 3321]]


In [24]:
save_dir = "./distilbert-trip-sentiment-best"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('./distilbert-trip-sentiment-best/tokenizer_config.json',
 './distilbert-trip-sentiment-best/special_tokens_map.json',
 './distilbert-trip-sentiment-best/vocab.txt',
 './distilbert-trip-sentiment-best/added_tokens.json',
 './distilbert-trip-sentiment-best/tokenizer.json')

In [25]:
import torch.nn.functional as F
id2label = {0:"NEGATIVE",1:"POSITIVE"}
def predict_sentiment(texts):
    enc = tokenizer(texts, truncation=True, max_length=MAX_LEN, padding=True, return_tensors="pt").to(model.device)
    with torch.no_grad():
        probs = F.softmax(model(**enc).logits, dim=-1).cpu().numpy()
    idx = probs.argmax(1)
    return [{"label": id2label[i], "score": float(probs[j,i])} for j,i in enumerate(idx)]


In [26]:
print(predict_sentiment([
    "Amazing stay, spotless rooms and warm staff.",
    "AC broken, rude reception, would not return."
]))

[{'label': 'POSITIVE', 'score': 0.9877350926399231}, {'label': 'NEGATIVE', 'score': 0.9212159514427185}]
