In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## HuggingFace


In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_api_token = user_secrets.get_secret("huggingfaceAPI")

In [None]:
from huggingface_hub import login
login(token=hf_api_token)

## Installation

In [None]:
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install accelerate==0.26.1
!pip install datasets==2.16.1
!pip install GPUtil
!pip install transformers==4.38.0
!pip install peft==0.8.2
!pip install bitsandbytes==0.42.0
!pip install accelerate==0.26.1
!pip install datasets==2.16.1
!pip install evaluate

pip install emoji --upgrade

In [4]:
import re
import emoji
import string
import torch
from transformers Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from datasets import Dataset

# Dataset

In [5]:
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Set the maximum column width to a large number for all columns
pd.set_option('display.max_colwidth', None)
pd.reset_option('display.max_colwidth')

# Dataset preprocessing

1. Removing emojis, urls, usernames, duplicates, NaN values, hashtags
2. Lowercasing characters
3. Joining hashtags and keywords nad text together

In [None]:
# Preproceanyssing function
def preprocess_text(df):
    # Make dataset lowercase
    df["text"] = df["text"].str.lower()
    
    # Create new table to pass their hashtags
    df["hashtags"] = df["text"].apply(extract_hashtags)
    
    # Remove hashtags
    df["text"] = df["text"].apply(remove_hashtags)
    
    # Remove url links
    df["text"] = df["text"].apply(remove_url_username)
    
    # Remove emojis
    df["text"] = df["text"].apply(remove_emojis)
    
    # Remove special characters
    df['text'] = df['text'].apply(remove_special_characters)
    
    # Remove duplicates
    df = df.drop_duplicates(subset=['text'], keep='first')
    
    df["combined_text"] = df.apply(append_hashtags_text, axis=1)
    
    # Remove NaN values from keywords
    df['keyword'] = data['keyword'].fillna('')
    
    # Concatenate combined_text and keyword
    df['final_text'] = np.where(
        df['keyword'].str.strip() != '',
        "Keyword: " + df['keyword'] + " " + df['combined_text'],
        df['combined_text']
    )
    
    return df


# Extract hashtags
def extract_hashtags(text):
    return re.findall(r"#(\w+)", text)

# Function to remove hashtags from text
def remove_hashtags(text):
    return re.sub(r"#(\w+)", "", text).strip()

def remove_url_username(text):
    url_pattern = r"http[s]?://\S+"
    return re.sub(url_pattern, "", text).strip()

# Remove emojis from text
def remove_emojis(text):
    return emoji.replace_emoji(text, "").strip()

# Remove special characters
def remove_special_characters(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Append hashtags to text
def append_hashtags_text(row):
    hashtags = ", ".join(row['hashtags'])  # Join list with spaces
    text = row['text'].strip()
    if hashtags:
        return f"Hashtags: {hashtags}. Text: {row['text']}"
    else:
        return f"Text: {text}"


data = preprocess_text(data)
test_data = preprocess_text(test_data)

In [11]:
# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Display the size of each dataset
print(f"Training data size: {train_data.shape[0]}")
print(f"Validation data size: {val_data.shape[0]}")

Training data size: 5496
Validation data size: 1375


## DATA CHECK

In [12]:
# Drop uneccessary columns and keep only this
train_data = train_data[['final_text', 'target']].dropna()
val_data = val_data[['final_text', 'target']].dropna()

In [13]:
# Rename columns to "label", then we will pass it LoRA
train_data = train_data.rename(columns={'target': "label"})
val_data = val_data.rename(columns={'target': "label"})

In [19]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          3263 non-null   int64 
 1   final_text  3263 non-null   object
dtypes: int64(1), object(1)
memory usage: 51.1+ KB


# Tokenization

In [40]:
MODEL_ID = "google/gemma-2b-it"
BATCH_SIZE = 8
EPOCHS = 4

In [None]:
# Setup tokenizer

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_api_token)

print(tokenizer.padding_side, tokenizer.pad_token)

In [22]:
# Keep data as a dataset for data preprocessing in LLM
from datasets import Dataset, DatasetDict

train_data = Dataset.from_pandas(train_data)

val_data = Dataset.from_pandas(val_data)

dataset = DatasetDict({
    'train': train_data,
    'test': val_data
})

In [23]:
# Divide data to the train and test and keep in Dataset

tokenized_dataset = {}

for split in dataset.keys():
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["final_text"], truncation = True), batched = True)

tokenized_dataset["train"], tokenized_dataset["test"]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1375 [00:00<?, ? examples/s]

(Dataset({
     features: ['final_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 5496
 }),
 Dataset({
     features: ['final_text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
     num_rows: 1375
 }))

# Gemma 2B

## Testing data

In [24]:
# Label the data. LLM will output answers using this labels
id2label = {0: "NOT HAZARDOUS", 1: "HAZARDOUS"}
label2id = {"NOT HAZARDOUS": 0, "HAZARDOUS": 1}

### Loading model to the GPU

In [None]:
# Load the model

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels = 2, # sentiment classification: true or false
    id2label = id2label,
    label2id = label2id, 
    load_in_8bit = True, # load 8-bit quantized model
    token=hf_api_token # use huggingface API to load the model)

print(model.config.pad_token_id)

In [26]:
print(model)

GemmaForSequenceClassification(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear8bitLt(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear8bitLt(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear8bitLt(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_laye

In [None]:
# Vanilla model to LoRA model

from peft import prepare_model_for_int8_training

model = prepare_model_for_int8_training(model)

model

In [28]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    r = 64,
    lora_alpha = 32,
    lora_dropout = 0.1,
    task_type = TaskType.SEQ_CLS,
    target_modules = "all-linear"
)

lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, inference_mode=False, r=64, target_modules='all-linear', lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [None]:
lora_model = get_peft_model(model, lora_config)

In [30]:
# We will fine-tune model on 3 perecent of weights
lora_model.print_trainable_parameters()

trainable params: 78,581,888 || all params: 2,584,889,600 || trainable%: 3.040048132036277


## Train the model

In [32]:
# Early stopping technqiue for training our model

from transformers import EarlyStoppingCallback

early_stop = EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=.0)

In [43]:
#import evaluate
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

# We will compute the metric of accuracy

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    return {"accuracy": (predictions == labels).mean()}

# This will train our model
trainer = Trainer(
    model = lora_model,
    args = TrainingArguments(
        output_dir = "./data/",
        learning_rate= 2e-5,
        per_device_train_batch_size = BATCH_SIZE,
        per_device_eval_batch_size = BATCH_SIZE,
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        num_train_epochs = EPOCHS,
        weight_decay = 0.01,
        load_best_model_at_end = True,
        logging_steps = 10,
        report_to = "none"
    ),
    
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = DataCollatorWithPadding(tokenizer = tokenizer),
    compute_metrics = compute_metrics,
    #callbacks=[early_stop]  # early stopping callback
)

In [44]:
print("Train dataset size:", len(tokenized_dataset["train"]))
print("Test dataset size:", len(tokenized_dataset["test"]))

Train dataset size: 5496
Test dataset size: 1375


In [45]:
import torch

torch.cuda.empty_cache()

In [36]:
print("Evaluating the model before training!")
trainer.evaluate()

Evaluating the model before training!




{'eval_loss': 0.7840155959129333,
 'eval_accuracy': 0.6116363636363636,
 'eval_f1': 0.31185567010309273,
 'eval_precision': 0.599009900990099,
 'eval_recall': 0.21080139372822299,
 'eval_runtime': 108.9325,
 'eval_samples_per_second': 12.622,
 'eval_steps_per_second': 1.579}

In [46]:
print("Training the model")
trainer.train()

Training the model




Epoch,Training Loss,Validation Loss,Accuracy
1,0.5986,0.512927,0.821818
2,0.334,0.801839,0.812364




TrainOutput(global_step=2748, training_loss=0.4880185938037639, metrics={'train_runtime': 3282.9595, 'train_samples_per_second': 8.37, 'train_steps_per_second': 2.093, 'total_flos': 4836396803328000.0, 'train_loss': 0.4880185938037639, 'epoch': 2.0})

In [47]:
print("Evaluating the trained model")
trainer.evaluate()

Evaluating the trained model




{'eval_loss': 0.5117854475975037,
 'eval_accuracy': 0.8247272727272728,
 'eval_runtime': 144.9292,
 'eval_samples_per_second': 9.487,
 'eval_steps_per_second': 2.374,
 'epoch': 2.0}

In [48]:
print("Saving the model")
lora_model.save_pretrained('fine-tuned-model')

Saving the model




# Making predictions

In [None]:
from transformers import pipeline

clf = pipeline("text-classification", lora_model, tokenizer = MODEL_ID)

In [63]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             3263 non-null   int64 
 1   combined_text  3263 non-null   object
 2   label          3263 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 76.6+ KB


In [54]:
test_data = test_data[['id', 'combined_text']].dropna()

In [66]:
test_data.head()

Unnamed: 0,id,combined_text,label
0,0,Text: just happened a terrible car crash,0
1,2,Hashtags: earthquake. Text: heard about is di...,0
2,3,Text: there is a forest fire at spot pond gees...,0
3,9,"Hashtags: spokane, wildfires. Text: apocalypse...",0
4,11,Text: typhoon soudelor kills 28 in china and t...,0


In [62]:
test_data['label'] = 0

In [78]:
from tqdm import tqdm

predictions = []

print("Making prediction on test dataset...")

for text in tqdm(test_data['combined_text'].values):
    prediction = clf(text)
    predicted_label = prediction[0]['label']
    
    #print(predicted_label)
    
    # Convert the string label to an integer using the label2id dictionary
    prediction_int = label2id.get(predicted_label, None)  # Use None or a default value if label is missing

    # Append the integer prediction to the list
    predictions.append(prediction_int)
    
# Add the predictions to the DataFrame
#test_data['label'] = predictions

Making prediction on test dataset...


100%|██████████| 3263/3263 [12:45<00:00,  4.26it/s]


In [80]:
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

sample_submission['target'] = predictions
sample_submission.to_csv(f'submission.csv', index=False)