<a href="https://colab.research.google.com/github/VinaySingh561/LARGE-LANGUAGE-MODELS/blob/main/Fine_tuning_the_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Finetuning for classification

In [15]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [2]:
## Downloading the dataset
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url,zip_path,extracted_path,data_file_path):
  if data_file_path.exists():
    print(f"{data_file_path} already exists. skipping the download and extraction")
    return
  ssl_context = ssl._create_unverified_context()

  with urllib.request.urlopen(url,context=ssl_context) as response:
    with open(zip_path,"wb") as out_file:
      out_file.write(response.read())

  with zipfile.ZipFile(zip_path,"r") as zip_ref:
    zip_ref.extractall(extracted_path)

  ## add .tsv file extension
  original_file_path = Path(extracted_path)/"SMSSpamCollection"
  os.rename(original_file_path, data_file_path)
  print(f"File downloaded and saved as {data_file_path}")


download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [3]:
import pandas as pd
df = pd.read_csv(data_file_path, sep = "\t", header = None, names = ["Label", "text"])

In [4]:
df

Unnamed: 0,Label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
print(df['Label'].value_counts())

Label
ham     4825
spam     747
Name: count, dtype: int64


not equal so we can continue with this or make it equally distributed

In [6]:
def create_balanced_dataset(df):
  num_spam = df[df["Label"]=="spam"].shape[0]

  ham_subset = df[df["Label"]=="ham"].sample(num_spam, random_state = 1334)

  balanced_df = pd.concat([ham_subset,df[df["Label"]=="spam"]])

  return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [7]:
## now map the spam and ham to 0 and 1
balanced_df["Label"] = balanced_df["Label"].map({"ham":0, "spam":1})

In [8]:
balanced_df

Unnamed: 0,Label,text
2433,0,Really dun bluff me leh... U sleep early too. ...
4312,0,It so happens that there r 2waxsto do wat you ...
2657,0,Feel like trying kadeem again? :V
1377,0,Auntie huai juan never pick up her phone
3151,0,Yo! Howz u? girls never rang after india. L
...,...,...
5537,1,Want explicit SEX in 30 secs? Ring 02073162414...
5540,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,1,Had your contract mobile 11 Mnths? Latest Moto...
5566,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [28]:
### spliting the data into 0.7 train, 0.1 val, 0.2 in test

def random_sample(df, train_frac, val_frac):
  ## shuffle the entire dataframe
  df = df.sample(frac = 1, random_state = 123).reset_index(drop = True)

  train_end  = int(len(df)*train_frac)
  val_end = train_end + int(len(df)*val_frac)

  train_df = df[:train_end]
  val_df = df[train_end:val_end]
  test_df = df[val_end:]

  return train_df, val_df, test_df


train_df, validation_df, test_df = random_sample(balanced_df, train_frac=0.7, val_frac=0.1)


In [29]:
print("Training shape",len(train_df))
print("Validation shape" , len(validation_df))
print("Test shape", len(test_df))
print("Original shape", len(df))

Training shape 1045
Validation shape 149
Test shape 300
Original shape 5572


In [30]:
## saving as csv files
train_df.to_csv("train.csv", index = None)
validation_df.to_csv("validation.csv", index = None)
test_df.to_csv("test.csv", index = None)

### Creating data Loaders

In [31]:
import torch
from torch.utils.data import Dataset

class SpamDataset(Dataset):
  def __init__(self, csvfile,tokenizer,max_length = None, pad_token = 50256):
    self.data = pd.read_csv(csvfile)

    self.encoded_text = [
        tokenizer.encode(text) for text in self.data['text']
    ]

    if max_length is None:
      self.max_length = self._longest_encoded_length()
    else:
      self.max_length= max_length
      ## truncate sequences if they are longer than max_length
      self.encoded_text = [
          text[:max_length] for text in self.encoded_text
      ]

    ## padding smaller texts
    self.encoded_text = [
        encoded_text + [pad_token]*(self.max_length - len(encoded_text))
        for encoded_text in self.encoded_text
    ]


  def __getitem__(self, index):
    encoded = self.encoded_text[index]
    label = self.data.iloc[index]["Label"]
    return (
        torch.tensor(encoded, dtype = torch.long),
        torch.tensor(label, dtype = torch.long)

    )

  def __len__(self):
    return len(self.data)


  def _longest_encoded_length(self):
    max_length = 0
    for encoded_text in self.encoded_text:
      encoded_length = len(encoded_text)
      if encoded_length > max_length:
        max_length = encoded_length
    return max_length




In [32]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
train_dataset = SpamDataset(
    csvfile = "train.csv",
    max_length=None,
    tokenizer = tokenizer
)

print(train_dataset.max_length)

257


In [33]:
val_dataset = SpamDataset(
    csvfile = "validation.csv",
    max_length=train_dataset.max_length,
    tokenizer = tokenizer
)

test_dataset = SpamDataset(
    csvfile = "test.csv",
    max_length=train_dataset.max_length,
    tokenizer = tokenizer
)


print(test_dataset.max_length)

257


In [37]:
### creating dataloader
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset = train_dataset,
    batch_size=batch_size,
    shuffle = True,
    num_workers=num_workers,
    drop_last = True
)
val_loader = DataLoader(
    dataset = val_dataset,
    batch_size=batch_size,
    shuffle = False,
    num_workers=num_workers,
    drop_last = True
)
test_loader = DataLoader(
    dataset = test_dataset,
    batch_size=batch_size,
    shuffle = False,
    num_workers=num_workers,
    drop_last = True
)

In [38]:
print("Train Loader : ")

for input_batch, target_batch in train_loader:
  pass

print("Input batch dimensions ", input_batch.shape)
print("Label batch dimensions", target_batch.shape)

Train Loader : 
Input batch dimensions  torch.Size([8, 257])
Label batch dimensions torch.Size([8])


In [39]:
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} test batches")

130 training batches
18 validation batches
37 test batches
