In [None]:
import os
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
from fine_tune import model_performance, fine_tune_llm_model,ConversationDataset


In [None]:
# Pick the compute device for the model and batches.
# Apple MPS is checked first (same result as before on Apple Silicon), then
# CUDA so the notebook also uses a GPU on non-Apple machines, then the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:

# Checkpoint used for both the classifier and its tokeniser.
# Alternative that was tried: "distilbert-base-uncased".
model_name = "bert-base-uncased"

# Batching / sequence-length settings.
batch_size = 16
max_length = 128  # can be increased up to 512 if needed

# Sequence-classification head with six output classes.
# NOTE(review): 6 is presumably the number of distinct `issue_area` values;
# confirm it matches the label mapping built later in the notebook.
model_pretrained = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
)
model_pretrained = model_pretrained.to(device)

model_tokeniser = AutoTokenizer.from_pretrained(model_name)



In [None]:
# Folder holding the raw training data. It defaults to the original local
# location, but can be redirected with the LLM_FINE_DATA_DIR environment
# variable so the notebook is not tied to one machine's directory layout.
DATA_DIR = Path(
    os.environ.get(
        "LLM_FINE_DATA_DIR",
        "/Users/ritkumar17/Desktop/R&D_Project/LLM_fine/data",
    )
)

# Read only the two columns this notebook uses: the target and the raw text.
df = pd.read_parquet(
    DATA_DIR / "train-00000-of-00001-a5a7c6e4bb30b016.parquet",
    columns=["issue_area", "conversation"],
)

# EDA


In [None]:
# Print the raw text of the first conversation to see its layout.
for first_conversation in df["conversation"].head(1):
    print(first_conversation)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Remove the "Agent:" / "Customer:" speaker prefix from the start of every
# line of a conversation. re.MULTILINE is required: without it `^` matches
# only at the very beginning of the string, so only the first prefix of a
# multi-turn conversation would be stripped.
speaker_prefix = re.compile(r"^(?:Agent:|Customer:)\s*", flags=re.MULTILINE)
df["conversation"] = df["conversation"].apply(lambda text: speaker_prefix.sub("", text))

In [None]:
# Preview the first five rows of the working frame.
df.head(5)

In [None]:
# Share of each issue area, expressed as a percentage of all rows.
label_counts = df["issue_area"].value_counts(normalize=True).mul(100)
colors = ["#B0E0E6", "#87CEEB", "#ADD8E6", "#AFEEEE", "#BFEFFF", "#E0FFFF"]

# Pie chart of the class distribution, with one slice per issue area.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct="%1.1f%%",
    colors=colors,
    startangle=140,
)
ax.set_title("Issue Percentage Distribution")
plt.show()


In [None]:
def _count_tokens(text):
    """Return how many tokens `model_tokeniser` splits `text` into."""
    return len(model_tokeniser.tokenize(text))


# Per-conversation token count, stored for the length analysis below.
df["token_length"] = df["conversation"].map(_count_tokens)

In [None]:
# Histogram (with KDE overlay) of the token counts computed above.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(
    df["token_length"],
    bins=30,
    kde=True,
    color="blue",
    edgecolor="black",
    ax=ax,
)

ax.set_xlabel("Token Length", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.set_title("Token Length Distribution", fontsize=14)
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

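#### Key Observations
*   The distribution peaks at around 450 tokens (the most common length).
*   There is a long tail, with some very long samples (~1200 tokens).
*   There are almost no short texts (<100 tokens).
*   Implication: most conversations are longer than the configured `max_length = 128`, and the long tail exceeds the 512-token maximum noted in the configuration cell, so inputs will be truncated wherever such a limit is applied.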

In [None]:
# Encode labels: give every issue area an integer class id, and keep the
# reverse lookup for turning predictions back into names.
# Ids follow the order in which the areas first appear in df["issue_area"].
labels = df["issue_area"].unique().tolist()
label2id = dict(zip(labels, range(len(labels))))
# The ids are category indices for the classifier, not ordinal numbers.
id2label = dict(enumerate(labels))
df["label"] = df["issue_area"].map(label2id)

In [None]:
# List the distinct issue areas present in the data.
pd.unique(df["issue_area"])

In [None]:
# Keep only the text, its integer label and the readable label name.
keep_columns = ["conversation", "label", "issue_area"]
df_data = df[keep_columns]

In [None]:
# Persist the cleaned frame to the current working directory.
df_data.to_parquet(path="clean_dataset.parquet")

In [None]:
# Display the cleaned frame (conversation, label, issue_area) as the cell output.
df_data

In [None]:
# Step 1: hold out 20% of the data as the test set, stratified by label.
trainval_x, test_x, trainval_y, test_y = train_test_split(
    df_data["conversation"],
    df_data["label"],
    test_size=0.2,
    random_state=22,
    stratify=df_data["label"],
)

# Step 2: take 10% of the remaining rows as the validation set, again
# stratified, leaving the rest for training.
train_x, val_x, train_y, val_y = train_test_split(
    trainval_x,
    trainval_y,
    test_size=0.1,
    random_state=22,
    stratify=trainval_y,
)

In [None]:
# Wrap each split in the project's ConversationDataset (imported from
# fine_tune), always with the same tokeniser.
train_dataset, val_dataset, test_dataset = (
    ConversationDataset(texts, targets, model_tokeniser)
    for texts, targets in (
        (train_x, train_y),
        (val_x, val_y),
        (test_x, test_y),
    )
)

In [None]:
# Load Data into DataLoader
# All three loaders use the shared `batch_size` setting from the configuration
# cell instead of repeating the literal value, so changing it there takes
# effect here. Only the training split is shuffled; validation and test keep
# their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Run the project's fine-tuning routine (from fine_tune) for 10 epochs with
# the training and validation loaders; the returned object is what gets
# evaluated afterwards.
fine_tune_model = fine_tune_llm_model(
    num_epochs=10,
    train_loader=train_loader,
    val_loader=val_loader,
    model=model_pretrained,
    device=device,
)

In [None]:
# Evaluate the fine-tuned model on the held-out test loader; `id2label` is
# passed so class ids can be reported by name.
logit = model_performance(
    fine_tune_model,
    test_loader,
    device,
    id2label,
)

In [None]:
#fine_tune_model.save_pretrained("LLM_fine_Tune_2")