In [None]:
!pip install -U datasets trl accelerate peft bitsandbytes packaging ninja sentencepiece transformers einops trl huggingface_hub

In [None]:
!pip install tqdm scipy

In [None]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [None]:
# Authenticate against the Hugging Face Hub before any hub access below.
# NOTE(review): interpreter_login() presumably prompts for an access token
# interactively, so this cell needs a live session — confirm for batch runs.
from huggingface_hub import interpreter_login
interpreter_login()


In [None]:
# Load the "train" split of the counsel-chat dataset from the Hugging Face Hub.
dataset = load_dataset("nbertagnolli/counsel-chat", split="train")
# Bare expression: shows the dataset summary as the cell output.
dataset

In [None]:
import pandas as pd
# Materialize the loaded dataset as a pandas DataFrame; `df` is the raw,
# unfiltered frame that the split cell below reads from.
df = pd.DataFrame(dataset)

# Show the first two rows to check which columns are available
df.head(2)

In [None]:

# Keep one row per unique (question, topic) pair so that every question counts
# once when sizing and sampling the hold-out set, and give the columns the
# names used for evaluation.
filtered_df = (
    df[['questionText', 'topic', 'answerText']]
    .drop_duplicates(subset=['questionText', 'topic'])
    .rename(columns={'questionText': 'Context', 'answerText': 'Response'})
)

# Number of unique questions available per topic
topic_counts = filtered_df['topic'].value_counts()

# Target number of test questions per topic (20% of the topic, rounded)
target_test_size_per_topic = (topic_counts * 0.2).round().astype(int)

# For each topic, randomly select the calculated number of questions for the
# test set. Pieces are collected in a list and concatenated once instead of
# growing a DataFrame inside the loop.
test_parts = []
for topic, target_size in target_test_size_per_topic.items():
    topic_rows = filtered_df[filtered_df['topic'] == topic]
    test_parts.append(
        topic_rows.sample(n=min(target_size, topic_counts[topic]), random_state=42)
    )
test_set_balanced = (
    pd.concat(test_parts) if test_parts else pd.DataFrame(columns=filtered_df.columns)
)

# The training set is drawn from the full frame `df` (all answers per question,
# original column names such as questionText/answerText). A question that was
# sampled into the test set is removed together with ALL of its answers:
# dropping only the sampled row index would leave the other answers to the same
# question in the training data and leak held-out questions into training.
held_out_questions = set(test_set_balanced['Context'].dropna())
is_held_out = (
    df['questionText'].isin(held_out_questions)
    | df.index.isin(test_set_balanced.index)  # also covers rows with a missing question
)
# Rows without a topic are in neither split (value_counts skips missing topics).
train_set_balanced = df[df['topic'].isin(topic_counts.index) & ~is_held_out]

# Invariant: no held-out question text appears in the training split.
assert not train_set_balanced['questionText'].isin(held_out_questions).any()

print("train data shape",train_set_balanced.shape )
print("test data shape",test_set_balanced.shape )



In [None]:

# Persist both splits; later cells reload them from these CSV files.
test_set_balanced.to_csv('counsel_chat_test_balanced.csv', index=False)
train_set_balanced.to_csv('counsel_chat_train_balanced.csv', index=False)

# Check the final distribution of topics in the balanced test set
balanced_test_distribution = test_set_balanced['topic'].value_counts()
print(balanced_test_distribution)

# Preview the first held-out rows (the label now matches what is printed:
# this is .head(), not the shape).
print("test data head: \n",test_set_balanced.head() )

In [None]:
# Reload the training split written by the split cell.
train_df = pd.read_csv("counsel_chat_train_balanced.csv")

# Keep only the question/answer pair under the names format_row expects.
# The chain returns a new frame rather than a slice of train_df, so adding a
# column to `df` in the next cell does not raise SettingWithCopyWarning.
# Missing values are handled before formatting, because an f-string would
# otherwise render them as the literal text "nan":
#   - rows without an answer carry no training signal and are dropped;
#   - a missing question becomes '' (the inference cell fills Context the same way).
# NOTE: this rebinds `df`, which earlier held the raw dataset frame; re-running
# the split cell after this one requires re-creating the raw frame first.
df = (
    train_df[['questionText', 'answerText']]
    .rename(columns={'questionText': 'Context', 'answerText': 'Response'})
    .dropna(subset=['Response'])
    .fillna({'Context': ''})
)

df.head()

In [None]:

# Function to transform the row into desired format
def format_row(row):
    """Render one Context/Response pair as a single instruction-style string.

    Args:
        row: Mapping-like object (for example a DataFrame row) that provides
            'Context' and 'Response' entries.

    Returns:
        str: The text "<s>[INST] {Context} [/INST] {Response}</s>".
    """
    # NOTE(review): the literal <s>/</s> markers are part of the text itself;
    # the tokenizer configured later in this notebook also sets add_eos_token,
    # so sequences may end up with duplicated special tokens — confirm.
    return f"<s>[INST] {row['Context']} [/INST] {row['Response']}</s>"

# Format every row and store the result directly in a 'Text' column of a new
# frame. Using assign() leaves `df` untouched, so there is no in-place write on
# a frame that may be a slice of train_df (the source of
# SettingWithCopyWarning) and no temporary 'Formatted' column to rename.
new_df = df.assign(Text=df.apply(format_row, axis=1))

# Preview only; the full frame is written to CSV in a later cell.
new_df.head()



In [None]:
## 
from sklearn.model_selection import train_test_split
import pandas as pd

# No random split happens here: the hold-out set is the balanced CSV written
# earlier, and every formatted row in new_df is used for training.

# Shuffle the formatted training rows with a fixed seed for reproducibility.
train_df = new_df.sample(frac=1, random_state=42)

# Reload the balanced hold-out set from disk.
test_df = pd.read_csv("counsel_chat_test_balanced.csv")

# Report the test shape first, then the train shape, one per line.
print(test_df.shape, train_df.shape, sep="\n")

In [None]:

# Keep only the formatted 'Text' column and write it to the CSV file that the
# next cell loads as the training dataset.
train_df = train_df[['Text']]
train_df.to_csv('train_formatted_data.csv', index=False)
# Read the file back and show two rows as a quick check of what was written.
final_df = pd.read_csv("train_formatted_data.csv")
final_df.head(2)

In [None]:

# Load the formatted training CSV as a `datasets` object for the trainer.
training_dataset = load_dataset("csv", data_files="train_formatted_data.csv", split="train")
# Optional spot check of a single example (left disabled):
#print(dataset["Text"][400])
training_dataset

In [None]:
# Run the garbage collector and ask PyTorch to empty its CUDA cache before the
# model is loaded in the next cell.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
base_model = "/kaggle/input/mistral/pytorch/7b-v0.1-hf/1"

# Quantization settings for loading the base model (Mistral 7B): 4-bit NF4
# weights with bfloat16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Free as much memory as possible before the weights are loaded.
import gc
gc.collect()
torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Pad with <unk> instead of reusing EOS. The default language-modeling collator
# masks every label equal to pad_token_id, so with pad == EOS the end-of-sequence
# token is never a training target and the fine-tuned model does not learn when
# to stop generating. Fall back to EOS only if the tokenizer has no <unk>.
tokenizer.pad_token = (
    tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
)
# Append EOS to every tokenized training example.
tokenizer.add_eos_token = True

gc.collect()
torch.cuda.empty_cache()

# Prepare the quantized model for k-bit training, then attach LoRA adapters to
# the attention projections and the MLP gate projection.
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)

model = get_peft_model(model, peft_config)



In [None]:

# Training hyperparameters, grouped by concern.
training_arguments = TrainingArguments(
    # Output, checkpointing and logging
    output_dir="./results",
    save_steps=1000,
    logging_steps=1000,
    report_to="wandb",
    # Length of the run and batching
    num_train_epochs=2,
    max_steps=-1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # NOTE(review): warmup_ratio is set together with a plain "constant"
    # scheduler; the warmup may have no effect unless "constant_with_warmup"
    # was intended — confirm.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    # Mixed precision is switched off in the trainer itself
    fp16=False,
    bf16=False,
)


# Supervised fine-tuning trainer over the 'Text' column, one example per
# sequence (no packing).
# NOTE(review): `model` was already wrapped by get_peft_model earlier, so
# passing peft_config again looks redundant — confirm the installed trl
# version does not wrap it a second time.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=training_dataset,
    dataset_text_field="Text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=None,  # 690
    packing=False,
)

In [None]:
# Run the garbage collector and empty the CUDA cache once more right before
# training starts.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train() 

In [None]:

import torch

# Save the model via save_pretrained into the "Mistral7b-finetuned" directory.
# NOTE(review): `model` is the PEFT-wrapped model returned by get_peft_model,
# so this presumably stores the LoRA adapter rather than merged full weights —
# confirm before relying on it as a standalone checkpoint.
model.save_pretrained("Mistral7b-finetuned")

# Additionally dump the model's state_dict to a single PyTorch .pth file
torch.save(model.state_dict(), "Mistral7b-finetuned.pth")

In [None]:
from tqdm import tqdm
from transformers import pipeline

n = 0  # number of test rows processed so far; the first 5 are printed
# Placeholder for the generated responses
generated_responses = []

# Switch the fine-tuned model to inference mode (the KV cache was disabled for training).
model.config.use_cache = True
model.eval()

# Load a fresh tokenizer for generation. Unlike the training tokenizer it must
# NOT append EOS: with add_eos_token=True the prompt itself would end in </s>
# whenever the pipeline tokenizes with special tokens enabled, i.e. the model
# would be asked to continue a sequence that has already ended.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'left'  # decoder-only generation continues from the right edge
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = False
# max_length caps the total sequence (prompt plus generated tokens) at 1024.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer,
                max_length=1024)

test_df = pd.read_csv("counsel_chat_test_balanced.csv")
# Missing questions become empty prompts instead of NaN (which cannot be sliced).
test_df['Context'] = test_df['Context'].fillna('')

# total= lets tqdm show real progress; iterrows() alone has no known length.
for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Same [INST] ... [/INST] wrapper as in training; the question is cut to
    # 940 characters to leave room for the answer within max_length.
    prompt = f"[INST] {row['Context'][:940]} [/INST]"
    reference = row['Response']

    result = pipe(prompt)
    # NOTE(review): the pipeline output is stored as-is; it presumably still
    # contains the prompt in front of the answer (return_full_text default) —
    # confirm that downstream scoring strips it.
    generated_text = result[0]['generated_text']

    generated_responses.append({
            'Context': row['Context'],
            'topic': row['topic'],
            'Response': generated_text
        })

    # Print the first five examples for a quick qualitative check.
    if n < 5:
        print("Context:",row['Context'])
        print("generated_text:",generated_text)
        print("reference:",reference)

    n +=1

# Create a DataFrame from the generated responses
generated_df = pd.DataFrame(generated_responses)

# Write the DataFrame to an Excel file
generated_df.to_excel('Mistral7B_preds.xlsx', index=False)
