In [1]:
import nltk
import torch
import numpy as np
import pandas as pd
import transformers
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data=pd.read_csv("./train_data_chatbot.csv")

In [3]:
data.head()

Unnamed: 0,short_question,short_answer,tags,label
0,can an antibiotic through an iv give you a ras...,yes it can even after you have finished the pr...,['rash' 'antibiotic'],1.0
1,can you test positive from having the hep b va...,test positive for what if you had a hep b vacc...,['hepatitis b'],1.0
2,what are the dietary restrictions for celiac d...,omitting gluten from the diet is the key to co...,['celiac disease'],1.0
3,can i transmit genital warts seventeen years a...,famotidine pepcid products is in a drug class ...,['wart'],-1.0
4,is all vitamin d the same,hi this means you do not have hepatitis b and ...,['vitamin d'],-1.0


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
questions = data["short_question"].astype(str).tolist()
answers = data["short_answer"].astype(str).tolist()

In [7]:
def clean_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return tokens

In [8]:
cleaned_data_q = [clean_text(text) for text in questions]
# Flatten the token lists to get a single list of tokens
all_tokens = [token for sublist in cleaned_data_q for token in sublist]
cleaned_data_q = [" ".join(tokens) for tokens in cleaned_data_q]

In [9]:
cleaned_data_a = [clean_text(text) for text in answers]
# Flatten the token lists to get a single list of tokens
all_tokens = [token for sublist in cleaned_data_a for token in sublist]
cleaned_data_a = [" ".join(tokens) for tokens in cleaned_data_a]

In [10]:
X = cleaned_data_q
y = cleaned_data_a

In [11]:
from sklearn.model_selection import train_test_split
symp_train, symp_test, y_train, y_test = train_test_split(
   X, y, test_size = 0.20, random_state = 4)

In [12]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X)

In [13]:
def tokenize_function(examples):
    return tokenizer(examples["question"], examples["answer"], truncation=True, padding="max_length", max_length=128)

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained("./Mistral7B")
tokenizer.pad_token = tokenizer.eos_token  # or use `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`
data_dict = {"question": cleaned_data_q, "answer": cleaned_data_a}
dataset = Dataset.from_dict(data_dict)
tokenized_dataset = dataset.map(tokenize_function, batched=True)




Map:   0%|          | 0/47603 [00:00<?, ? examples/s]

In [15]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:


# Load the tokenizer and model from the local directory
model = AutoModelForCausalLM.from_pretrained("./Mistral7B" , quantization_config=bnb_config, device_map="auto")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)

In [None]:
from datetime import datetime
project = "journal-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name

training_args = TrainingArguments(
    output_dir=output_dir,
    warmup_steps=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_steps=200,
    learning_rate=2.5e-3,
    bf16=True,
    optim="paged_adamw_8bit",
    logging_steps=25,              # When to start reporting loss
    logging_dir="./logs",        # Directory for storing logs
    save_strategy="steps",       # Save the model checkpoint every logging step
    save_steps=25,                # Save checkpoints every 50 steps
    eval_strategy="steps", # Evaluate the model every logging step
    eval_steps=25,               # Evaluate and save checkpoints every 50 steps
    do_eval=True,                # Perform evaluation at the end of training
    report_to="none",                   # Disable wandb logging
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
)


In [19]:
from transformers import Trainer
from datetime import datetime
from accelerate import Accelerator

In [20]:

accelerator = Accelerator()
model = accelerator.prepare_model(model)


In [21]:

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


max_steps is given, it will override any value given in num_train_epochs


In [22]:
trainer.train()



Step,Training Loss,Validation Loss
25,83.762,88.469246
50,77.3384,65.406128
75,58.3259,56.152134
100,50.9888,48.491367
125,41.5093,45.079681
150,43.2457,41.103291
175,42.8755,38.507347
200,32.9363,34.945564




TrainOutput(global_step=200, training_loss=53.872742614746095, metrics={'train_runtime': 60618.1076, 'train_samples_per_second': 0.007, 'train_steps_per_second': 0.003, 'total_flos': 2211493340774400.0, 'train_loss': 53.872742614746095, 'epoch': 0.008402655239055541})

In [None]:
model.save_pretrained("./mistral-finetuned")
tokenizer.save_pretrained("./mistral-finetuned")



('./mistral-finetunedi\\tokenizer_config.json',
 './mistral-finetunedi\\special_tokens_map.json',
 './mistral-finetunedi\\tokenizer.json')