In [1]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm





In [2]:


# Load JSON data
with open('new.json') as file:
    data = json.load(file)

In [3]:
# Extract patterns and corresponding intents
patterns = []
intents = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        patterns.append(pattern)
        intents.append(intent['tag'])

In [4]:
# Encode intents
label_encoder = LabelEncoder()
encoded_intents = label_encoder.fit_transform(intents)


In [5]:
# Split data into training and validation sets
train_patterns, val_patterns, train_intents, val_intents = train_test_split(patterns, encoded_intents, test_size=0.2, random_state=42)


In [6]:
# Tokenize patterns
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_patterns, truncation=True, padding=True)
val_encodings = tokenizer(val_patterns, truncation=True, padding=True)


In [7]:



# Create dataset
class IntentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:

train_dataset = IntentDataset(train_encodings, train_intents)
val_dataset = IntentDataset(val_encodings, val_intents)


In [9]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(intents)))

# Fine-tune the model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optim = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 20
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    epoch_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()
        loss.backward()
        optim.step()
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_loader):.4f}")

# Evaluate the model
model.eval()
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        total_correct += (predictions == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f'Validation accuracy: {accuracy:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20


Training Epoch 1/20:  36%|███▌      | 10/28 [00:29<00:49,  2.77s/it]

In [11]:

model.save_pretrained("fine_tuned_bert_2")
tokenizer.save_pretrained("fine_tuned_bert_2")


('fine_tuned_bert_2\\tokenizer_config.json',
 'fine_tuned_bert_2\\special_tokens_map.json',
 'fine_tuned_bert_2\\vocab.txt',
 'fine_tuned_bert_2\\added_tokens.json')

In [35]:
model = BertForSequenceClassification.from_pretrained('fine_tuned_bert', num_labels=len(set(intents)))
tokenizer = BertTokenizer.from_pretrained('fine_tuned_bert')

In [36]:
def ask_question(question):
    encoded_question = tokenizer(question, truncation=True, padding=True, return_tensors="pt")
    outputs = model(**encoded_question)
    predicted_label = torch.argmax(outputs.logits[0]).item()
    predicted_intent = label_encoder.inverse_transform([predicted_label])[0]

    # Find the intent from the JSON data
    for intent in data['intents']:
        if intent['tag'] == predicted_intent:
            # Randomly select a response from the list of responses
            response = intent['responses']
            return response
        else:
    
            return "Sorry, I couldn't understand your question."

In [41]:
msg = "Contact Details"
response =ask_question(msg)
print(response)

Sorry, I couldn't understand your question.


In [17]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm


with open('new_data.json', 'r') as file:
    data = json.load(file)

# Extract text from JSON data
training_text = ""
for intent in data['intents']:
    for pattern in intent['patterns']:
        training_text += pattern + " "

# Ensure the text is converted to lowercase
training_text = training_text.lower()



# Fine-tune GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

max_seq_length = 1024

# Check if training_text exceeds max_seq_length
if len(training_text) > max_seq_length:
    training_text = training_text[:max_seq_length]

# Convert the text to lowercase
training_text = training_text.lower()

# Tokenize the text
input_ids = tokenizer.encode(training_text, return_tensors='pt')

# Truncate input_ids if it exceeds the model's maximum sequence length
if input_ids.size()[1] > tokenizer.model_max_length:
    input_ids = input_ids[:, :tokenizer.model_max_length]

# Create attention mask
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

# Model training loop
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(20):
    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{20}, Loss: {loss.item()}")

Epoch 1/20, Loss: 5.53101921081543
Epoch 2/20, Loss: 5.060510635375977
Epoch 3/20, Loss: 4.627333641052246
Epoch 4/20, Loss: 4.280425071716309
Epoch 5/20, Loss: 4.033732891082764
Epoch 6/20, Loss: 3.6480495929718018
Epoch 7/20, Loss: 3.3868675231933594
Epoch 8/20, Loss: 3.086242437362671
Epoch 9/20, Loss: 2.949183940887451
Epoch 10/20, Loss: 2.6545653343200684
Epoch 11/20, Loss: 2.5983669757843018
Epoch 12/20, Loss: 2.2550389766693115
Epoch 13/20, Loss: 2.0575857162475586
Epoch 14/20, Loss: 1.8628835678100586
Epoch 15/20, Loss: 1.613502025604248
Epoch 16/20, Loss: 1.5146889686584473
Epoch 17/20, Loss: 1.30256986618042
Epoch 18/20, Loss: 1.1838409900665283
Epoch 19/20, Loss: 0.9983706474304199
Epoch 20/20, Loss: 0.879201352596283


In [18]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


('fine_tuned_gpt2\\tokenizer_config.json',
 'fine_tuned_gpt2\\special_tokens_map.json',
 'fine_tuned_gpt2\\vocab.json',
 'fine_tuned_gpt2\\merges.txt',
 'fine_tuned_gpt2\\added_tokens.json')

In [19]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# from flask import Flask, render_template, request

# app = Flask(__name__)

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("fine_tuned_gpt2")
model = GPT2LMHeadModel.from_pretrained("fine_tuned_gpt2", pad_token_id=tokenizer.eos_token_id)

# Function to generate response using GPT-2
def generate_response(prompt, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=max_length, truncation=True)
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# @app.route("/")
# def index():
#     return render_template('chat.html')

# @app.route("/get", methods=["GET", "POST"])
# def chat():
#     msg = request.form["msg"]
#     response = generate_response(msg)
#     return response

# if __name__ == '__main__':
#     app.run()


In [24]:
msg = "Property in New delhi"
response =generate_response(msg)
print(response)

Property in New delhi" and "name" in the same file.

In the example above, the "name" is the name of the property in the property description.

In the example above, the "name" is the
