In [30]:
import pandas as pd

In [73]:


# Read the CSV file with specified encoding
df = pd.read_csv(r"sample_data/questions (1).csv", encoding='latin1')

# Now you can proceed with processing the DataFrame
df["Answers"].iloc[0]

"Diabetes is a chronic metabolic disorder characterized by high blood sugar levels resulting from either insufficient insulin production by the pancreas or the body's inability to effectively use insulin. Insulin is a hormone produced by the pancreas that helps regulate blood sugar levels and allows cells to absorb glucose from the bloodstream to use as energy. Without enough insulin or if cells become resistant to insulin, glucose builds up in the bloodstream, leading to various health complications over time."

In [71]:
import torch
from transformers import BertForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, questions, answers, tokenizer):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        encoded = self.tokenizer(question, answer, return_tensors='pt')

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'start_positions': torch.tensor([encoded['start_positions']], dtype=torch.long),
            'end_positions': torch.tensor([encoded['end_positions']], dtype=torch.long)
        }


# Split data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Define batch size
batch_size = 8

# Define data loaders
train_dataset = MyDataset(questions=train_df['Questions'].tolist(), answers=train_df['Answers'].tolist(), tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MyDataset(questions=val_df['Questions'].tolist(), answers=val_df['Answers'].tolist(), tokenizer=tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize BERT model for question answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning loop
epochs = 3
criterion = torch.nn.CrossEntropyLoss()
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    total_loss = 0
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].squeeze(1).to(device)
        end_positions = batch['end_positions'].squeeze(1).to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits

        # Compute loss
        loss = criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Compute average training loss
    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for val_batch in val_loader:
            input_ids = val_batch['input_ids'].to(device)
            attention_mask = val_batch['attention_mask'].to(device)
            start_positions = val_batch['start_positions'].squeeze(1).to(device)
            end_positions = val_batch['end_positions'].squeeze(1).to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            val_loss = criterion(start_logits, start_positions) + criterion(end_logits, end_positions)

            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)

    print(f'Epoch {epoch + 1}/{epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

# Save the fine-tuned model (optional)
torch.save(model.state_dict(), 'fine_tuned_bert.pth')

# Inference (use the fine-tuned model for question answering)
# You can use the same inference code as before, just load the fine-tuned model instead


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'start_positions'

In [33]:
from transformers import BertForQuestionAnswering

In [None]:
model = BertForQuestionAnswering.from_pretrained('deepset/bert-base-cased-squad2')

In [35]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained('deepset/bert-base-cased-squad2')

In [37]:
tokenizer.encode(df["Questions"].iloc[0], truncation=True, padding=True)

[101, 1327, 1110, 17972, 136, 102]

In [38]:
from transformers import pipeline

In [39]:
diabeticBot = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [74]:
with open('sample_data/context.txt', 'r') as file:
    # Read the entire content of the file
    content = file.read()
    # Print the content
    print(content)




In [79]:
diabeticBot({
    "question": df["Questions"].iloc[0],
    "context": all_answers
})

{'score': 0.6826830506324768,
 'start': 10297,
 'end': 10320,
 'answer': 'an autoimmune condition'}

In [61]:
all_answers = ' '.join(df['Answers'])

# Print or use the concatenated string
print(all_answers)

viduals. What one person may perceive as stressful,
 another may not. For this reason, stress is quite hard
 to measure in real-life situations. Artificial measures
 of accepted stress, such as electric shocks or depriva
tion of sleep, are very hard to apply to day-to-day life.
 However, people who report that they are more
 stressed, regardless of the actual nature of the stress
 itself, are more likely to suffer from diabetes. Further
more, it has recently become apparent that measur
able physical and psychological stress, such as that
 caused by sleep deprivation and social stress, is more
 likely to be associated with the presence of diabetes.
 This may in part explain the difference in the fre
quency of diabetes found in people of similar genetic
 background and measurable physical characteristics
 (body weight, amount of exercise, etc.) in different
 regions and societies. Exactly how perceived stress,
 whether physical, social, or psychological, leads to
 diabetes is not yet und

In [62]:
len(all_answers)

81118

In [77]:
content = content[0:5000]

In [85]:
import nltk
nltk.download('punkt')  # Download the necessary tokenizer data

from nltk.tokenize import sent_tokenize

# Tokenize the string into sentences
data = sent_tokenize(all_answers)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
import torch
from transformers import BertTokenizer, BertModel
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [87]:


# Tokenize and pad preprocessed sentences
tokenize_sentences = [tokenizer.encode(sentence, add_special_tokens=True) for sentence in data]
max_len = max(len(sentence) for sentence in tokenize_sentences)
padded_sentences = [sentence + [tokenizer.pad_token_id] * (max_len - len(sentence)) for sentence in tokenize_sentences]

# Convert tokenized sentences to tensor
input_ids = torch.tensor(padded_sentences)

# Define batch size
batch_size = 1000

# Compute embeddings in batches
num_samples = len(input_ids)
embeddings = []
for start in range(0, num_samples, batch_size):
    end = min(start + batch_size, num_samples)
    batch_input_ids = input_ids[start:end]

    # Create attention mask for the batch
    batch_attention_mask = torch.where(batch_input_ids != tokenizer.pad_token_id, 1, 0)

    # Model inference to get embeddings for the batch
    with torch.no_grad():
        batch_outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        batch_embeddings = batch_outputs.last_hidden_state.mean(dim=1)

    embeddings.append(batch_embeddings)

# Concatenate embeddings from all batches
embeddings = torch.cat(embeddings, dim=0)

In [131]:
from sklearn.metrics.pairwise import cosine_similarity
# Example sentence for similarity comparison
example_sentence = "what are the consequences of diabetes?"



# Tokenize and encode the example sentence
example_encoding = tokenizer.batch_encode_plus(
    [example_sentence],
    padding=True,
    truncation=True,
    return_tensors='pt',
    add_special_tokens=True
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate embeddings for the example sentence
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)

# Compute cosine similarity between the original sentence embedding and the example sentence embedding
similarity_score = cosine_similarity(embeddings, example_sentence_embedding)

# Print the similarity score
print("Cosine Similarity Score:", similarity_score[0][0])

Cosine Similarity Score: 0.51076204


In [134]:
import numpy as np

# Assuming you have already computed `top_indices` which contains the indices of the top 5 most similar sentences
top_indices = np.argsort(similarity_score.squeeze())[::-1][:50]
# Collect the top 5 similar sentences
top_sentences = [data[idx] for idx in top_indices]

# Combine the top sentences into a single context
context = " ".join(top_sentences)

In [135]:
diabeticBot({
    "question": example_sentence,
    "context": context
})

{'score': 0.7298672199249268,
 'start': 14,
 'end': 68,
 'answer': 'increases the risk of developing other health problems'}

In [125]:
context

"Instilling healthy habits early in life, such as promoting a balanced diet, encouraging regular physical activity, and maintaining a healthy weight, can help reduce the risk of type 2 diabetes in children. A healthy lifestyle that includes regular physical activity and balanced nutrition can improve insulin sensitivity, promote weight loss, lower blood sugar levels, reduce the need for medications, and lower the risk of complications associated with diabetes. It's important to continue monitoring blood sugar levels regularly and maintain a healthy lifestyle to sustain the benefits of weight loss and diabetes management."