In [None]:
!pip install datasets -q

In [None]:
import json
import pprint
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from datasets import load_dataset

import nltk
from nltk import corpus
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fetch the NLTK resources used by this notebook before anything reads them.
# 'punkt_tab' is presumably the tokenizer data behind word_tokenize — TODO confirm.
nltk.download('punkt_tab')
nltk.download('stopwords')
# English stopword list; must be read after the 'stopwords' download above.
# NOTE(review): stop_words is not referenced in any of the cells visible here.
stop_words = corpus.stopwords.words('english')

In [None]:
# Load the "pqa_artificial" configuration of the PubMedQA dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_artificial")
# Bare last expression: let the notebook display the loaded object
dataset

In [None]:
# Look at the 'train' split on its own (every later cell reads from it)
dataset['train']

## Length of Contexts

In [None]:
# Pull the 'pubid' column of the train split and peek at its first entry
pubids = dataset['train']['pubid']
pubids[0]

In [None]:
# Pull the 'question' column; these strings feed the frequency, n-gram and
# next-word cells further down. Show the first one as a sample.
questions = dataset['train']['question']
questions[0]

In [None]:
# Extract the 'context' column from the train split and show its first record
contexts = dataset['train']['context']
contexts[0]

In [None]:
# Print the first context record as JSON with 4-space indentation for readability
print(json.dumps(contexts[0], indent=4))

In [None]:
# Take a single context record and list the fields it carries
context = contexts[0]
context.keys()

In [None]:
# Bind one context record explicitly so this cell runs on a fresh kernel,
# then compare the sizes of its three fields
context = contexts[0]
len(context['contexts']), len(context['labels']), len(context['meshes'])

In [None]:
# Pull the 'long_answer' column and peek at its first entry
long_answers = dataset['train']['long_answer']
long_answers[0]

In [None]:
# Pull the 'final_decision' column and peek at its first entry
final_decisions = dataset['train']['final_decision']
final_decisions[0]

## Word Frequency

In [None]:
def word_frequency(texts):
    """Count whitespace-separated tokens across a collection of strings.

    Tokens are taken exactly as `str.split()` yields them: no lowercasing
    and no punctuation stripping.

    Args:
        texts: Iterable of strings.

    Returns:
        A `Counter` mapping each token to its number of occurrences.
    """
    tally = Counter()
    for entry in texts:
        # Splitting per text yields the same tokens, in the same order,
        # as splitting the space-joined corpus
        tally.update(entry.split())
    return tally

# Calculate word frequency for the 'question' column only
# (no counts are computed for 'context' in this cell)
question_word_freq = word_frequency(questions)


In [None]:
# Plot the top 20 most common words in 'question'
question_common_words = question_word_freq.most_common(20)
# Split the (word, count) pairs into two parallel tuples for plotting
question_words, question_counts = zip(*question_common_words)

plt.figure(figsize=(12, 6))
sns.barplot(x=list(question_words), y=list(question_counts))
plt.title('Top 20 Most Common Words in Questions')
plt.xlabel('Words')
plt.ylabel('Frequency')
# The semicolon must end the last statement (not sit on its own line)
# to keep the call's return value out of the cell output
plt.xticks(rotation=45);

In [None]:
# Define WordCloud parameters: width, height, and background color
wordcloud = WordCloud(width=800, height=400, background_color='white')

# Generate the word cloud from the question word counts
# (question_word_freq maps each word to its number of occurrences)
wordcloud.generate_from_frequencies(question_word_freq)

# Display the word cloud using matplotlib; the axes carry no meaning for an
# image, so hide them. The trailing semicolon suppresses the return-value repr.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');

## N-grams

In [None]:
# Toy sentence pair used to illustrate tokenization and n-grams
text = "I love teaching NLP. N-grams are very useful in text analysis."
# Same tokenizer the n-gram helper below uses, via the direct import
tokens = word_tokenize(text)
print(tokens)

In [None]:
# Materialize the bigram generator so the notebook displays every pair
list(ngrams(tokens, 2))

In [None]:
# Small illustration of Counter: tally how often each item appears
data = 'apple banana apple orange banana apple'.split()
counter = Counter(data)
print(counter)

In [None]:
def ngrams_frequency(texts, n):
    """Count n-grams of tokens across a collection of strings.

    Each text is tokenized with `word_tokenize`; n-grams are built per text,
    so no n-gram spans two different texts.

    Args:
        texts: Iterable of strings.
        n: Size of the n-grams (2 for bigrams, 3 for trigrams, ...).

    Returns:
        A `Counter` mapping each n-gram (a tuple of tokens) to its count.
    """
    tally = Counter()
    for entry in texts:
        tally.update(ngrams(word_tokenize(entry), n))
    return tally

# Calculate bigram frequency for 'question'
bigrams_freq = ngrams_frequency(questions, 2)

# Plot the top 20 most common bigrams in 'question'
bigrams_common = bigrams_freq.most_common(20)
# Keep the raw token tuples under their own name instead of rebinding `bigrams`
bigram_tuples, bigrams_counts = zip(*bigrams_common)
# Join each token tuple into a single string to use as an axis label
bigrams = [' '.join(bigram) for bigram in bigram_tuples]

plt.figure(figsize=(12, 6))
sns.barplot(x=bigrams, y=list(bigrams_counts))
plt.title('Top 20 Most Common Bigrams in Questions')
plt.xlabel('Bigrams')
plt.ylabel('Frequency')
# Trailing semicolon keeps the call's return value out of the cell output
plt.xticks(rotation=45);

In [None]:
# Calculate trigram frequency for 'question'
trigrams_freq = ngrams_frequency(questions, 3)

# Plot the top 20 most common trigrams in 'question'
trigrams_common = trigrams_freq.most_common(20)
# Keep the raw token tuples under their own name instead of rebinding `trigrams`
trigram_tuples, trigrams_counts = zip(*trigrams_common)
# Join each token tuple into a single string to use as an axis label
trigrams = [' '.join(trigram) for trigram in trigram_tuples]

plt.figure(figsize=(12, 6))
sns.barplot(x=trigrams, y=list(trigrams_counts))
plt.title('Top 20 Most Common Trigrams in Questions')
plt.xlabel('Trigrams')
plt.ylabel('Frequency')
# Trailing semicolon keeps the call's return value out of the cell output
plt.xticks(rotation=45);

## Next Word Prediction

In [None]:
from collections import defaultdict, Counter
import random

In [None]:
def build_word_model(texts):
    """Build a first-order word transition table from raw strings.

    Each text is split on whitespace; for every adjacent pair
    (current, following) the count of `following` under `current` is
    incremented. Words that are never followed by another word get no entry.

    Args:
        texts: Iterable of strings.

    Returns:
        A `defaultdict(Counter)` mapping word -> Counter of the words seen
        immediately after it.
    """
    transitions = defaultdict(Counter)
    for sentence in texts:
        tokens = sentence.split()
        # Pair every token with its successor; the last token has none
        for current, following in zip(tokens, tokens[1:]):
            transitions[current][following] += 1
    return transitions

In [None]:
# Tiny hand-written corpus so the resulting transition counts can be checked by eye
texts = ["hello world", "hello there", "world of code", "hello world of code"]
model = build_word_model(texts)
# Bare last expression: display the word -> Counter(next word) table
model

In [None]:
def predict_next_word(current_word, model):
    """Sample a follower of `current_word`, weighted by observed counts.

    Args:
        current_word: The word whose successor should be predicted.
        model: Mapping of word -> Counter of following words, as produced
            by `build_word_model`.

    Returns:
        One of the recorded following words, chosen with probability
        proportional to its count, or None when the word is unknown or has
        no followers with a positive total count.
    """
    # .get() never inserts a key, so a defaultdict model is left untouched
    next_words = model.get(current_word)
    if not next_words:
        return None

    total_count = sum(next_words.values())
    # An all-zero Counter would make randint(1, 0) raise ValueError
    if total_count <= 0:
        return None

    # Draw a position in 1..total_count and walk the cumulative counts
    # until the bucket containing that position is reached
    rand_val = random.randint(1, total_count)
    cumulative_count = 0
    for word, count in next_words.items():
        cumulative_count += count
        if cumulative_count >= rand_val:
            return word
    return None

In [None]:
# Example usage of the predict_next_word function on the toy model.
# The choice is random and weighted by the counts recorded for "hello".
next_word = predict_next_word("hello", model)
print(next_word)  # Output could be "world" or "there"

In [None]:
def generate_sequence(start_word, model, length):
    """Generate a space-joined chain of words by repeated next-word sampling.

    Args:
        start_word: First word of the sequence; always included in the result.
        model: Mapping of word -> Counter of following words.
        length: Maximum number of words in the result, start word included.

    Returns:
        The generated words joined by single spaces. The result is shorter
        than `length` when a word with no known follower is reached.
    """
    words = [start_word]
    for _ in range(length - 1):
        # Always continue from the most recently emitted word
        candidate = predict_next_word(words[-1], model)
        if not candidate:
            # Dead end: nothing was ever observed after this word
            break
        words.append(candidate)
    return ' '.join(words)

In [None]:
# Example usage of the generate_sequence function on the toy model.
# The output may contain fewer than sequence_length words: generation stops
# early once a word with no recorded follower is reached.
start_word = "hello"
sequence_length = 10
generated_sequence = generate_sequence(start_word, model, sequence_length)
print(generated_sequence)

In [None]:
# Build the next-word transition table from the dataset's questions
word_model = build_word_model(questions)

In [None]:
# The model holds one entry per distinct word, so preview only the first five
# words and, for each, its five most frequent followers
{word: followers.most_common(5) for word, followers in list(word_model.items())[:5]}

In [None]:
# Example usage: Predict the next word for a given word in a question.
# The lookup is case-sensitive because the model keys are the raw split() tokens.
# NOTE(review): predict_next_word returns None for a word absent from the model,
# in which case the message below prints 'None'.
current_word = "What"
next_word = predict_next_word(current_word, word_model)
print(f"The predicted next word for '{current_word}' is '{next_word}'.")