# **Natural Language Processing**

Natural Language Processing (NLP) is a field of artificial intelligence that enables computers to understand, interpret, and generate human language

##  1.Text Preprocessing

Text preprocessing transforms raw text into a clean format suitable for analysis

In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [8]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sample text
text = "Natural Language Processing (NLP) is fascinating! It helps computers understand human language."

### Lowercasing

In [4]:
# Lowercasing
text = text.lower()
print("Lowercase:", text)

Lowercase: natural language processing (nlp) is fascinating! it helps computers understand human language.


### Remove special characters and numbers

In [5]:
# Remove special characters and numbers
text = re.sub(r'[^a-zA-Z\s]', '', text)
print("Without special chars:", text)

Without special chars: natural language processing nlp is fascinating it helps computers understand human language


### Tokenization

In [9]:
# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['natural', 'language', 'processing', 'nlp', 'is', 'fascinating', 'it', 'helps', 'computers', 'understand', 'human', 'language']


### Remove stopwords

In [10]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]
print("Without stopwords:", filtered_tokens)

Without stopwords: ['natural', 'language', 'processing', 'nlp', 'fascinating', 'helps', 'computers', 'understand', 'human', 'language']


In [11]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

### Stemming

In [12]:
# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("Stemmed words:", stemmed_words)

Stemmed words: ['natur', 'languag', 'process', 'nlp', 'fascin', 'help', 'comput', 'understand', 'human', 'languag']


### Lemmatization

In [13]:

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized words:", lemmatized_words)

Lemmatized words: ['natural', 'language', 'processing', 'nlp', 'fascinating', 'help', 'computer', 'understand', 'human', 'language']


## 2.Text Representation

Converting text to numerical formats is essential for machine learning models

In [15]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [17]:
!pip install --upgrade numpy
!pip install --upgrade gensim

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.5 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.5 which

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

In [2]:
# Sample dataset
texts = [
    "Natural language processing helps computers understand human language",
    "Machine learning models can process text data effectively",
    "Text preprocessing is an essential step in NLP pipelines",
    "Word embeddings capture semantic relationships between words"
]

### Bag of Words (BoW)

In [3]:
# 1. Bag of Words (BoW)
cv = CountVectorizer()
bow_matrix = cv.fit_transform(texts)
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=cv.get_feature_names_out())
print("Bag of Words Representation:")
print(bow_df.head())

Bag of Words Representation:
   an  between  can  capture  computers  data  effectively  embeddings  \
0   0        0    0        0          1     0            0           0   
1   0        0    1        0          0     1            1           0   
2   1        0    0        0          0     0            0           0   
3   0        1    0        1          0     0            0           1   

   essential  helps  ...  preprocessing  process  processing  relationships  \
0          0      1  ...              0        0           1              0   
1          0      0  ...              0        1           0              0   
2          1      0  ...              1        0           0              0   
3          0      0  ...              0        0           0              1   

   semantic  step  text  understand  word  words  
0         0     0     0           1     0      0  
1         0     0     1           0     0      0  
2         0     1     1           0     0      0  


### TF-IDF Representation

In [5]:
# 2. TF-IDF Representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(texts)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
print("\nTF-IDF Representation:")
print(tfidf_df.head())


TF-IDF Representation:
        an   between       can   capture  computers      data  effectively  \
0  0.00000  0.000000  0.000000  0.000000   0.316228  0.000000     0.000000   
1  0.00000  0.000000  0.362224  0.000000   0.000000  0.362224     0.362224   
2  0.34057  0.000000  0.000000  0.000000   0.000000  0.000000     0.000000   
3  0.00000  0.377964  0.000000  0.377964   0.000000  0.000000     0.000000   

   embeddings  essential     helps  ...  preprocessing   process  processing  \
0    0.000000    0.00000  0.316228  ...        0.00000  0.000000    0.316228   
1    0.000000    0.00000  0.000000  ...        0.00000  0.362224    0.000000   
2    0.000000    0.34057  0.000000  ...        0.34057  0.000000    0.000000   
3    0.377964    0.00000  0.000000  ...        0.00000  0.000000    0.000000   

   relationships  semantic     step      text  understand      word     words  
0       0.000000  0.000000  0.00000  0.000000    0.316228  0.000000  0.000000  
1       0.000000  0.0000

### Word Embeddings (Word2Vec)

In [6]:
# 3. Word Embeddings (Word2Vec)
# Prepare data for Word2Vec
tokenized_texts = [text.lower().split() for text in texts]

In [7]:
tokenized_texts

[['natural',
  'language',
  'processing',
  'helps',
  'computers',
  'understand',
  'human',
  'language'],
 ['machine',
  'learning',
  'models',
  'can',
  'process',
  'text',
  'data',
  'effectively'],
 ['text',
  'preprocessing',
  'is',
  'an',
  'essential',
  'step',
  'in',
  'nlp',
  'pipelines'],
 ['word',
  'embeddings',
  'capture',
  'semantic',
  'relationships',
  'between',
  'words']]

In [8]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=10, window=5, min_count=1, workers=4)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x7ea325b80050>

In [9]:
# Get vector for a word
word = "language"
if word in w2v_model.wv:
    print(f"\nWord2Vec embedding for '{word}':")
    print(w2v_model.wv[word])


Word2Vec embedding for 'language':
[-0.00536309  0.00236467  0.05104123  0.09010639 -0.0930436  -0.07117888
  0.06459852  0.08974349 -0.05016188 -0.03763942]


In [10]:

# Find similar words
print(f"\nSimilar words to 'text':")
try:
    similar_words = w2v_model.wv.most_similar("text", topn=3)
    for word, similarity in similar_words:
        print(f"{word}: {similarity:.4f}")
except KeyError:
    print("Word not in vocabulary or not enough training data")


Similar words to 'text':
language: 0.5436
pipelines: 0.4117
embeddings: 0.4115


In [None]:
import gensim.downloader as api

# Load pre-trained word2vec embeddings
word_vectors = api.load('word2vec-google-news-300')

# Find similar words
similar_words = word_vectors.most_similar('python')
print(similar_words)

# Word analogies
result = word_vectors.most_similar(positive=['king', 'woman'], negative=['man'])
print(f"king - man + woman = {result[0][0]}")  # Should output "queen"

# Visualize embeddings
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Select a few words to visualize
words = ['king', 'queen', 'man', 'woman', 'dog', 'cat', 'python', 'java']
word_vectors_list = [word_vectors[word] for word in words]

# Reduce to 2 dimensions for visualization
pca = PCA(n_components=2)
result = pca.fit_transform(word_vectors_list)

# Plot
plt.figure(figsize=(10, 8))
plt.scatter(result[:, 0], result[:, 1], alpha=0.5)
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.title("Word Embeddings Visualization")
plt.show()

## 3.Text Classification

Text classification assigns predefined categories to text documents

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

In [12]:
data = {
    'text': [
        "I love this product, it's amazing!",
        "Great service and excellent quality",
        "Terrible experience, would not recommend",
        "The worst purchase I've ever made",
        "Absolutely fantastic customer support",
        "Disappointed with the quality of the item",
        "Very happy with my purchase",
        "Product broke after one week",
        "Exceeded all my expectations",
        "Complete waste of money"
    ],
    'sentiment': [1, 1, 0, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
}

In [13]:
df = pd.DataFrame(data)

In [14]:
df.head()

Unnamed: 0,text,sentiment
0,"I love this product, it's amazing!",1
1,Great service and excellent quality,1
2,"Terrible experience, would not recommend",0
3,The worst purchase I've ever made,0
4,Absolutely fantastic customer support,1


In [15]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'], test_size=0.3, random_state=42
)

In [16]:
# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
# 1. Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
nb_pred = nb_classifier.predict(X_test_tfidf)
print("Naive Bayes Performance:")
print(f"Accuracy: {accuracy_score(y_test, nb_pred):.4f}")
print(classification_report(y_test, nb_pred))

Naive Bayes Performance:
Accuracy: 0.6667
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [18]:
# 2. Logistic Regression
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train_tfidf, y_train)
lr_pred = lr_classifier.predict(X_test_tfidf)
print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, lr_pred):.4f}")
print(classification_report(y_test, lr_pred))


Logistic Regression Performance:
Accuracy: 0.3333
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       0.00      0.00      0.00         2

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# 3. Support Vector Machine
svm_classifier = LinearSVC(random_state=42)
svm_classifier.fit(X_train_tfidf, y_train)
svm_pred = svm_classifier.predict(X_test_tfidf)
print("\nSVM Performance:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred):.4f}")
print(classification_report(y_test, svm_pred))


SVM Performance:
Accuracy: 0.6667
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [20]:
# Classify new texts
new_texts = [
    "I'm really happy with this purchase",
    "This product is horrible and disappointing"
]
new_texts_tfidf = vectorizer.transform(new_texts)
predictions = svm_classifier.predict(new_texts_tfidf)

In [21]:
print("\nNew Text Predictions:")
for text, pred in zip(new_texts, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"Text: '{text}' - Prediction: {sentiment}")


New Text Predictions:
Text: 'I'm really happy with this purchase' - Prediction: Positive
Text: 'This product is horrible and disappointing' - Prediction: Positive


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load AG News dataset
from datasets import load_dataset
dataset = load_dataset("ag_news")

# Prepare data
train_texts = dataset["train"]["text"][:10000]  # Using subset for speed
train_labels = dataset["train"]["label"][:10000]
test_texts = dataset["test"]["text"][:1000]
test_labels = dataset["test"]["label"][:1000]

# Create classification pipeline
text_clf = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB())
])

# Train the classifier
text_clf.fit(train_texts, train_labels)

# Evaluate
predictions = text_clf.predict(test_texts)
print(classification_report(test_labels, predictions))

# Predict new examples
new_texts = [
    "Oil prices rise due to supply concerns",
    "Scientists discover new species in Amazon rainforest",
    "New smartphone features advanced camera technology",
    "Local team wins championship game in overtime"
]
predictions = text_clf.predict(new_texts)

# Map prediction indices to category names
categories = {0: "World", 1: "Sports", 2: "Business", 3: "Science/Tech"}
for text, pred in zip(new_texts, predictions):
    print(f"Text: {text}")
    print(f"Category: {categories[pred]}\n")

## 4.Named Entity Recognition (NER)

NER identifies entities like names, locations, and organizations in text

In [22]:
import spacy
import nltk
from nltk.chunk import ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag


In [28]:
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [24]:
# Sample text
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California. The company released iPhone 14 in September 2022."

In [29]:
# 1. NER with NLTK
print("Named Entity Recognition with NLTK:")
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
entities = ne_chunk(tagged)
print(entities)

Named Entity Recognition with NLTK:
(S
  (PERSON Apple/NNP)
  (ORGANIZATION Inc./NNP)
  was/VBD
  founded/VBN
  by/IN
  (PERSON Steve/NNP Jobs/NNP)
  in/IN
  (GPE Cupertino/NNP)
  ,/,
  (GPE California/NNP)
  ./.
  The/DT
  company/NN
  released/VBD
  (ORGANIZATION iPhone/NN)
  14/CD
  in/IN
  September/NNP
  2022/CD
  ./.)


In [30]:
# 2. NER with spaCy
print("\nNamed Entity Recognition with spaCy:")
try:
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Display entities
    for ent in doc.ents:
        print(f"Entity: {ent.text} - Label: {ent.label_} - Description: {spacy.explain(ent.label_)}")

    # Custom function to display entities in context
    def display_entities(text):
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        print(f"\nEntities found in text: {len(entities)}")
        for entity, label in entities:
            print(f"- {entity}: {label}")

        return doc

    # Example with different text
    business_text = "Microsoft announced a partnership with OpenAI in New York last week. CEO Satya Nadella said the deal was worth $10 billion."
    display_entities(business_text)

except ImportError:
    print("To use spaCy, install it with: pip install spacy")
    print("Then download the English model: python -m spacy download en_core_web_sm")


Named Entity Recognition with spaCy:
Entity: Apple Inc. - Label: ORG - Description: Companies, agencies, institutions, etc.
Entity: Steve Jobs - Label: PERSON - Description: People, including fictional
Entity: Cupertino - Label: GPE - Description: Countries, cities, states
Entity: California - Label: GPE - Description: Countries, cities, states
Entity: 14 - Label: CARDINAL - Description: Numerals that do not fall under another type
Entity: September 2022 - Label: DATE - Description: Absolute or relative dates or periods

Entities found in text: 6
- Microsoft: ORG
- OpenAI: ORG
- New York: GPE
- last week: DATE
- Satya Nadella: PERSON
- $10 billion: MONEY


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# Create NER pipeline
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Sample text
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
entities = ner_pipeline(text)

for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.4f}")

## 5.Sentiment Analysis

Sentiment analysis determines the emotional tone behind text

In [31]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import matplotlib.pyplot as plt

nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [32]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [33]:
# Sample customer reviews
reviews = [
    "This product is amazing! I absolutely love it.",
    "Good quality but a bit expensive for what you get.",
    "Terrible customer service, would not recommend.",
    "The product was okay, nothing special.",
    "I hate this product, complete waste of money!",
    "Decent product but shipping took too long.",
    "Best purchase I've made this year!",
    "Not bad, but I expected more features."
]

In [34]:
# 1. VADER Sentiment Analysis
sia = SentimentIntensityAnalyzer()

vader_results = []
for review in reviews:
    sentiment_scores = sia.polarity_scores(review)
    compound_score = sentiment_scores['compound']

    if compound_score >= 0.05:
        sentiment = "Positive"
    elif compound_score <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    vader_results.append({
        'review': review,
        'compound_score': compound_score,
        'sentiment': sentiment
    })

In [35]:
vader_df = pd.DataFrame(vader_results)
print("VADER Sentiment Analysis:")
print(vader_df[['review', 'compound_score', 'sentiment']])

VADER Sentiment Analysis:
                                              review  compound_score sentiment
0     This product is amazing! I absolutely love it.          0.8620  Positive
1  Good quality but a bit expensive for what you ...          0.2382  Positive
2    Terrible customer service, would not recommend.         -0.6381  Negative
3             The product was okay, nothing special.         -0.0920  Negative
4      I hate this product, complete waste of money!         -0.7777  Negative
5         Decent product but shipping took too long.          0.0000   Neutral
6                 Best purchase I've made this year!          0.6696  Positive
7             Not bad, but I expected more features.          0.2323  Positive


In [36]:
# 2. TextBlob Sentiment Analysis
textblob_results = []
for review in reviews:
    analysis = TextBlob(review)
    polarity = analysis.sentiment.polarity

    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    textblob_results.append({
        'review': review,
        'polarity': polarity,
        'subjectivity': analysis.sentiment.subjectivity,
        'sentiment': sentiment
    })

In [37]:
textblob_df = pd.DataFrame(textblob_results)
print("\nTextBlob Sentiment Analysis:")
print(textblob_df[['review', 'polarity', 'subjectivity', 'sentiment']])


TextBlob Sentiment Analysis:
                                              review  polarity  subjectivity  \
0     This product is amazing! I absolutely love it.  0.625000      0.750000   
1  Good quality but a bit expensive for what you ...  0.100000      0.650000   
2    Terrible customer service, would not recommend. -1.000000      1.000000   
3             The product was okay, nothing special.  0.428571      0.535714   
4      I hate this product, complete waste of money! -0.316667      0.433333   
5         Decent product but shipping took too long.  0.058333      0.533333   
6                 Best purchase I've made this year!  1.000000      0.300000   
7             Not bad, but I expected more features.  0.250000      0.522222   

  sentiment  
0  Positive  
1  Positive  
2  Negative  
3  Positive  
4  Negative  
5  Positive  
6  Positive  
7  Positive  


In [38]:
# 3. Compare results from both methods
comparison = pd.DataFrame({
    'review': reviews,
    'vader_sentiment': vader_df['sentiment'],
    'textblob_sentiment': textblob_df['sentiment']
})

print("\nComparison of Sentiment Analysis Methods:")
print(comparison)


Comparison of Sentiment Analysis Methods:
                                              review vader_sentiment  \
0     This product is amazing! I absolutely love it.        Positive   
1  Good quality but a bit expensive for what you ...        Positive   
2    Terrible customer service, would not recommend.        Negative   
3             The product was okay, nothing special.        Negative   
4      I hate this product, complete waste of money!        Negative   
5         Decent product but shipping took too long.         Neutral   
6                 Best purchase I've made this year!        Positive   
7             Not bad, but I expected more features.        Positive   

  textblob_sentiment  
0           Positive  
1           Positive  
2           Negative  
3           Positive  
4           Negative  
5           Positive  
6           Positive  
7           Positive  


In [39]:
# Sample code for a real-world application - Analyze Twitter data
def analyze_tweet_sentiment(tweets):
    results = []
    sia = SentimentIntensityAnalyzer()

    for tweet in tweets:
        sentiment_scores = sia.polarity_scores(tweet)
        compound = sentiment_scores['compound']

        if compound >= 0.05:
            sentiment = "Positive"
        elif compound <= -0.05:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"

        results.append({
            'tweet': tweet,
            'compound': compound,
            'sentiment': sentiment
        })

    return pd.DataFrame(results)

In [41]:

# Sample tweets
sample_tweets = [
    "I can't believe how good the new update is! #excited",
    "Service down again for the third time this week. Frustrating!",
    "Just received my order, packaging looks nice.",
    "This company never responds to customer complaints #badservice"
]

tweet_analysis = analyze_tweet_sentiment(sample_tweets)
print("\nTwitter Sentiment Analysis:")
print(tweet_analysis)


Twitter Sentiment Analysis:
                                               tweet  compound sentiment
0  I can't believe how good the new update is! #e...   -0.4015  Negative
1  Service down again for the third time this wee...   -0.4926  Negative
2      Just received my order, packaging looks nice.    0.4215  Positive
3  This company never responds to customer compla...   -0.4019  Negative


In [None]:
from transformers import pipeline
import pandas as pd

# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# Load IMDb dataset
from datasets import load_dataset
dataset = load_dataset("imdb", split="test").shuffle(seed=42).select(range(100))

# Analyze sentiments
results = []
for review in dataset["text"][:10]:  # Analyze first 10 reviews
    truncated_review = review[:512]  # Most models have token limits
    sentiment = sentiment_analyzer(truncated_review)[0]
    results.append({
        "review_snippet": truncated_review[:100] + "...",
        "sentiment": sentiment["label"],
        "score": sentiment["score"]
    })

# Display results
pd.DataFrame(results)

## 6.Topic Modeling

Topic modeling discovers abstract topics in document collections

In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import matplotlib.pyplot as plt

In [43]:
# Sample dataset of news articles
articles = [
    "The stock market reached record highs today. Investors celebrated as tech stocks soared.",
    "Scientists discover new species in the Amazon rainforest. Biodiversity is crucial for ecosystem health.",
    "The basketball team won the championship after a close game. Fans celebrated the victory.",
    "New climate change report warns of rising sea levels. Global temperatures continue to increase.",
    "Tech company launches new smartphone with advanced AI features. The device will be available next month.",
    "Soccer match ends in dramatic penalty shootout. The underdog team emerged victorious.",
    "Study shows deforestation rates increasing in tropical regions. Conservation efforts are urgent.",
    "Quarterly earnings report exceeds expectations. Company stocks rise by 15 percent.",
    "New medical research shows promising results for cancer treatment. Clinical trials will begin next year.",
    "Tennis player wins grand slam tournament. This is her third major victory this year."
]

In [44]:
# Create document-term matrix
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
dtm = vectorizer.fit_transform(articles)
feature_names = vectorizer.get_feature_names_out()

In [47]:
# Function to display top words for each topic
def display_topics(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append({
            'topic': topic_idx,
            'top_words': top_words
        })
    return pd.DataFrame(topics)

In [48]:
# 1. Latent Dirichlet Allocation (LDA)
n_topics = 3
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(dtm)

print("LDA Topic Modeling:")
lda_topics = display_topics(lda, feature_names, 5)
print(lda_topics)

LDA Topic Modeling:
   topic                                    top_words
0      0  [report, stocks, increase, rising, climate]
1      1          [victory, team, new, company, tech]
2      2         [new, shows, year, treatment, begin]


In [49]:
# Document-topic distribution
lda_document_topics = lda.transform(dtm)
lda_doc_topic_df = pd.DataFrame(lda_document_topics)
lda_doc_topic_df.columns = [f'Topic {i}' for i in range(n_topics)]
lda_doc_topic_df['Dominant Topic'] = lda_doc_topic_df.idxmax(axis=1)
lda_doc_topic_df['Article'] = [a[:50] + "..." for a in articles]  # First 50 chars of each article

print("\nLDA Document-Topic Distribution:")
print(lda_doc_topic_df[['Article', 'Dominant Topic']])


LDA Document-Topic Distribution:
                                             Article Dominant Topic
0  The stock market reached record highs today. I...        Topic 0
1  Scientists discover new species in the Amazon ...        Topic 2
2  The basketball team won the championship after...        Topic 1
3  New climate change report warns of rising sea ...        Topic 0
4  Tech company launches new smartphone with adva...        Topic 1
5  Soccer match ends in dramatic penalty shootout...        Topic 2
6  Study shows deforestation rates increasing in ...        Topic 2
7  Quarterly earnings report exceeds expectations...        Topic 0
8  New medical research shows promising results f...        Topic 2
9  Tennis player wins grand slam tournament. This...        Topic 2


In [50]:
# 2. Non-Negative Matrix Factorization (NMF)
nmf = NMF(n_components=n_topics, random_state=42)
nmf.fit(dtm)

In [51]:
print("\nNMF Topic Modeling:")
nmf_topics = display_topics(nmf, feature_names, 5)
print(nmf_topics)


NMF Topic Modeling:
   topic                                    top_words
0      0           [new, report, sea, change, levels]
1      1  [tech, stocks, celebrated, company, market]
2      2         [shows, year, new, treatment, begin]


In [52]:
# Document-topic distribution for NMF
nmf_document_topics = nmf.transform(dtm)
nmf_doc_topic_df = pd.DataFrame(nmf_document_topics)
nmf_doc_topic_df.columns = [f'Topic {i}' for i in range(n_topics)]
nmf_doc_topic_df['Dominant Topic'] = nmf_doc_topic_df.idxmax(axis=1)
nmf_doc_topic_df['Article'] = [a[:50] + "..." for a in articles]

print("\nNMF Document-Topic Distribution:")
print(nmf_doc_topic_df[['Article', 'Dominant Topic']])


NMF Document-Topic Distribution:
                                             Article Dominant Topic
0  The stock market reached record highs today. I...        Topic 1
1  Scientists discover new species in the Amazon ...        Topic 0
2  The basketball team won the championship after...        Topic 1
3  New climate change report warns of rising sea ...        Topic 0
4  Tech company launches new smartphone with adva...        Topic 1
5  Soccer match ends in dramatic penalty shootout...        Topic 1
6  Study shows deforestation rates increasing in ...        Topic 2
7  Quarterly earnings report exceeds expectations...        Topic 1
8  New medical research shows promising results f...        Topic 2
9  Tennis player wins grand slam tournament. This...        Topic 2


In [53]:

# Interpret the topics (example)
topic_interpretations = {
    "Topic 0": "Finance & Business",
    "Topic 1": "Sports News",
    "Topic 2": "Science & Environment"
}

print("\nTopic Interpretations (based on top words):")
for topic, interpretation in topic_interpretations.items():
    print(f"{topic}: {interpretation}")


Topic Interpretations (based on top words):
Topic 0: Finance & Business
Topic 1: Sports News
Topic 2: Science & Environment


In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='train',
                               remove=('headers', 'footers', 'quotes'),
                               categories=['comp.graphics', 'rec.autos', 'sci.med', 'talk.politics.guns'],
                               random_state=42)

# Extract features with CountVectorizer
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
tf = vectorizer.fit_transform(newsgroups.data)

# Apply LDA
lda = LatentDirichletAllocation(n_components=4, random_state=42)
lda.fit(tf)

# Function to display top words per topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Display results
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, 10)


## 7.Machine Translation

Machine translation converts text from one language to another

In [54]:
import numpy as np
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer

In [55]:
# Sample sentences in English
english_texts = [
    "Hello, how are you doing today?",
    "Natural language processing is a fascinating field of study.",
    "The weather is beautiful outside."
]

In [56]:
def translate_text(texts, source_lang="en", target_lang="fr"):
    """
    Translate text using Hugging Face's MarianMT models

    Args:
        texts: List of texts to translate
        source_lang: Source language code (e.g., 'en' for English)
        target_lang: Target language code (e.g., 'fr' for French)

    Returns:
        List of translated texts
    """
    try:
        # Load model and tokenizer
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)

        # Translate
        translated = []
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", padding=True)
            translation = model.generate(**inputs)
            translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)
            translated.append(translated_text)

        return translated

    except Exception as e:
        return [f"Translation error: {e}"] * len(texts)

In [57]:
# Show code example without running it (which would require downloading models)
print("Machine Translation with Hugging Face Transformers:")
print("Note: The following code requires installing transformers and downloading models")

Machine Translation with Hugging Face Transformers:
Note: The following code requires installing transformers and downloading models


In [58]:
# Simulated translations for demonstration
translations = {
    "fr": [
        "Bonjour, comment allez-vous aujourd'hui ?",
        "Le traitement du langage naturel est un domaine d'étude fascinant.",
        "Le temps est magnifique dehors."
    ],
    "es": [
        "Hola, ¿cómo estás hoy?",
        "El procesamiento del lenguaje natural es un campo de estudio fascinante.",
        "El clima está hermoso afuera."
    ],
    "de": [
        "Hallo, wie geht es Ihnen heute?",
        "Natürliche Sprachverarbeitung ist ein faszinierendes Studiengebiet.",
        "Das Wetter ist wunderschön draußen."
    ]
}

In [59]:
# Display example translations
df = pd.DataFrame({"English": english_texts})
for lang, trans in translations.items():
    df[lang.upper()] = trans

print("\nExample Translations:")
print(df)


Example Translations:
                                             English  \
0                    Hello, how are you doing today?   
1  Natural language processing is a fascinating f...   
2                  The weather is beautiful outside.   

                                                  FR  \
0          Bonjour, comment allez-vous aujourd'hui ?   
1  Le traitement du langage naturel est un domain...   
2                    Le temps est magnifique dehors.   

                                                  ES  \
0                             Hola, ¿cómo estás hoy?   
1  El procesamiento del lenguaje natural es un ca...   
2                      El clima está hermoso afuera.   

                                                  DE  
0                    Hallo, wie geht es Ihnen heute?  
1  Natürliche Sprachverarbeitung ist ein faszinie...  
2                Das Wetter ist wunderschön draußen.  


In [60]:
# Simple evaluation of translation quality
print("\nTo evaluate translation quality, you would typically use:")
print("1. BLEU (Bilingual Evaluation Understudy) score")
print("2. METEOR (Metric for Evaluation of Translation with Explicit ORdering)")
print("3. Human evaluation")


To evaluate translation quality, you would typically use:
1. BLEU (Bilingual Evaluation Understudy) score
2. METEOR (Metric for Evaluation of Translation with Explicit ORdering)
3. Human evaluation


In [61]:
# Example of using custom datasets
print("\nUsing parallel corpus for training:")
print("For training custom MT models, you would use parallel datasets like:")
print("- WMT datasets (Conference on Machine Translation)")
print("- OpenSubtitles (movie subtitles)")
print("- TED talks")
print("- Europarl (European Parliament proceedings)")


Using parallel corpus for training:
For training custom MT models, you would use parallel datasets like:
- WMT datasets (Conference on Machine Translation)
- OpenSubtitles (movie subtitles)
- TED talks
- Europarl (European Parliament proceedings)


## 8.Question Answering Systems

Question answering systems extract answers to questions from text

In [62]:
import numpy as np
import pandas as pd
from transformers import pipeline


In [63]:
# Sample context passage
context = """
Natural Language Processing (NLP) is a subfield of artificial intelligence
that focuses on the interaction between computers and human language.
It involves processing and analyzing large amounts of natural language data.
NLP combines computational linguistics, machine learning, and deep learning models
to enable computers to understand, interpret, and generate human language in a valuable way.
The field began in the 1950s, but has seen significant advances since 2010 with the
application of deep learning techniques. Modern NLP applications include machine translation,
sentiment analysis, chatbots, and speech recognition. Companies like Google, Microsoft, and
Amazon heavily invest in NLP research to improve their products and services.
"""

In [64]:
# Sample questions
questions = [
    "What is NLP?",
    "When did NLP begin?",
    "What companies invest in NLP research?",
    "What techniques improved NLP since 2010?",
    "What applications use NLP technology?"
]

In [65]:
def answer_questions(context, questions):
    """
    Use a question answering model to find answers in the context

    Args:
        context: Text passage containing information
        questions: List of questions to answer

    Returns:
        DataFrame with questions, answers, and confidence scores
    """
    try:
        # Initialize QA pipeline
        qa_pipeline = pipeline("question-answering")

        results = []
        for question in questions:
            # Get answer
            answer = qa_pipeline(question=question, context=context)
            results.append({
                'question': question,
                'answer': answer['answer'],
                'confidence': answer['score'],
                'start': answer['start'],
                'end': answer['end']
            })

        return pd.DataFrame(results)

    except Exception as e:
        print(f"Error: {e}")
        print("Note: This code requires the transformers library and model download.")

        # Simulated results for demonstration
        return pd.DataFrame({
            'question': questions,
            'answer': [
                "a subfield of artificial intelligence that focuses on the interaction between computers and human language",
                "in the 1950s",
                "Google, Microsoft, and Amazon",
                "deep learning",
                "machine translation, sentiment analysis, chatbots, and speech recognition"
            ],
            'confidence': np.random.uniform(0.7, 0.95, len(questions))
        })

In [66]:
# Display example without running actual model
print("Question Answering System:")
print("Note: The following example would require downloading models")
print("\nContext passage (excerpt):")
print(context[:150] + "...")

Question Answering System:
Note: The following example would require downloading models

Context passage (excerpt):

Natural Language Processing (NLP) is a subfield of artificial intelligence 
that focuses on the interaction between computers and human language. 
It...


In [67]:
# Get simulated answers
answers_df = answer_questions(context, questions)
print("\nQuestion-Answer Results:")
print(answers_df[['question', 'answer', 'confidence']])

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu



Question-Answer Results:
                                   question  \
0                              What is NLP?   
1                       When did NLP begin?   
2    What companies invest in NLP research?   
3  What techniques improved NLP since 2010?   
4     What applications use NLP technology?   

                                              answer  confidence  
0                        Natural Language Processing    0.831354  
1                                              1950s    0.785927  
2                    Google, Microsoft, and \nAmazon    0.951905  
3                           deep learning techniques    0.538020  
4  machine translation, \nsentiment analysis, cha...    0.911289  


In [68]:
# Building a simple retrieval-based QA system
print("\nBuilding a Simple Retrieval-Based QA System:")
print("1. Create a knowledge base with documents")
print("2. Index documents for efficient retrieval")
print("3. For each question:")
print("   a. Retrieve relevant documents")
print("   b. Extract answer spans from documents")
print("   c. Rank candidate answers")
print("   d. Return best answer")


Building a Simple Retrieval-Based QA System:
1. Create a knowledge base with documents
2. Index documents for efficient retrieval
3. For each question:
   a. Retrieve relevant documents
   b. Extract answer spans from documents
   c. Rank candidate answers
   d. Return best answer


In [69]:

# Example with multiple documents
documents = [
    "NLP techniques include tokenization, part-of-speech tagging, named entity recognition, and parsing.",
    "The Transformer architecture, introduced in 2017, revolutionized NLP with models like BERT and GPT.",
    "Word embeddings represent words as vectors in high-dimensional space, capturing semantic relationships.",
    "Transfer learning in NLP involves pre-training models on large text corpora and fine-tuning for specific tasks."
]

print("\nIn a real application, you would combine document retrieval with extractive QA:")
print("1. First find relevant documents containing the answer")
print("2. Then extract the precise answer span from those documents")


In a real application, you would combine document retrieval with extractive QA:
1. First find relevant documents containing the answer
2. Then extract the precise answer span from those documents


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Load model and tokenizer
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Create QA pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Example
context = """
Natural Language Processing (NLP) is a field of artificial intelligence
that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to enable computers to understand, interpret, and generate
human language in a valuable way.
"""

question = "What is the goal of NLP?"
result = qa_pipeline(question=question, context=context)

print(f"Answer: {result['answer']}")
print(f"Score: {result['score']:.4f}")
print(f"Start: {result['start']}, End: {result['end']}")

## 9.Language Generation - check

Language generation creates coherent and contextually relevant text.


In [70]:
import numpy as np
import random
import re
from transformers import pipeline, set_seed

In [71]:
# Set seed for reproducibility
set_seed(42)
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

In [72]:
# 1. Simple Markov Chain Text Generator
def train_markov_model(text, n=2):
    """
    Train a simple Markov chain model for text generation

    Args:
        text: Training text
        n: n-gram size (default: 2)

    Returns:
        Dictionary mapping n-grams to possible next words
    """
    words = text.split()
    model = {}

    for i in range(len(words) - n):
        gram = tuple(words[i:i+n])
        next_word = words[i+n]

        if gram in model:
            model[gram].append(next_word)
        else:
            model[gram] = [next_word]

    return model

In [73]:
def generate_text_markov(model, start_words, n=2, max_length=50):
    """
    Generate text using a Markov chain model

    Args:
        model: Trained Markov model
        start_words: List of words to start generation
        n: n-gram size (default: 2)
        max_length: Maximum length of generated text (default: 50)

    Returns:
        Generated text
    """
    current = tuple(start_words[-n:])
    result = list(current)

    for _ in range(max_length - n):
        if current in model:
            next_word = random.choice(model[current])
            result.append(next_word)
            current = tuple(result[-n:])
        else:
            break

    return ' '.join(result)

In [74]:
# Sample text for Markov chain
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program computers
to process and analyze large amounts of natural language data. The goal is a computer capable of understanding
the contents of documents, including the contextual nuances of the language within them. The technology can then
accurately extract information and insights contained in the documents as well as categorize and organize the
documents themselves.
"""

In [75]:

# Train Markov model
print("Markov Chain Text Generation:")
markov_model = train_markov_model(sample_text, n=2)
generated_text = generate_text_markov(markov_model, ["Natural", "language"], n=2, max_length=30)
print(generated_text)

Markov Chain Text Generation:
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to


In [76]:
# 2. Hugging Face Transformer-based text generation
def generate_with_transformers(prompt, max_length=50):
    """
    Generate text using Hugging Face transformer models

    Args:
        prompt: Initial text prompt
        max_length: Maximum length of generated text

    Returns:
        Generated text
    """
    try:
        # Initialize text generation pipeline with GPT-2
        generator = pipeline('text-generation', model='gpt2')

        # Generate text
        result = generator(prompt, max_length=max_length, num_return_sequences=1)
        return result[0]['generated_text']

    except Exception as e:
        # Simulated output for demonstration
        print(f"Note: This would require the transformers library and model download.")

        simulated_outputs = {
            "Write a short paragraph about artificial intelligence.":
                "Write a short paragraph about artificial intelligence. Artificial intelligence is transforming how we interact with technology and reshaping various industries. AI systems can now perform tasks that traditionally required human intelligence, such as visual perception, speech recognition, and decision-making. As these technologies continue to advance, they promise to solve complex problems but also raise important ethical considerations about privacy, bias, and the future of work.",

            "Create a product description for a new smartphone.":
                "Create a product description for a new smartphone. Introducing the NextGen X1, the smartphone that redefines innovation. Featuring a stunning 6.7-inch AMOLED display with ProMotion technology, the X1 delivers breathtaking visuals with vibrant colors and deep contrasts. Powered by our latest A16 processor, it handles everything from everyday tasks to intensive gaming with remarkable efficiency. Capture life's moments in extraordinary detail with the revolutionary 108MP camera system equipped with enhanced night mode and 8K video recording capabilities.",

            "Write a recipe for chocolate chip cookies.":
                "Write a recipe for chocolate chip cookies. Classic Chocolate Chip Cookies: Mix 1 cup softened butter with 3/4 cup white sugar and 3/4 cup brown sugar until creamy. Beat in 2 eggs and 2 teaspoons vanilla extract. In another bowl, combine 2 1/4 cups flour, 1 teaspoon baking soda, and 1/2 teaspoon salt. Gradually add dry ingredients to wet mixture. Fold in 2 cups chocolate chips. Drop rounded tablespoons onto ungreased baking sheets. Bake at 375°F for 9-11 minutes until golden brown. Cool on wire racks. Makes about 36 cookies."
        }

        # Return closest match or default text
        for key, value in simulated_outputs.items():
            if prompt in key:
                return value

        return prompt + " AI technology continues to advance rapidly, transforming industries and creating new opportunities. Recent developments in natural language processing have enabled systems to understand and generate human language with impressive accuracy and fluency."


In [78]:

# Example prompts
prompts = [
    "Write a short paragraph about artificial intelligence.",
    "Create a product description for a new smartphone.",
    "Write a recipe for chocolate chip cookies."
]

# Generate text with transformers (simulated)
print("\nTransformer-based Text Generation:")
for prompt in prompts:
    print(f"\nPrompt: {prompt}")
    generated = generate_with_transformers(prompt)
    print(generated)

# 3. Practical applications of text generation
print("\nPractical Applications of Language Generation:")
print("1. Content creation (articles, marketing copy)")
print("2. Chatbots and virtual assistants")
print("3

SyntaxError: unterminated string literal (detected at line 19) (<ipython-input-78-aa87603e75c4>, line 19)

## 10.Word Sense Disambiguation

Word sense disambiguation determines the meaning of words in different contexts

In [79]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
import pandas as pd

In [80]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [81]:
# Sample sentences with ambiguous words
sentences = [
    "The bank is closed today.",
    "He deposited money in the bank.",
    "The river bank was eroding after the flood.",
    "They were fishing by the bank of the lake.",
    "I need to run to catch the bus.",
    "She runs a successful business.",
    "The software will run on any computer.",
    "They went for a morning run in the park."
]

In [82]:
# 1. WordNet-based Word Sense Disambiguation
def get_wordnet_pos(tag):
    """
    Map POS tag to WordNet POS tag
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('V'):
        return wn.VERB
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    else:
        return None

In [84]:
def disambiguate_word(word, sentence, pos=None):
    """
    Disambiguate a word in a sentence using Lesk algorithm
    """
    # Get synset using simplified Lesk algorithm
    synset = lesk(sentence.split(), word, pos)

    if synset:
        return {
            'synset': synset.name(),
            'definition': synset.definition(),
            'examples': synset.examples()
        }
    else:
        return {
            'synset': 'Not found',
            'definition': 'Not available',
            'examples': []
        }



In [None]:
# Analyze ambiguous words in the sentences
results = []

In [85]:
for sentence in sentences:
    # Tokenize and get POS tags
    tokens = nltk.word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)

    # Focus on ambiguous words
    if "bank" in tokens:
        word = "bank"
    elif "run" in tokens or "runs" in tokens:
        word = "run" if "run" in tokens else "runs"
    else:
        continue

    # Get word index and POS
    word_indices = [i for i, token in enumerate(tokens) if token == word or token == "runs"]
    if not word_indices:
        continue

    word_index = word_indices[0]
    word_pos = pos_tags[word_index][1]
    wn_pos = get_wordnet_pos(word_pos)

    # Disambiguate
    disambiguation = disambiguate_word(tokens[word_index], sentence, wn_pos)

    # Add to results
    results.append({
        'sentence': sentence,
        'ambiguous_word': tokens[word_index],
        'pos': word_pos,
        'synset': disambiguation['synset'],
        'definition': disambiguation['definition'],
        'examples': ', '.join(disambiguation['examples']) if disambiguation['examples'] else 'N/A'
    })

In [86]:
# Display results
df_results = pd.DataFrame(results)
print("Word Sense Disambiguation Results:")
print(df_results[['sentence', 'ambiguous_word', 'synset', 'definition']])

# 2. Context-based WSD: Identify sense based on context
def simple_context_wsd(word, context, senses):
    """
    Simple context-based word sense disambiguation
    """
    max_overlap = 0
    best_sense = None

    # Lower case everything for comparison
    context_words = set(context.lower().split())

    for sense_id, sense_def in senses.items():
        # Count overlapping words between context and definition
        definition_words = set(sense_def.lower().split())
        overlap = len(context_words.intersection(definition_words))

        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense_id

    return best_sense, max_overlap

Word Sense Disambiguation Results:
                                      sentence ambiguous_word  \
0                    The bank is closed today.           bank   
1              He deposited money in the bank.           bank   
2  The river bank was eroding after the flood.           bank   
3   They were fishing by the bank of the lake.           bank   
4              I need to run to catch the bus.            run   
5              She runs a successful business.           runs   
6       The software will run on any computer.            run   
7     They went for a morning run in the park.            run   

              synset                                         definition  
0          bank.n.07  a slope in the turn of a road or track; the ou...  
1  savings_bank.n.02  a container (usually with a slot in the top) f...  
2  savings_bank.n.02  a container (usually with a slot in the top) f...  
3          bank.n.09  a building in which the business of banking tr...  
4        

In [87]:
# Example for the word "bank"
bank_senses = {
    "financial": "A financial institution that accepts deposits and channels money into lending activities",
    "river": "The land alongside or sloping down to a river or lake",
    "rely": "To rely on or count on someone or something",
    "building": "A building in which the business of banking is conducted"
}

contexts = [
    "I needed to withdraw money so I went to the bank",
    "The river was flowing rapidly after the heavy rain, overflowing its bank",
    "You can bank on me to help you with your project",
    "The bank closed at 5pm so I couldn't cash my check"
]

print("\nContext-based Word Sense Disambiguation:")
for context in contexts:
    sense, score = simple_context_wsd("bank", context, bank_senses)
    print(f"Context: '{context}'")
    print(f"Detected sense: {sense} (overlap score: {score})")
    print(f"Definition: {bank_senses[sense]}")
    print()


Context-based Word Sense Disambiguation:
Context: 'I needed to withdraw money so I went to the bank'
Detected sense: river (overlap score: 2)
Definition: The land alongside or sloping down to a river or lake

Context: 'The river was flowing rapidly after the heavy rain, overflowing its bank'
Detected sense: river (overlap score: 2)
Definition: The land alongside or sloping down to a river or lake

Context: 'You can bank on me to help you with your project'
Detected sense: rely (overlap score: 2)
Definition: To rely on or count on someone or something

Context: 'The bank closed at 5pm so I couldn't cash my check'
Detected sense: river (overlap score: 1)
Definition: The land alongside or sloping down to a river or lake



In [88]:
# 3. Practical application: Building a WSD system
print("Steps to Build a Word Sense Disambiguation System:")
print("1. Collect sense inventory (e.g., from WordNet)")
print("2. Extract context features:")
print("   - Surrounding words")
print("   - Part-of-speech tags")
print("   - Named entities")
print("   - Semantic roles")
print("3. Train disambiguation model:")
print("   - Knowledge-based (like Lesk)")
print("   - Supervised (classifier with labeled examples)")
print("   - Neural (BERT, RoBERTa, etc.)")
print("4. Evaluate using benchmarks like SemEval")

Steps to Build a Word Sense Disambiguation System:
1. Collect sense inventory (e.g., from WordNet)
2. Extract context features:
   - Surrounding words
   - Part-of-speech tags
   - Named entities
   - Semantic roles
3. Train disambiguation model:
   - Knowledge-based (like Lesk)
   - Supervised (classifier with labeled examples)
   - Neural (BERT, RoBERTa, etc.)
4. Evaluate using benchmarks like SemEval


In [90]:
# 4. WSD with BERT (code example without running)

from transformers import BertTokenizer, BertModel
import torch

# Function to get contextual embeddings
def get_bert_embeddings(sentence, target_word):
    # Load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Tokenize and get token IDs for the target word
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    target_token_indices = [i for i, token in enumerate(tokens) if target_word in token]

    # Get BERT embeddings
    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]

    # Get embeddings for target word
    target_embeddings = [embeddings[i] for i in target_token_indices]

    return target_embeddings

# These contextual embeddings would then be used to classify the sense of 'bank'

In [91]:

# Example usage
sentence = "The bank is closed today due to the holiday"
target_word = "bank"
embeddings = get_bert_embeddings(sentence, target_word)
embeddings

[tensor([ 7.6836e-02,  1.2276e-01, -1.9325e-01,  2.0432e-02,  5.5254e-01,
          5.5034e-01, -1.7296e-01,  8.6749e-01, -2.6351e-02, -1.0727e-03,
         -8.5820e-02, -8.0303e-01,  2.9006e-02,  1.3479e+00, -3.0775e-01,
         -4.8827e-02,  5.2482e-01,  3.6622e-02,  8.1655e-02, -2.6390e-01,
         -7.2354e-01,  1.2996e-01, -5.3557e-03,  6.9407e-01,  2.8274e-01,
         -4.4009e-02, -1.2080e-01,  6.3439e-01, -1.8244e-01, -2.4939e-01,
         -1.0432e-01, -3.8516e-01, -4.2460e-01,  4.9058e-01, -2.3124e-01,
         -4.5125e-01, -5.6497e-01, -4.1233e-01, -4.9214e-01,  1.8448e-01,
          2.9292e-01, -7.0596e-02,  6.4871e-01,  1.3303e-01,  1.9299e-01,
         -4.9855e-01,  1.5061e-01, -2.4619e-01, -6.1600e-01, -7.2457e-01,
         -3.8282e-01,  1.2247e-01,  1.7121e-01, -2.4562e-01, -2.9531e-03,
          8.3056e-01, -1.2080e-01,  5.1864e-01,  5.1258e-01, -6.1010e-02,
          2.2625e-01, -2.2943e-01,  4.7527e-01, -3.2192e-01,  1.6663e-01,
         -3.9226e-02, -1.6496e-02,  3.

In [92]:

# 5. Evaluation of WSD systems
print("\nEvaluating WSD Systems:")
print("1. Precision: Correctly disambiguated instances / Total disambiguated instances")
print("2. Recall: Correctly disambiguated instances / Total instances that should be disambiguated")
print("3. F1 Score: Harmonic mean of precision and recall")
print("4. Accuracy: Correctly disambiguated instances / Total instances")
print("5. Benchmarks: SemEval, Senseval, WSD evaluation datasets")


Evaluating WSD Systems:
1. Precision: Correctly disambiguated instances / Total disambiguated instances
2. Recall: Correctly disambiguated instances / Total instances that should be disambiguated
3. F1 Score: Harmonic mean of precision and recall
4. Accuracy: Correctly disambiguated instances / Total instances
5. Benchmarks: SemEval, Senseval, WSD evaluation datasets


In [None]:
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')
nltk.download('punkt')

# Example sentences with ambiguous words
sentences = [
    "I can hear bass sounds",
    "They caught a large bass in the lake",
    "The crane lifted the heavy materials",
    "A crane flew over the wetlands"
]

# Disambiguate using Lesk algorithm
for sentence in sentences:
    ambiguous_word = "bass" if "bass" in sentence else "crane"
    tokenized = nltk.word_tokenize(sentence)
    synset = lesk(tokenized, ambiguous_word, 'n')

    print(f"Sentence: {sentence}")
    print(f"Ambiguous word: {ambiguous_word}")
    print(f"Detected meaning: {synset.definition() if synset else 'Not found'}")
    print()

## 11.Information Extraction

Information extraction identifies and extracts structured data from unstructured text

In [93]:
import re
import nltk
import pandas as pd
from nltk import ne_chunk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [94]:
# 1. Named Entity Recognition
def extract_entities(text):
    """
    Extract named entities from text using NLTK
    """
    entities = {
        'PERSON': [],
        'ORGANIZATION': [],
        'GPE': [],  # Geo-Political Entity
        'LOCATION': [],
        'DATE': [],
        'TIME': [],
        'MONEY': [],
        'PERCENT': [],
        'FACILITY': []
    }

    # Tokenize and tag
    sentences = sent_tokenize(text)
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)

        # Extract named entities
        tree = ne_chunk(tagged)

        # Process tree
        for subtree in tree:
            if isinstance(subtree, nltk.Tree):
                entity_type = subtree.label()
                entity_text = ' '.join([word for word, tag in subtree.leaves()])
                if entity_type in entities:
                    entities[entity_type].append(entity_text)

    # Remove duplicates and return
    for entity_type in entities:
        entities[entity_type] = list(set(entities[entity_type]))

    return entities

In [95]:
# 2. Relation Extraction
def extract_relations(text):
    """
    Simple relation extraction using patterns
    """
    relations = []

    # Simple patterns for relation extraction
    patterns = [
        # Person-Organization patterns (works for)
        (r'([A-Z][a-z]+ [A-Z][a-z]+) works for ([A-Z][a-zA-Z]+)', 'WORKS_FOR'),
        (r'([A-Z][a-z]+ [A-Z][a-z]+) is employed by ([A-Z][a-zA-Z]+)', 'WORKS_FOR'),
        (r'([A-Z][a-z]+ [A-Z][a-z]+) joined ([A-Z][a-zA-Z]+)', 'WORKS_FOR'),

        # Person-Person patterns (reports to)
        (r'([A-Z][a-z]+ [A-Z][a-z]+) reports to ([A-Z][a-z]+ [A-Z][a-z]+)', 'REPORTS_TO'),

        # Organization-Location patterns (based in)
        (r'([A-Z][a-zA-Z]+) is based in ([A-Z][a-z]+)', 'BASED_IN'),
        (r'([A-Z][a-zA-Z]+) headquarters in ([A-Z][a-z]+)', 'BASED_IN'),

        # Person-Role patterns (is a)
        (r'([A-Z][a-z]+ [A-Z][a-z]+) is the ([a-zA-Z]+) of ([A-Z][a-zA-Z]+)', 'HAS_ROLE'),
        (r'([A-Z][a-z]+ [A-Z][a-z]+), the ([a-zA-Z]+) of ([A-Z][a-zA-Z]+)', 'HAS_ROLE')
    ]

    # Apply patterns
    for sentence in sent_tokenize(text):
        for pattern, relation_type in patterns:
            matches = re.findall(pattern, sentence)
            for match in matches:
                if relation_type == 'HAS_ROLE':
                    entity1, role, entity2 = match
                    relations.append({
                        'entity1': entity1,
                        'relation': relation_type,
                        'entity2': entity2,
                        'role': role,
                        'sentence': sentence
                    })
                else:
                    entity1, entity2 = match
                    relations.append({
                        'entity1': entity1,
                        'relation': relation_type,
                        'entity2': entity2,
                        'sentence': sentence
                    })

    return relations

In [96]:
# 3. Event Extraction
def extract_events(text):
    """
    Extract events using verb phrase patterns
    """
    events = []

    # Sample event patterns (based on verb phrases)
    event_verbs = ['announced', 'launched', 'acquired', 'released', 'appointed', 'resigned']

    sentences = sent_tokenize(text)
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        tagged = pos_tag(tokens)

        # Look for event trigger verbs
        for i, (word, tag) in enumerate(tagged):
            if word.lower() in event_verbs:
                # Extract surrounding context
                start = max(0, i-5)
                end = min(len(tagged), i+6)
                context = ' '.join([t[0] for t in tagged[start:end]])

                # Extract subject (simplistic approach)
                subject = ""
                for j in range(i-1, -1, -1):
                    if tagged[j][1].startswith('NN'):
                        subject = tagged[j][0]
                        break

                events.append({
                    'event_type': word.upper(),
                    'trigger': word,
                    'subject': subject,
                    'context': context,
                    'sentence': sentence
                })

    return events

In [97]:
# Sample corporate news text
sample_text = """
Apple Inc. is based in Cupertino. The technology giant announced a new iPhone model yesterday.
Tim Cook, the CEO of Apple, presented the product during the annual conference.
Microsoft headquarters in Seattle and employs over 150,000 people worldwide.
Satya Nadella joined Microsoft in 1992 and became CEO in 2014.
Google acquired DeepMind for $500 million in 2014.
Sundar Pichai reports to Alphabet's board of directors.
Facebook released a new virtual reality headset last month.
Tesla is planning to build a new factory in Austin, Texas, which will create 5,000 jobs.
"""

In [98]:
# Extract named entities
print("Named Entity Recognition:")
entities = extract_entities(sample_text)
for entity_type, entity_list in entities.items():
    if entity_list:
        print(f"{entity_type}: {', '.join(entity_list)}")


Named Entity Recognition:
PERSON: Sundar, Satya, Apple, Tim, Microsoft, Facebook, Nadella, Google
ORGANIZATION: iPhone, Inc., Pichai, DeepMind, CEO
GPE: Tesla, Austin, Seattle, Cook, Apple, Microsoft, Texas, Alphabet, Cupertino


In [None]:



# Extract relations
print("\nRelation Extraction:")
relations = extract_relations(sample_text)
relations_df = pd.DataFrame(relations)
if not relations_df.empty:
    print(relations_df[['entity1', 'relation', 'entity2']])
else:
    print("No relations extracted with current patterns.")

# Extract events
print("\nEvent Extraction:")
events = extract_events(sample_text)
events_df = pd.DataFrame(events)
if not events_df.empty:
    print(events_df[['event_type', 'subject', 'context']])
else:
    print("No events extracted with current patterns.")

# 4. Information Extraction with spaCy (code example)
print("\nInformation Extraction with spaCy (example code):")
print("""
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Process text
doc = nlp("Apple Inc. was founded by Steve Jobs in Cupertino, California in 1976.")

# Named Entity Recognition
for ent in doc.ents:
    print(f"Entity: {ent.text}, Type: {ent.label_}, Description: {spacy.explain(ent.label_)}")

# Dependency Parsing for Relation Extraction
for token in doc:
    if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
        subject = token.text
        verb = token.head.text
        for child in token.head.children:
            if child.dep_ == "dobj":
                direct_object = child.text
                print(f"Relation: {subject} - {verb} - {direct_object}")
""")

# 5. Applications of Information Extraction
print("\nApplications of Information Extraction:")
print("1. Knowledge Graph Construction")
print("2. Business Intelligence (extracting insights from news, reports)")
print("3. Resume Parsing (extracting skills, experience)")
print("4. Customer Feedback Analysis")
print("5. Medical Records Analysis (extracting symptoms, diagnoses)")
print("6. Legal Document Processing (extracting clauses, parties, obligations)")
print("7. Competitive Intelligence")
print("8. Automated Data Entry")

# 6. Advanced Information Extraction with Transformers
print("\nAdvanced Information Extraction with Transformers (example):")
print("""
from transformers import pipeline

# Named Entity Recognition
ner_pipeline = pipeline("ner")
ner_results = ner_pipeline("Tim Cook is the CEO of Apple Inc. which is worth $2 trillion.")

# Group tokens belonging to the same entity
grouped_entities = []
current_entity = None
for entity in ner_results:
    if current_entity is None or entity["entity"].startswith("B-"):
        if current_entity is not None:
            grouped_entities.append(current_entity)
        current_entity = {
            "word": entity["word"],
            "entity_group": entity["entity"].split("-")[1],
            "score": entity["score"]
        }
    else:
        current_entity["word"] += entity["word"].replace("##", "")
        current_entity["score"] = (current_entity["score"] + entity["score"]) / 2

if current_entity is not None:
    grouped_entities.append(current_entity)

for entity in grouped_entities:
    print(f"{entity['word']} - {entity['entity_group']} ({entity['score']:.4f})")
""")

# 7. Evaluation metrics for Information Extraction
print("\nEvaluation Metrics for Information Extraction:")
print("1. Precision: Correctly extracted information / All extracted information")
print("2. Recall: Correctly extracted information / All information that should be extracted")
print("3. F1-Score: Harmonic mean of precision and recall")
print("4. Slot Error Rate (SER): Combines insertion, deletion, and substitution errors")







## 12.Chatbots and Dialogue Systems

Chatbots and dialogue systems enable natural language interaction between humans and computers

In [None]:
# Chatbots and Dialogue Systems Example
import re
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Rule-based Chatbot
class RuleBasedChatbot:
    def __init__(self):
        self.rules = [
            {
                "patterns": ["hello", "hi", "hey", "greetings"],
                "responses": ["Hello!", "Hi there!", "Hey! How can I help you?"]
            },
            {
                "patterns": ["how are you", "how are you doing", "how's it going"],
                "responses": ["I'm doing well, thanks for asking!", "I'm fine, how about you?"]
            },
            {
                "patterns": ["what is your name", "who are you", "tell me about yourself"],
                "responses": ["I'm a simple rule-based chatbot.", "My name is ChattyBot!"]
            },
            {
                "patterns": ["bye", "goodbye", "see you", "farewell"],
                "responses": ["Goodbye!", "See you later!", "Have a nice day!"]
            },
            {
                "patterns": ["thank you", "thanks"],
                "responses": ["You're welcome!", "Glad I could help!"]
            }
        ]

        # Default response
        self.default_responses = [
            "I'm not sure I understand.",
            "Could you rephrase that?",
            "I don't have information about that."
        ]

    def match_rule(self, user_message):
        user_message = user_message.lower()

        for rule in self.rules:
            for pattern in rule["patterns"]:
                if pattern in user_message:
                    return random.choice(rule["responses"])

        return random.choice(self.default_responses)

    def respond(self, user_message):
        return self.match_rule(user_message)

# 2. Retrieval-based Chatbot
class RetrievalChatbot:
    def __init__(self, qa_pairs):
        self.qa_pairs = qa_pairs
        self.questions = [pair["question"] for pair in qa_pairs]

        # Create TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer()
        self.question_vectors = self.vectorizer.fit_transform(self.questions)

    def find_best_match(self, user_query, threshold=0.4):
        # Vectorize the user query
        query_vector = self.vectorizer.transform([user_query])

        # Calculate similarities
        similarities = cosine_similarity(query_vector, self.question_vectors)[0]

        # Find best match
        best_match_idx = similarities.argmax()
        best_match_score = similarities[best_match_idx]

        if best_match_score >= threshold:
            return self.qa_pairs[best_match_idx]["answer"], best_match_score
        else:
            return "I'm not sure how to answer that. Could you rephrase your question?", 0.0

    def respond(self, user_message):
        response, confidence = self.find_best_match(user_message)
        return response

# Sample QA pairs for retrieval-based chatbot
qa_pairs = [
    {"question": "What is NLP?", "answer": "Natural Language Processing (NLP) is a field of AI that gives computers the ability to understand text and spoken words in the same way humans can."},
    {"question": "What are the main tasks in NLP?", "answer": "The main tasks in NLP include text classification, named entity recognition, sentiment analysis, machine translation, and question answering."},
    {"question": "How does sentiment analysis work?", "answer": "Sentiment analysis uses NLP techniques to determine the emotional tone behind text. It can classify text as positive, negative, or neutral."},
    {"question": "What is machine translation?", "answer": "Machine translation is the process of automatically translating text from one language to another using AI and NLP techniques."},
    {"question": "What is tokenization?", "answer": "Tokenization is the process of breaking text into smaller pieces called tokens, typically words or subwords."},
    {"question": "What are word embeddings?", "answer": "Word embeddings are vector representations of words that capture semantic meaning, allowing similar words to have similar vector representations."},
    {"question": "What is BERT?", "answer": "BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained language model that has achieved state-of-the-art results on many NLP tasks."},
    {"question": "How do chatbots work?", "answer": "Chatbots work by using NLP techniques to understand user input and generate appropriate responses, either through rule-based systems, retrieval-based methods, or generative models."}
]

# 3. Intent Classification
def classify_intent(user_message):
    """
    Simple intent classification
    """
    intents = {
        "greeting": ["hello", "hi", "hey", "good morning", "greetings"],
        "farewell": ["bye", "goodbye", "see you", "farewell"],
        "information": ["what is", "how does", "explain", "tell me about"],
        "help": ["help", "assist", "support", "guide"],
        "booking": ["book", "reserve", "schedule", "appointment"],
        "complaint": ["problem", "issue", "not working", "complaint", "dissatisfied"]
    }

    user_message = user_message.lower()

    # Calculate intent scores
    intent_scores = {}
    for intent, keywords in intents.items():
        score = sum(1 for keyword in keywords if keyword in user_message)
        intent_scores[intent] = score

    # Get the intent with the highest score
    max_score = max(intent_scores.values())
    if max_score > 0:
        best_intents = [intent for intent, score in intent_scores.items() if score == max_score]
        return random.choice(best_intents)
    else:
        return "unknown"

# 4. Entity Extraction
def extract_entities(user_message):
    """
    Simple pattern-based entity extraction
    """
    entities = {}

    # Date patterns
    date_pattern = r'\b(?:on\s+)?(\d{1,2}(?:st|nd|rd|th)?\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)|tomorrow|today|(?:next|this)\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday))\b'
    date_matches = re.findall(date_pattern, user_message, re.IGNORECASE)
    if date_matches:
        entities['date'] = date_matches[0]

    # Time patterns
    time_pattern = r'\b(?:at\s+)?(\d{1,2}(?::\d{2})?\s*(?:am|pm)?|noon|midnight)\b'
    time_matches = re.findall(time_pattern, user_message, re.IGNORECASE)
    if time_matches:
        entities['time'] = time_matches[0]

    # Location patterns
    location_pattern = r'\b(?:in|at|to)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b'
    location_matches = re.findall(location_pattern, user_message)
    if location_matches:
        entities['location'] = location_matches[0]

    # Numbers
    number_pattern = r'\b(\d+)\b'
    number_matches = re.findall(number_pattern, user_message)

In [None]:
# 4. Entity Extraction (continued)
    if number_matches:
        entities['number'] = number_matches[0]

    return entities

# 5. Dialogue Management with State Tracking
class DialogueManager:
    def __init__(self):
        self.states = {
            "greeting": {
                "required_entities": [],
                "response_templates": [
                    "Hello! How can I help you today?",
                    "Hi there! What can I do for you?"
                ],
                "next_states": ["booking", "information", "help"]
            },
            "booking": {
                "required_entities": ["date", "time", "location"],
                "response_templates": [
                    "I'll book your appointment for {date} at {time} in {location}.",
                    "Your booking is confirmed for {date} at {time} in {location}."
                ],
                "next_states": ["confirmation", "farewell"]
            },
            "information": {
                "required_entities": ["subject"],
                "response_templates": [
                    "Here's what I know about {subject}: ...",
                    "Let me tell you about {subject}: ..."
                ],
                "next_states": ["more_info", "booking", "farewell"]
            },
            "confirmation": {
                "required_entities": [],
                "response_templates": [
                    "Is there anything else you need help with?",
                    "Can I assist you with anything else?"
                ],
                "next_states": ["booking", "information", "farewell"]
            },
            "farewell": {
                "required_entities": [],
                "response_templates": [
                    "Thank you for chatting with me. Have a great day!",
                    "Goodbye! Feel free to come back if you have more questions."
                ],
                "next_states": []
            }
        }

        self.current_state = "greeting"
        self.collected_entities = {}

    def transition(self, user_message):
        # Extract intent
        intent = classify_intent(user_message)

        # Extract entities
        new_entities = extract_entities(user_message)
        self.collected_entities.update(new_entities)

        # Check if intent matches a valid next state
        if intent in self.states[self.current_state]["next_states"]:
            self.current_state = intent

        # Check if we have all required entities for current state
        missing_entities = []
        for entity in self.states[self.current_state]["required_entities"]:
            if entity not in self.collected_entities:
                missing_entities.append(entity)

        # Generate response
        if missing_entities:
            # Ask for missing entities
            entity_to_ask = missing_entities[0]
            return f"Could you please provide the {entity_to_ask}?"
        else:
            # Generate response using template
            template = random.choice(self.states[self.current_state]["response_templates"])
            return template.format(**self.collected_entities)

# 6. Full Dialogue System Example
def simulate_conversation(chatbot_type="rule"):
    """
    Simulate a conversation with a chatbot
    """
    if chatbot_type == "rule":
        chatbot = RuleBasedChatbot()
        print("Rule-based Chatbot Initialized")
    elif chatbot_type == "retrieval":
        chatbot = RetrievalChatbot(qa_pairs)
        print("Retrieval-based Chatbot Initialized")
    elif chatbot_type == "dialogue":
        chatbot = DialogueManager()
        print("Dialogue Manager Initialized")
    else:
        print("Invalid chatbot type")
        return

    print("Bot: Hello! Type 'quit' to exit.")

    conversation_history = []

    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            print("Bot: Goodbye!")
            break

        if chatbot_type == "dialogue":
            response = chatbot.transition(user_input)
        else:
            response = chatbot.respond(user_input)

        print(f"Bot: {response}")

        # Add to conversation history
        conversation_history.append({"user": user_input, "bot": response})

    return conversation_history

# 7. Intent Classification with ML (example code)
print("Intent Classification with Machine Learning (example code):")
print("""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Training data
training_data = [
    ("Hello there", "greeting"),
    ("Hi, how are you?", "greeting"),
    ("Good morning", "greeting"),
    ("Bye now", "farewell"),
    ("See you later", "farewell"),
    ("Goodbye", "farewell"),
    ("What time do you open?", "information"),
    ("Tell me about your services", "information"),
    ("How much does it cost?", "information"),
    ("I need help with my account", "help"),
    ("Can you assist me?", "help"),
    ("I'm having a problem", "help"),
    ("Book an appointment for tomorrow", "booking"),
    ("I want to schedule a meeting", "booking"),
    ("Reserve a table for two", "booking")
]

# Split into texts and labels
texts, labels = zip(*training_data)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Create pipeline with TF-IDF and SVM
intent_classifier = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC())
])

# Train the model
intent_classifier.fit(X_train, y_train)

# Evaluate
predictions = intent_classifier.predict(X_test)
print(classification_report(y_test, predictions))

# Predict intent for new text
new_text = "I want to book a table for dinner tonight"
predicted_intent = intent_classifier.predict([new_text])[0]
print(f"Predicted intent: {predicted_intent}")
""")

# 8. Chatbot Evaluation Metrics
print("\nChatbot Evaluation Metrics:")
print("1. Task Completion Rate: Percentage of tasks successfully completed")
print("2. Turn Correctness: Percentage of system turns that are appropriate")
print("3. Average Conversation Length: Number of turns to complete a task")
print("4. User Satisfaction: User ratings or feedback scores")
print("5. Response Appropriateness: Human judgment of response quality")
print("6. Domain Coverage: Range of topics the bot can handle")
print("7. Error Rate: Percentage of responses with errors")
print("8. Recovery Rate: Ability to recover from misunderstandings")

# 9. Building a Production-Ready Chatbot
print("\nSteps to Build a Production-Ready Chatbot:")
print("1. Define clear user goals and bot capabilities")
print("2. Design conversation flows and dialogue states")
print("3. Implement intent recognition")
print("4. Build entity extraction")
print("5. Create dialogue management system")
print("6. Develop response generation")
print("7. Add context awareness and personalization")
print("8. Implement fallback mechanisms and error handling")
print("9. Integrate with external systems (databases, APIs)")
print("10. Test with real users and continuously improve")
print("11. Monitor performance and user satisfaction")
print("12. Deploy with scalability in mind")

# 10. Natural Language Generation for Responses
def generate_personalized_response(intent, entities, context, user_info):
    """
    Example of personalized response generation
    """
    templates = {
        "greeting": [
            "Hello {user_name}! Welcome back. How can I help you today?",
            "Hi {user_name}! Nice to see you again. What can I do for you?"
        ],
        "booking": [
            "I've booked your {service_type} for {date} at {time}. A confirmation has been sent to {email}.",
            "Your {service_type} is confirmed for {date} at {time}. We'll send details to {email}."
        ],
        "information": [
            "Based on your preferences, here's information about {topic}: ...",
            "Here's what you should know about {topic}, {user_name}: ..."
        ]
    }

    # Select appropriate template
    if intent in templates:
        template = random.choice(templates[intent])

        # Fill in user info
        response = template.format(
            user_name=user_info.get("name", "there"),
            email=user_info.get("email", "your email"),
            **entities,
            **context
        )
        return response
    else:
        return "I'm not sure how to respond to that."

# Example usage
sample_user_info = {
    "name": "Alex",
    "email": "alex@example.com",
    "preferences": ["quick responses", "casual tone"]
}

sample_context = {
    "service_type": "dental appointment",
    "topic": "teeth whitening"
}

sample_entities = {
    "date": "next Monday",
    "time": "2:30pm"
}

print("\nPersonalized Response Examples:")
print(f"Greeting: {generate_personalized_response('greeting', {}, {}, sample_user_info)}")
print(f"Booking: {generate_personalized_response('booking', sample_entities, sample_context, sample_user_info)}")
print(f"Information: {generate_personalized_response('information', {}, sample_context, sample_user_info)}")

## 13.Text Summarization

Text summarization condenses text while preserving key information

In [None]:
# Text Summarization Example
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx

nltk.download('punkt')
nltk.download('stopwords')

# 1. Extractive Summarization with TextRank
def extractive_summarize(text, num_sentences=3):
    """
    Summarize text using TextRank algorithm

    Args:
        text: Text to summarize
        num_sentences: Number of sentences in the summary

    Returns:
        Summary text
    """
    # Preprocessing
    stop_words = set(stopwords.words('english'))
    sentences = sent_tokenize(text)

    # Clean and tokenize sentences
    clean_sentences = []
    original_sentences = []

    for sentence in sentences:
        original_sentences.append(sentence)
        words = [word.lower() for word in word_tokenize(sentence) if word.isalnum()]
        clean_sentences.append([word for word in words if word not in stop_words])

    # Build similarity matrix
    sentence_similarity_matrix = np.zeros((len(sentences), len(sentences)))

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sentence_similarity_matrix[i][j] = sentence_similarity(clean_sentences[i], clean_sentences[j])

    # Create graph and apply PageRank
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Sort sentences by score and select top ones
    ranked_sentences = sorted(((scores[i], i) for i in range(len(sentences))), reverse=True)

    # Get top sentences maintaining original order
    top_sentence_indices = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]
    top_sentence_indices.sort()

    summary = [original_sentences[i] for i in top_sentence_indices]

    return ' '.join(summary)

def sentence_similarity(sent1, sent2):
    """
    Calculate cosine similarity between two sentences
    """
    # Create word vectors
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    # Build vectors
    for word in sent1:
        vector1[all_words.index(word)] += 1

    for word in sent2:
        vector2[all_words.index(word)] += 1

    # Compute cosine similarity
    if sum(vector1) == 0 or sum(vector2) == 0:
        return 0.0

    return 1 - cosine_distance(vector1, vector2)

# 2. TF-IDF Based Summarization
def tfidf_summarize(text, num_sentences=3):
    """
    Summarize text using TF-IDF scoring
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Tokenize into sentences
    sentences = sent_tokenize(text)

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')

    # If we have only one sentence, return it
    if len(sentences) <= num_sentences:
        return text

    # Generate TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Calculate sentence scores based on TF-IDF values
    sentence_scores = []
    for i, sentence in enumerate(sentences):
        score = np.sum(tfidf_matrix[i].toarray())
        sentence_scores.append((score, i))

    # Sort sentences by score
    sentence_scores.sort(reverse=True)

    # Get top sentences maintaining original order
    top_sentence_indices = [sentence_scores[i][1] for i in range(min(num_sentences, len(sentence_scores)))]
    top_sentence_indices.sort()

    # Combine sentences
    summary = [sentences[i] for i in top_sentence_indices]
    return ' '.join(summary)

# Sample text for summarization
long_article = """
Natural Language Processing (NLP) is a field of artificial intelligence that gives computers the ability to understand text and spoken words in much the same way human beings can. NLP combines computational linguistics—rule-based modeling of human language—with statistical, machine learning, and deep learning models. Together, these technologies enable computers to process human language in the form of text or voice data and to 'understand' its full meaning, complete with the speaker or writer's intent and sentiment.

NLP drives computer programs that translate text from one language to another, respond to spoken commands, and summarize large volumes of text rapidly—even in real time. There's a good chance you've interacted with NLP in the form of voice-operated GPS systems, digital assistants, speech-to-text dictation software, customer service chatbots, and other consumer conveniences. But NLP also plays a growing role in enterprise solutions that help streamline business operations, increase employee productivity, and simplify mission-critical business processes.

The field of NLP has been developing since the 1950s, with roots in linguistics and computer science. The term "Natural Language Processing" was coined in the early 1960s by researchers focused on machine translation. In the early days, most NLP systems were rule-based, with teams of linguists writing formal rules for processing text. By the 1980s, statistical methods began to emerge, allowing computers to learn patterns from large collections of text. The 2010s saw a revolution in NLP with the rise of deep learning techniques, particularly neural networks.

Today, NLP is experiencing unprecedented growth thanks to the development of powerful deep learning models like BERT (Bidirectional Encoder Representations from Transformers) and GPT (Generative Pre-trained Transformer), which have dramatically improved the state-of-the-art across many NLP tasks. These models leverage massive amounts of training data and computational power to achieve remarkable results in language understanding and generation.

Common NLP tasks include sentiment analysis, which identifies the mood or subjective opinions within large amounts of text, including entire social media platforms. This helps companies understand how customers feel about specific products or brands. Named Entity Recognition identifies proper names of people, organizations, and places in text, while summarization condenses large volumes of text while preserving key information and meaning. Machine translation converts text from one language to another, and speech recognition converts spoken language to text.

Despite impressive advances, NLP still faces many challenges. Understanding context, sarcasm, humor, and cultural references remains difficult for machines. Dealing with low-resource languages that have limited available training data is another significant challenge. As NLP systems become more integrated into daily life, concerns about bias in language models, privacy implications, and the ethics of AI-generated content have grown more prominent.

The future of NLP promises greater language understanding capabilities, more natural human-computer interactions, and increasingly personalized experiences. As models continue to grow in size and sophistication, and as techniques for training on fewer examples improve, we can expect NLP to become even more pervasive in our technological landscape.
"""

# Run extractive summarization
print("TextRank Extractive Summarization:")
textrank_summary = extractive_summarize(long_article, num_sentences=3)
print(textrank_summary)

print("\nTF-IDF Based Summarization:")
tfidf_summary = tfidf_summarize(long_article, num_sentences=3)
print(tfidf_summary)

# 3. Abstractive Summarization (using transformers)
def abstractive_summarize(text, max_length=150):
    """
    Abstractive summarization using transformers (code example)
    """
    print("\nAbstractive Summarization with Transformers (code example):")
    print("""
    from transformers import pipeline

    # Initialize summarization pipeline
    summarizer = pipeline("summarization")

    # Generate summary
    summary = summarizer(text, max_length=max_length, min_length=30, do_sample=False)
    return summary[0]["summary_text"]
    """)

    # Simulated output for demonstration
    return "NLP is a field of AI that enables computers to understand human language. It combines linguistics with machine learning models to process text or voice data, understanding meaning and intent. NLP applications include translation, voice commands, and text summarization. Recent advances in deep learning models like BERT and GPT have dramatically improved NLP capabilities."

# Demonstrate abstractive summarization
print("\nAbstractive Summarization (simulated result):")
abstractive_summary = abstractive_summarize(long_article)
print(abstractive_summary)

# 4. Evaluation Metrics for Summarization
print("\nEvaluation Metrics for Text Summarization:")
print("1. ROUGE (Recall-Oriented Understudy for Gisting Evaluation)")
print("   - ROUGE-N: Measures n-gram overlap")
print("   - ROUGE-L: Measures longest common subsequence")
print("   - ROUGE-S: Measures skip-bigram similarity")
print("2. BLEU (Bilingual Evaluation Understudy)")
print("3. BERTScore: Uses BERT embeddings to compute similarity")
print("4. Human evaluation")
print("   - Coherence: Is the summary well-structured and coherent?")
print("   - Informativeness: Does the summary contain the key information?")
print("   - Non-redundancy: Is the summary free of repetition?")
print("   - Grammaticality: Is the summary grammatically correct?")

# Example ROUGE calculation function (simplified)
def calculate_rouge(reference, summary):
    """
    Simplified ROUGE-1 calculation
    """
    # Tokenize
    ref_tokens = set(word_tokenize(reference.lower()))
    sum_tokens = set(word_tokenize(summary.lower()))

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    ref_tokens = [token for token in ref_tokens if token not in stop_words]
    sum_tokens = [token for token in sum_tokens if token not in stop_words]

    # Calculate overlap
    overlap = set(ref_tokens).intersection(set(sum_tokens))

    # Calculate ROUGE-1 Precision, Recall, F1
    precision = len(overlap) / len(sum_tokens) if sum_tokens else 0
    recall = len(overlap) / len(ref_tokens) if ref_tokens else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Example reference summary
reference_summary = "NLP is an AI field enabling computers to understand human language by combining linguistics with machine learning. It powers translation, voice commands, and text summarization. Deep learning advances with models like BERT and GPT have significantly improved NLP capabilities."

# Calculate ROUGE scores for our summaries
print("\nROUGE-1 Scores:")
print("TextRank Summary:")
textrank_rouge = calculate_rouge(reference_summary, textrank_summary)
print(f"Precision: {textrank_rouge['precision']:.4f}, Recall: {textrank_rouge['recall']:.4f}, F1: {textrank_rouge['f1']:.4f}")

print("\nTF-IDF Summary:")
tfidf_rouge = calculate_rouge(reference_summary, tfidf_summary)
print(f"Precision: {tfidf_rouge['precision']:.4f}, Recall: {tfidf_rouge['recall']:.4f}, F1: {tfidf_rouge['f1']:.4f}")

print("\nAbstractive Summary:")
abstractive_rouge = calculate_rouge(reference_summary, abstractive_summary)
print(f"Precision: {abstractive_rouge['precision']:.4f}, Recall: {abstractive_rouge['recall']:.4f}, F1: {abstractive_rouge['f1']:.4f}")

# 5. Applications of Text Summarization
print("\nPractical Applications of Text Summarization:")
print("1. News headline generation")
print("2. Document summarization for legal and medical texts")
print("3. Meeting notes summarization")
print("4. Research paper abstract generation")
print("5. Email summarization")
print("6. Product review summarization")
print("7. Content curation and recommendation")

In [None]:
from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Sample article
article = """
Researchers have developed a new method for training language models that significantly reduces
computational requirements. The technique, called progressive layer training, gradually unfreezes
layers during the training process. This approach has shown to reduce training time by up to 40%
while maintaining comparable performance to traditional methods. The research team plans to
release their code and trained models to the public next month. This advancement could make
large language model training more accessible to organizations with limited computational resources.
"""

# Generate summary
summary = summarizer(article, max_length=100, min_length=30, do_sample=False)
print(summary[0]['summary_text'])

## 14.Neural Language Models

Neural language models predict the probability of sequences of words and form the foundation of modern NLP.


## 15.Transformers Architecture

Transformers revolutionized NLP with their self-attention mechanism, enabling parallel processing and better handling of long-range dependencies

In [1]:
import torch
from transformers import BertModel, BertTokenizer

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Prepare text input
text = "Transformers have revolutionized NLP."
inputs = tokenizer(text, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Access last hidden states (contextual embeddings)
last_hidden_states = outputs.last_hidden_state
print(f"Shape of embeddings: {last_hidden_states.shape}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Shape of embeddings: torch.Size([1, 9, 768])


## Transfer Learning in NLP

Transfer learning involves fine-tuning pre-trained models on specific tasks

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Load IMDB dataset
dataset = load_dataset("imdb")
train_dataset = dataset["train"].shuffle(seed=42).select(range(5000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(1000))

# Tokenize dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Load model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

# Define metric function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

## Text Generation

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Generate text
prompt = "Natural language processing is"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate
output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    do_sample=True
)

# Decode and print
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

## Document Similarity

Document similarity measures how closely related two texts are

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample documents
documents = [
    "Machine learning models learn from data",
    "Deep learning is a subset of machine learning",
    "Natural language processing analyzes text",
    "NLP applications include translation and summarization",
    "Machine learning algorithms improve with more data"
]

# Convert to TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a heatmap of similarities
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(cosine_sim, annot=True, cmap='YlGnBu', xticklabels=range(1, len(documents)+1),
            yticklabels=range(1, len(documents)+1))
plt.title('Document Similarity Matrix')
plt.xlabel('Document Number')
plt.ylabel('Document Number')
plt.tight_layout()
plt.show()

## Aspect-Based Sentiment Analysis

Aspect-based sentiment analysis determines sentiment toward specific aspects in text

In [None]:
from transformers import pipeline
from datasets import load_dataset

# Load aspect-based sentiment analysis model
absa = pipeline("text-classification", model="yangheng/deberta-v3-base-absa-v1.1")

# Sample reviews
reviews = [
    "The food was delicious but the service was slow.",
    "The battery life on this phone is excellent, but the camera quality is disappointing.",
    "The hotel room was spacious and clean, though the Wi-Fi connection was unstable."
]

# Extract aspects and sentiments
def extract_aspect_sentiments(review):
    # In real applications, you'd use a more sophisticated aspect extraction
    # This is a simplified example
    aspects = []
    if "food" in review.lower():
        aspects.append("food")
    if "service" in review.lower():
        aspects.append("service")
    if "battery" in review.lower():
        aspects.append("battery")
    if "camera" in review.lower():
        aspects.append("camera")
    if "room" in review.lower():
        aspects.append("room")
    if "wi-fi" in review.lower() or "wifi" in review.lower():
        aspects.append("wifi")

    results = []
    for aspect in aspects:
        # Create targeted input for the aspect
        input_text = f"{review} [ASP] {aspect} [ASP]"
        sentiment = absa(input_text)[0]
        results.append({
            "aspect": aspect,
            "sentiment": sentiment["label"],
            "confidence": sentiment["score"]
        })
    return results

# Process each review
for i, review in enumerate(reviews):
    print(f"Review {i+1}: {review}")
    results = extract_aspect_sentiments(review)
    for result in results:
        print(f"  Aspect: {result['aspect']}, Sentiment: {result['sentiment']}, Confidence: {result['confidence']:.4f}")
    print()

## Contextual Embeddings

Contextual embeddings represent words based on their context in a sentence.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Example sentences with polysemous words
sentences = [
    "The bank of the river was muddy after the rain.",
    "I need to go to the bank to deposit my check.",
    "The pilot had to bank the airplane to avoid the storm.",
    "I will bank on your support for this project."
]

# Get embeddings
embeddings = []
for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Use [CLS] token embedding as sentence representation
    embeddings.append(outputs.last_hidden_state[0][0].numpy())

# Reduce dimensionality for visualization
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Plot
plt.figure(figsize=(10, 8))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], marker='o', s=100)

# Add labels
for i, sentence in enumerate(sentences):
    plt.annotate(f"Sentence {i+1}",
                 (reduced_embeddings[i, 0], reduced_embeddings[i, 1]),
                 xytext=(5, 5),
                 textcoords='offset points')

plt.title("Contextual Embeddings of 'Bank' in Different Contexts")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()

## Dependency Parsing

Dependency parsing analyzes the grammatical structure of a sentence based on word dependencies.


In [None]:
import spacy
import pandas as pd
from spacy import displacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Example sentences
sentences = [
    "The cat chased the mouse.",
    "Students who study regularly pass exams easily.",
    "Despite the rain, the event was successful."
]

for sentence in sentences:
    doc = nlp(sentence)

    # Extract dependencies
    dependencies = []
    for token in doc:
        dependencies.append({
            "token": token.text,
            "dependency": token.dep_,
            "head_token": token.head.text,
            "children": [child.text for child in token.children]
        })

    print(f"Sentence: {sentence}")
    df = pd.DataFrame(dependencies)
    print(df)
    print()

    # Visualize dependency parse tree (in a notebook environment)
    # displacy.render(doc, style="dep", jupyter=True)

    # For non-notebook environment, you can save as HTML
    html = displacy.render(doc, style="dep", page=True)
    with open(f"dependency_parse_{sentences.index(sentence)}.html", "w", encoding="utf-8") as f:
        f.write(html)

## Coreference Resolution

Coreference resolution identifies when different expressions refer to the same entity.


In [None]:
import spacy
import neuralcoref
import pandas as pd

# Load model and add neuralcoref
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

# Example texts
texts = [
    "John called Mike yesterday. He wanted to discuss the project.",
    "The company released its annual report. It showed significant growth.",
    "Sandra told Mary that she had won the competition."
]

for text in texts:
    doc = nlp(text)

    print(f"Text: {text}")
    print(f"Coreferences: {doc._.coref_clusters}")

    # Show resolved text
    print(f"Resolved: {doc._.coref_resolved}")
    print()

    # Extract all mentions and their references
    if doc._.coref_clusters:
        mentions = []
        for cluster in doc._.coref_clusters:
            main_mention = cluster.main.text
            for mention in cluster.mentions:
                mentions.append({
                    "mention": mention.text,
                    "refers_to": main_mention,
                    "start": mention.start,
                    "end": mention.end
                })

        print(pd.DataFrame(mentions))
        print()

## Text Readability Analysis

Text readability analysis evaluates how easy or difficult a text is to read

In [None]:
import textstat
import pandas as pd
import matplotlib.pyplot as plt

# Example texts of varying complexity
texts = [
    "The cat sat on the mat. It was happy.",
    "The cardiovascular system consists of the heart, blood vessels, and blood. Its primary function is to transport oxygen, nutrients, hormones, and cellular waste products throughout the body.",
    "Quantum mechanics is a fundamental theory in physics that provides a description of the physical properties of nature at the scale of atoms and subatomic particles. It is the foundation of all quantum physics including quantum chemistry, quantum field theory, quantum technology, and quantum information science."
]

# Calculate readability metrics
results = []
for i, text in enumerate(texts):
    result = {
        "text_id": i+1,
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
        "syllable_count": textstat.syllable_count(text),
        "lexicon_count": textstat.lexicon_count(text),
        "sentence_count": textstat.sentence_count(text),
        "avg_sentence_length": textstat.avg_sentence_length(text),
        "avg_syllables_per_word": textstat.avg_syllables_per_word(text),
    }
    results.append(result)

# Display results
df = pd.DataFrame(results)
print(df[["text_id", "flesch_reading_ease", "flesch_kincaid_grade", "gunning_fog"]])

# Plot readability scores
plt.figure(figsize=(10, 6))
metrics = ["flesch_reading_ease", "flesch_kincaid_grade", "gunning_fog",
           "smog_index", "coleman_liau_index", "automated_readability_index"]

for i, text_id in enumerate(df["text_id"]):
    plt.plot(metrics, df.loc[df["text_id"] == text_id, metrics].values[0], marker='o', label=f"Text {text_id}")

plt.xlabel("Readability Metrics")
plt.ylabel("Score")
plt.title("Readability Scores Comparison")
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## Text Summarization with Extractive Methods

Extractive summarization identifies and extracts important sentences from the original text

In [None]:
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
import torch

nltk.download('punkt')
nltk.download('stopwords')

def get_sentence_embeddings(sentences, model, tokenizer):
    embeddings = []

    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)

        # Use mean pooling to get sentence embedding
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())

    return np.array(embeddings)

def textrank_summarize(text, num_sentences=3):
    # Load models
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
    model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

    # Split text into sentences
    sentences = sent_tokenize(text)

    if len(sentences) <= num_sentences:
        return text

    # Get sentence embeddings
    embeddings = get_sentence_embeddings(sentences, model, tokenizer)

    # Calculate similarity matrix
    sim_matrix = cosine_similarity(embeddings)

    # Create graph and apply PageRank
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Get top sentences
    ranked_sentences = sorted(((scores[i], i, s) for i, s in enumerate(sentences)), reverse=True)

    # Sort selected sentences by original position
    selected_indices = [item[1] for item in ranked_sentences[:num_sentences]]
    selected_indices.sort()

    # Join selected sentences
    summary = ' '.join([sentences[i] for i in selected_indices])

    return summary

# Example text for summarization
article = """
Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a valuable way.
NLP has many applications, including machine translation, sentiment analysis, speech recognition, and question answering systems.
Modern NLP techniques rely heavily on machine learning, especially deep learning models like transformers.
Transfer learning has revolutionized NLP by allowing researchers to fine-tune pre-trained models for specific tasks.
BERT, GPT, and other transformer-based models have achieved state-of-the-art results on various NLP benchmarks.
Despite significant progress, NLP still faces challenges such as understanding context, sarcasm, and ambiguity in language.
The field continues to evolve rapidly, with new models and techniques being developed to address these challenges.
Multimodal learning, which combines text with other data types like images and audio, is an emerging trend in NLP research.
As NLP technologies improve, they are becoming increasingly integrated into our daily lives through virtual assistants, chatbots, and other applications.
"""

# Generate summary
summary = textrank_summarize(article, num_sentences=3)
print("Original article length:", len(article.split()))
print("Summary length:", len(summary.split()))
print("\nSummary:")
print(summary)

## Language Detection

Language detection identifies the language of a given text.


In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langcodes import Language

# Set seed for consistent results
DetectorFactory.seed = 42

# Sample texts in different languages
texts = [
    "Natural language processing is a field of artificial intelligence.",  # English
    "Le traitement du langage naturel est un domaine de l'intelligence artificielle.",  # French
    "El procesamiento del lenguaje natural es un campo de la inteligencia artificial.",  # Spanish
    "Die Verarbeitung natürlicher Sprache ist ein Teilgebiet der künstlichen Intelligenz.",  # German
    "自然言語処理は人工知能の分野です。",  # Japanese
    "自然语言处理是人工智能的一个领域。",  # Chinese
    "Обработка естественного языка — это область искусственного интеллекта."  # Russian
]

# Detect languages
results = []
for text in texts:
    try:
        lang_code = detect(text)
        lang_name = Language.make(language=lang_code).display_name()
        confidence = 'N/A'  # langdetect doesn't provide confidence scores directly
    except:
        lang_code = 'unknown'
        lang_name = 'Unknown'
        confidence = 'N/A'

    results.append({
        'text': text[:50] + '...' if len(text) > 50 else text,
        'language_code': lang_code,
        'language_name': lang_name,
        'confidence': confidence
    })

# Display results
pd.DataFrame(results)

## Text Augmentation

Text augmentation generates variations of existing text data to expand training datasets.

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import pandas as pd

# Original sentences
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Natural language processing is fascinating.",
    "Deep learning models require large amounts of data."
]

# Initialize augmenters
synonym_aug = naw.SynonymAug(aug_src='wordnet')
insert_aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
swap_aug = naw.RandomWordAug(action="swap")
delete_aug = naw.RandomWordAug(action="delete")
spelling_aug = nac.KeyboardAug()

# Apply augmentations
results = []
for sentence in sentences:
    augmented = {
        'original': sentence,
        'synonym_replace': synonym_aug.augment(sentence)[0],
        'word_insert': insert_aug.augment(sentence)[0],
        'word_swap': swap_aug.augment(sentence)[0],
        'word_delete': delete_aug.augment(sentence)[0],
        'spelling_error': spelling_aug.augment(sentence)[0]
    }
    results.append(augmented)

# Display results
pd.DataFrame(results)

## Multi-Label Text Classification

Multi-label classification assigns multiple categories to a single document

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, hamming_loss
from sklearn.model_selection import train_test_split

# Generate sample dataset
texts = [
    "The movie had great acting but poor special effects",
    "The camera and battery life of this phone are excellent",
    "The hotel had a beautiful view and friendly staff",
    "The restaurant's food was delicious but the service was terrible",
    "The laptop has a fast processor but short battery life",
    "The car has good fuel economy and excellent handling",
    "The book had an engaging plot but weak character development",
    "The concert venue had great acoustics but uncomfortable seating",
    "The smartphone has a stunning display and impressive camera",
    "The coffee shop has delicious pastries and a cozy atmosphere",
    "The gym has modern equipment but limited space",
    "The hiking trail offers beautiful views and challenging terrain",
    "The app has an intuitive interface but frequent crashes",
    "The headphones have excellent sound quality but poor comfort",
    "The smartwatch has accurate fitness tracking and long battery life"
]

# Create multi-label dataset with 4 categories: Product, Service, Location, Entertainment
# 1 indicates presence of the category
labels = np.array([
    [0, 1, 0, 1],  # movie: service, entertainment
    [1, 0, 0, 0],  # phone: product
    [0, 1, 1, 0],  # hotel: service, location
    [0, 1, 1, 0],  # restaurant: service, location
    [1, 0, 0, 0],  # laptop: product
    [1, 0, 0, 0],  # car: product
    [0, 0, 0, 1],  # book: entertainment
    [0, 0, 1, 1],  # concert venue: location, entertainment
    [1, 0, 0, 0],  # smartphone: product
    [0, 1, 1, 0],  # coffee shop: service, location
    [0, 0, 1, 0],  # gym: location
    [0, 0, 1, 0],  # hiking trail: location
    [1, 0, 0, 0],  # app: product
    [1, 0, 0, 0],  # headphones: product
    [1, 0, 0, 0]   # smartwatch: product
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# Convert text to features
vectorizer = CountVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train multi-label classifier
forest = RandomForestClassifier(n_estimators=100, random_state=42)
multi_target_forest = MultiOutputClassifier(forest)
multi_target_forest.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = multi_target_forest.predict(X_test_vec)

print("Hamming loss:", hamming_loss(y_test, y_pred))
print("\nClassification report:")

# Categories
categories = ['Product', 'Service', 'Location', 'Entertainment']

# Per-class evaluation
for i, category in enumerate(categories):
    print(f"\nCategory: {category}")
    print(classification_report(y_test[:, i], y_pred[:, i], zero_division=0))

# Predict new examples
new_texts = [
    "This new tablet has a fast processor and beautiful display",
    "The resort had amazing views and exceptional dining options",
    "The movie had an excellent plot and outstanding performances"
]

new_X_vec = vectorizer.transform(new_texts)
new_predictions = multi_target_forest.predict(new_X_vec)

# Display predictions
for text, pred in zip(new_texts, new_predictions):
    print(f"\nText: {text}")
    print("Categories:", [categories[i] for i in range(len(categories)) if pred[i] == 1])

## Keyword Extraction

Keyword extraction identifies the most important terms in a document

In [None]:
import yake
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

nltk.download('stopwords')
nltk.download('punkt')

# Example documents
documents = [
    """Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
    between computers and humans through natural language. The ultimate goal of NLP is to enable computers
    to understand, interpret, and generate human language in a valuable way.""",

    """Machine learning is an application of artificial intelligence that provides systems the ability
    to automatically learn and improve from experience without being explicitly programmed.
    Machine learning focuses on the development of computer programs that can access data and use it to learn for themselves.""",

    """Computer vision is a field of artificial intelligence that trains computers to interpret and understand
    the visual world. Using digital images from cameras and videos and deep learning models,
    machines can accurately identify and classify objects and react to what they "see"."""
]

# Method 1: YAKE
def extract_keywords_yake(text, num_keywords=5):
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, dedupFunc='seqm', windowsSize=1, top=num_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [(kw[0], round(1-kw[1], 3)) for kw in keywords]  # Convert score for readability

# Method 2: TF-IDF
def extract_keywords_tfidf(documents, num_keywords=5):
    vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
    X = vectorizer.fit_transform(documents)

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    keywords_by_doc = []
    for i in range(len(documents)):
        # Get TF-IDF scores for this document
        tfidf_scores = X[i].toarray()[0]

        # Create (word, score) pairs and sort by score
        word_scores = [(feature_names[j], tfidf_scores[j]) for j in range(len(feature_names)) if tfidf_scores[j] > 0]
        word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

        # Get top keywords
        keywords_by_doc.append(word_scores[:num_keywords])

    return keywords_by_doc

# Method 3: TextRank-inspired (simplified)
def extract_keywords_textrank(text, num_keywords=5):
    # Preprocess
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    # Tokenize and filter
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in punctuation and len(word) > 1]

    # Create co-occurrence matrix (window size = 2)
    word_index = {word: i for i, word in enumerate(set(filtered_tokens))}
    index_word = {i: word for word, i in word_index.items()}
    n = len(word_index)
    matrix = np.zeros((n, n))

    window_size = 2
    for i in range(len(filtered_tokens)):
        for j in range(i+1, min(i+window_size+1, len(filtered_tokens))):
            if filtered_tokens[i] in word_index and filtered_tokens[j] in word_index:
                idx1 = word_index[filtered_tokens[i]]
                idx2 = word_index[filtered_tokens[j]]
                matrix[idx1][idx2] += 1
                matrix[idx2][idx1] += 1  # Symmetric

    # Apply PageRank
    nx_graph = nx.from_numpy_array(matrix)
    scores = nx.pagerank(nx_graph)

    # Convert scores to (word, score) pairs and sort
    word_scores = [(index_word[idx], score) for idx, score in scores.items()]
    word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

    return word_scores[:num_keywords]

# Extract keywords using different methods
results = []
for i, doc in enumerate(documents):
    doc_results = {
        "document_id": i+1,
        "document_preview": doc[:100] + "...",
        "yake_keywords": extract_keywords_yake(doc),
        "textrank_keywords": extract_keywords_textrank(doc)
    }
    results.append(doc_results)

# Add TF-IDF results
tfidf_keywords = extract_keywords_tfidf(documents)
for i in range(len(results)):
    results[i]["tfidf_keywords"] = tfidf_keywords[i]

# Display results
for result in results:
    print(f"Document {result['document_id']}: {result['document_preview']}")
    print("YAKE keywords:", result['yake_keywords'])
    print("TF-IDF keywords:", result['tfidf_keywords'])
    print("TextRank keywords:", result['textrank_keywords'])
    print()

## Cross-Lingual Transfer Learning

Cross-lingual transfer learning leverages knowledge from one language to improve performance in another language

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load multilingual BERT model and tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load English sentiment dataset for training
dataset = load_dataset("imdb", split="train").shuffle(seed=42).select(range(2000))
test_dataset = load_dataset("imdb", split="test").shuffle(seed=42).select(range(500))

# Tokenize English data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Prepare Spanish test examples (small sample for illustration)
spanish_texts = [
    "Esta película fue maravillosa. Me encantó cada momento.",  # Positive
    "El actor principal fue excelente en su papel.",  # Positive
    "Esta película fue terrible. No me gustó nada.",  # Negative
    "La trama fue aburrida y los personajes mal desarrollados."  # Negative
]
spanish_labels = [1, 1, 0, 0]  # 1 for positive, 0 for negative

# Tokenize Spanish texts
spanish_encodings = tokenizer(spanish_texts, padding="max_length", truncation=True, max_length=128)
spanish_dataset = {
    "input_ids": spanish_encodings["input_ids"],
    "attention_mask": spanish_encodings["attention_mask"],
    "labels": spanish_labels
}

# Define evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Setup training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir="./logs",
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

# Fine-tune model on English data
trainer.train()

# Evaluate on English test set
english_results = trainer.evaluate()
print("English evaluation results:", english_results)

# Evaluate on Spanish examples (zero-shot cross-lingual transfer)
from torch.utils.data import Dataset

class SimpleDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create Spanish dataset
spanish_eval_dataset = SimpleDataset(
    {"input_ids": spanish_encodings["input_ids"],
     "attention_mask": spanish_encodings["attention_mask"]},
    spanish_labels
)

# Evaluate on Spanish data
spanish_trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
)

spanish_results = spanish_trainer.evaluate(spanish_eval_dataset)
print("Spanish evaluation results (zero-shot transfer):", spanish_results)

## Text Style Transfer

Text style transfer transforms text from one style (formal, informal, etc.) to another while preserving content.


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Example function for simple formality transfer using T5
def transfer_formality(text, target_style):
    prefix = f"transfer to {target_style}: "
    input_text = prefix + text

    # Tokenize and generate
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        input_ids=input_ids,
        max_length=150,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # Decode output
    transferred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return transferred_text

# Example texts
texts = [
    "Hey what's up? Can u help me with this problem?",
    "I ain't got no time for this stuff!",
    "Dear Sir/Madam, I am writing to express my sincere gratitude.",
    "The aforementioned document requires your immediate attention."
]

# Transfer examples
results = []
for text in texts:
    formal = transfer_formality(text, "formal")
    informal = transfer_formality(text, "informal")

    results.append({
        "original": text,
        "to_formal": formal,
        "to_informal": informal
    })

# Display results
for result in results:
    print(f"Original: {result['original']}")
    print(f"Formal: {result['to_formal']}")
    print(f"Informal: {result['to_informal']}")
    print()

## Knowledge Graphs from Text

Knowledge graphs extract structured relationships from unstructured text

In [None]:
import spacy
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Example texts about technology companies
texts = [
    "Microsoft was founded by Bill Gates and Paul Allen in 1975. The company is headquartered in Redmond, Washington.",
    "Apple Inc. was established by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976. Apple is based in Cupertino, California.",
    "Google was founded by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University.",
    "Facebook was created by Mark Zuckerberg at Harvard University in 2004. The company is headquartered in Menlo Park, California."
]

# Extract entities and relationships
def extract_relations(texts):
    relations = []

    for text in texts:
        doc = nlp(text)

        # Extract named entities
        entities = {ent.text: ent.label_ for ent in doc.ents}

        # Extract subject-verb-object triples
        for sent in doc.sents:
            for token in sent:
                if token.dep_ == "nsubj" and token.head.pos_ == "VERB":
                    subject = token.text
                    verb = token.head.text

                    # Find objects of the verb
                    for child in token.head.children:
                        if child.dep_ in ["dobj", "pobj", "attr"]:
                            obj = child.text

                            # Add the triple
                            relations.append({
                                "subject": subject,
                                "predicate": verb,
                                "object": obj,
                                "sentence": sent.text
                            })

        # Extract special relationships like founded_by, based_in
        for ent in doc.ents:
            if ent.label_ == "ORG":  # Organization
                org = ent.text

                # Look for founders
                if "found" in text.lower() or "establish" in text.lower() or "creat" in text.lower():
                    for person_ent in doc.ents:
                        if person_ent.label_ == "PERSON" and person_ent.text != org:
                            relations.append({
                                "subject": org,
                                "predicate": "founded_by",
                                "object": person_ent.text,
                                "sentence": text
                            })

                # Look for locations (headquarters)
                if "headquarter" in text.lower() or "based in" in text.lower():
                    for loc_ent in doc.ents:
                        if loc_ent.label_ in ["GPE", "LOC"] and loc_ent.text != org:
                            relations.append({
                                "subject": org,
                                "predicate": "headquartered_in",
                                "object": loc_ent.text,
                                "sentence": text
                            })

    return relations

# Extract relations from texts
relations = extract_relations(texts)

# Create a knowledge graph
G = nx.DiGraph()

# Add edges to the graph
for relation in relations:
    G.add_edge(relation["subject"], relation["object"], label=relation["predicate"])

# Display the relations
df = pd.DataFrame(relations)
print(df)

# Visualize the knowledge graph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)


# Draw nodes and edges
nx.draw(G, pos, with_labels=True, node_color="skyblue", node_size=2000, font_size=10, font_weight="bold")

# Draw edge labels
edge_labels = {(u, v): d["label"] for u, v, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=8)

plt.title("Knowledge Graph of Tech Companies")
plt.axis("off")
plt.tight_layout()
plt.show()

## Semantic Role Labeling

Semantic Role Labeling (SRL) identifies the semantic relationships between predicates and arguments in sentences.

In [None]:
import nltk
from nltk.corpus import framenet as fn
import spacy
from allennlp.predictors.predictor import Predictor

# Download necessary resources
nltk.download('framenet_v17')

# Load AllenNLP SRL predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

# Example sentences
sentences = [
    "The chef cooked the pasta for dinner.",
    "The scientist discovered a new species in the rainforest.",
    "The company donated $5,000 to the local charity."
]

# Apply SRL to each sentence
for sentence in sentences:
    result = predictor.predict(sentence=sentence)
    print(f"Sentence: {sentence}")

    # Print the verb and its arguments
    verbs = result.get("verbs", [])
    for verb_info in verbs:
        print(f"Verb: {verb_info['verb']}")
        print(f"Tagged: {verb_info['description']}")
    print()

## Emotion Detection


Emotion detection identifies emotional states expressed in text

In [None]:
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt

# Load emotion classifier
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")

# Example sentences with different emotions
sentences = [
    "I'm so happy to see you after all these years!",
    "I'm completely devastated by the news of the accident.",
    "The constant delays at the airport made me furious.",
    "I was terrified when I heard the strange noise at night.",
    "I'm feeling anxious about the upcoming presentation.",
    "The movie left me surprised with its unexpected twist.",
    "I feel disgusted by the way they treated their employees."
]

# Classify emotions
results = []
for sentence in sentences:
    emotion = emotion_classifier(sentence)[0]
    results.append({
        "text": sentence,
        "emotion": emotion["label"],
        "score": emotion["score"]
    })

# Create DataFrame
df = pd.DataFrame(results)
print(df)

# Visualize results
plt.figure(figsize=(10, 6))
colors = plt.cm.tab10(range(len(sentences)))

df_sorted = df.sort_values("score", ascending=False)
plt.barh(df_sorted["text"], df_sorted["score"], color=colors)
plt.xlabel("Confidence Score")
plt.ylabel("Text")
plt.title("Emotion Detection Results")

# Add emotion labels to bars
for i, emotion in enumerate(df_sorted["emotion"]):
    plt.text(0.01, i, f" {emotion}", va="center", fontweight="bold")

plt.tight_layout()
plt.show()

## Aspect-Based Sentiment Analysis with Fine-Tuning


Here's how to fine-tune a model for aspect-based sentiment analysis

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from datasets import Dataset

# Create synthetic dataset for aspect-based sentiment analysis
data = [
    {"text": "The food was delicious but the service was slow.",
     "aspect": "food", "sentiment": "positive"},
    {"text": "The food was delicious but the service was slow.",
     "aspect": "service", "sentiment": "negative"},
    {"text": "The camera quality is excellent but the battery life is terrible.",
     "aspect": "camera", "sentiment": "positive"},
    {"text": "The camera quality is excellent but the battery life is terrible.",
     "aspect": "battery", "sentiment": "negative"},
    {"text": "The room was spacious but the Wi-Fi connection was unstable.",
     "aspect": "room", "sentiment": "positive"},
    {"text": "The room was spacious but the Wi-Fi connection was unstable.",
     "aspect": "Wi-Fi", "sentiment": "negative"},
    # Add more examples here
]

# Convert to DataFrame
df = pd.DataFrame(data)

# Create labels
sentiment_map = {"positive": 0, "neutral": 1, "negative": 2}
df["label"] = df["sentiment"].map(sentiment_map)

# Prepare input format: "text [SEP] aspect"
df["input_text"] = df["text"] + " [SEP] " + df["aspect"]

# Split data
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Load tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./absa_model",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

# Fine-tune model
trainer.train()

# Test on new examples
new_examples = [
    {"text": "The screen resolution is amazing but the price is too high.", "aspect": "screen"},
    {"text": "The screen resolution is amazing but the price is too high.", "aspect": "price"},
    {"text": "The hotel location was perfect for sightseeing but the breakfast was mediocre.", "aspect": "location"},
    {"text": "The hotel location was perfect for sightseeing but the breakfast was mediocre.", "aspect": "breakfast"}
]

# Process new examples
for ex in new_examples:
    input_text = ex["text"] + " [SEP] " + ex["aspect"]
    encoded_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**encoded_input)
        predictions = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()

    sentiment_labels = {0: "positive", 1: "neutral", 2: "negative"}
    print(f"Text: {ex['text']}")
    print(f"Aspect: {ex['aspect']}")
    print(f"Predicted sentiment: {sentiment_labels[predicted_class]}")
    print(f"Confidence: {predictions[0][predicted_class].item():.4f}")
    print()

## Text Clustering

Text clustering groups similar documents together based on their content

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Example document collection
documents = [
    # Technology
    "Machine learning models require large amounts of training data.",
    "Neural networks have revolutionized computer vision tasks.",
    "Cloud computing enables scalable and flexible IT infrastructure.",
    "Quantum computing promises to solve complex problems efficiently.",
    "Blockchain technology ensures transparent and secure transactions.",

    # Health
    "Regular exercise reduces the risk of cardiovascular disease.",
    "A balanced diet is essential for maintaining good health.",
    "Adequate sleep is crucial for mental and physical wellbeing.",
    "Vaccines help prevent the spread of infectious diseases.",
    "Stress management techniques improve overall health outcomes.",

    # Environment
    "Renewable energy sources reduce carbon emissions significantly.",
    "Climate change poses serious threats to global ecosystems.",
    "Sustainable agriculture practices preserve soil quality.",
    "Ocean pollution endangers marine biodiversity worldwide.",
    "Deforestation contributes to habitat loss and species extinction."
]

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(documents)

# Apply K-Means clustering
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(X)

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=2)
dbscan_labels = dbscan.fit_predict(X)

# Dimensionality reduction for visualization
pca = PCA(n_components=2)
X_dense = X.toarray()
X_pca = pca.fit_transform(X_dense)

# t-SNE for better visualization
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_dense)

# Create DataFrame with results
df = pd.DataFrame({
    'document': documents,
    'kmeans_cluster': kmeans_labels,
    'dbscan_cluster': dbscan_labels,
    'pca_x': X_pca[:, 0],
    'pca_y': X_pca[:, 1],
    'tsne_x': X_tsne[:, 0],
    'tsne_y': X_tsne[:, 1]
})

# Print cluster assignments
print("K-Means Clustering Results:")
for cluster in range(n_clusters):
    print(f"\nCluster {cluster}:")
    cluster_docs = df[df['kmeans_cluster'] == cluster]['document'].values
    for doc in cluster_docs:
        print(f"- {doc}")

# Visualize clusters with PCA
plt.figure(figsize=(12, 5))

# K-Means with PCA
plt.subplot(1, 2, 1)
for cluster in range(n_clusters):
    cluster_data = df[df['kmeans_cluster'] == cluster]
    plt.scatter(cluster_data['pca_x'], cluster_data['pca_y'], label=f'Cluster {cluster}')

plt.title('Document Clustering with K-Means and PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()

# K-Means with t-SNE
plt.subplot(1, 2, 2)
for cluster in range(n_clusters):
    cluster_data = df[df['kmeans_cluster'] == cluster]
    plt.scatter(cluster_data['tsne_x'], cluster_data['tsne_y'], label=f'Cluster {cluster}')

plt.title('Document Clustering with K-Means and t-SNE')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()

plt.tight_layout()
plt.show()

## Text Coherence Evaluation

Text coherence evaluation assesses how well the sentences in a text connect and flow together.


In [None]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Example texts with different coherence levels
texts = [
    # High coherence
    """Climate change is a major global challenge. Rising temperatures are causing ice caps to melt.
    This melting leads to rising sea levels, which threaten coastal communities worldwide.
    Scientists agree that reducing carbon emissions is essential to address this issue.""",

    # Medium coherence
    """Climate change is a major global challenge. Many species of birds migrate south for winter.
    Rising sea levels threaten coastal communities worldwide.
    Scientists agree that reducing carbon emissions is essential.""",

    # Low coherence
    """Climate change is a major global challenge. The new restaurant opened downtown last week.
    The basketball team won the championship.
    Scientists agree that reducing carbon emissions is essential."""
]

def evaluate_coherence(text):
    # Parse text
    doc = nlp(text)

    # Split into sentences
    sentences = list(doc.sents)

    if len(sentences) < 2:
        return {"coherence_score": 1.0, "sentence_similarities": []}

    # Get sentence embeddings
    embeddings = [sent.vector for sent in sentences]

    # Calculate pairwise similarities between adjacent sentences
    similarities = []
    for i in range(len(embeddings) - 1):
        sim = cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0]
        similarities.append(sim)

    # Calculate overall coherence score (average of similarities)
    coherence_score = np.mean(similarities)

    return {
        "coherence_score": coherence_score,
        "sentence_similarities": similarities
    }

# Evaluate each text
results = []
for i, text in enumerate(texts):
    coherence_data = evaluate_coherence(text)

    # Add to results
    results.append({
        "text_id": i+1,
        "coherence_level": ["Low", "Medium", "High"][i],  # For demonstration purposes
        "text_preview": text[:50] + "...",
        "coherence_score": coherence_data["coherence_score"],
        "sentence_similarities": coherence_data["sentence_similarities"]
    })

# Create DataFrame
df = pd.DataFrame(results)
print(df[["text_id", "coherence_level", "text_preview", "coherence_score"]])

# Visualize coherence scores
plt.figure(figsize=(10, 6))
plt.bar(df["coherence_level"], df["coherence_score"], color=["red", "yellow", "green"])
plt.title("Text Coherence Scores")
plt.xlabel("Coherence Level")
plt.ylabel("Score")
plt.ylim(0, 1)

for i, score in enumerate(df["coherence_score"]):
    plt.text(i, score + 0.05, f"{score:.2f}", ha='center')

plt.tight_layout()
plt.show()

## Explainable NLP with LIME

Using LIME (Local Interpretable Model-agnostic Explanations) to explain text classification decisions:

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from lime.lime_text import LimeTextExplainer
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt

# Load data
categories = ['alt.atheism', 'soc.religion.christian']
train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# Train classifier
pipeline = make_pipeline(
    TfidfVectorizer(max_features=1000),
    LogisticRegression(random_state=42)
)
pipeline.fit(train.data, train.target)

# Create explainer
explainer = LimeTextExplainer(class_names=categories)

# Get test instance
idx = 1
test_instance = test.data[idx]
true_class = test.target[idx]
true_class_name = categories[true_class]

# Make prediction
pred_probas = pipeline.predict_proba([test_instance])[0]
pred_class = np.argmax(pred_probas)
pred_class_name = categories[pred_class]
confidence = pred_probas[pred_class]

print(f"Document: {test_instance[:300]}...")
print(f"True class: {true_class_name}")
print(f"Predicted class: {pred_class_name} with confidence {confidence:.4f}")

# Explain prediction
exp = explainer.explain_instance(test_instance, pipeline.predict_proba, num_features=10)

# Display explanation
print("\nExplanation:")
for feature, weight in exp.as_list():
    print(f"{feature}: {weight:.4f}")

# Plot explanation
plt.figure(figsize=(10, 6))
exp.as_pyplot_figure()
plt.title(f"LIME Explanation\nPrediction: {pred_class_name} (conf: {confidence:.2f})")
plt.tight_layout()
plt.show()

## Document Similarity with Sentence Transformers

Using Sentence Transformers for robust document similarity

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example documents
documents = [
    # Technology
    "Machine learning models require large amounts of training data.",
    "Neural networks have revolutionized computer vision tasks.",
    "Deep learning algorithms use multiple layers of neural networks.",

    # Health
    "Regular exercise reduces the risk of cardiovascular disease.",
    "A balanced diet is essential for maintaining good health.",
    "Physical activity improves overall health and wellbeing.",

    # Finance
    "Stock market investors analyze company performance and trends.",
    "Investment portfolios should be diversified to reduce risk.",
    "Financial planning involves budgeting and saving for future goals."
]

# Calculate embeddings
embeddings = model.encode(documents)

# Calculate similarity matrix
similarity_matrix = util.cos_sim(embeddings, embeddings)

# Convert to numpy array for easier manipulation
similarity_matrix = similarity_matrix.numpy()

# Create DataFrame for visualization
df_similarity = pd.DataFrame(similarity_matrix,
                             index=[f"Doc {i+1}" for i in range(len(documents))],
                             columns=[f"Doc {i+1}" for i in range(len(documents))])

# Create heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df_similarity, annot=True, cmap='YlGnBu', vmin=0, vmax=1)
plt.title('Document Similarity Matrix (Sentence Transformers)')
plt.tight_layout()
plt.show()

# Find most similar document pairs
np.fill_diagonal(similarity_matrix, 0)  # Exclude self-similarity
for i in range(len(documents)):
    max_sim_idx = np.argmax(similarity_matrix[i])
    if i < max_sim_idx:  # Avoid duplicate pairs
        print(f"Similar pair found:")
        print(f"Document {i+1}: {documents[i]}")
        print(f"Document {max_sim_idx+1}: {documents[max_sim_idx]}")
        print(f"Similarity: {similarity_matrix[i][max_sim_idx]:.4f}")
        print()

## Bias Detection in NLP Models

Detecting and measuring bias in NLP models

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Template sentences with masked tokens
templates = [
    "The {gender} works as a {profession}.",
    "{gender} are known to be {trait}.",
    "The {gender} enjoys {activity} in their free time."
]

# Word pairs to test
gender_terms = [("man", "woman"), ("boy", "girl"), ("father", "mother"), ("he", "she")]
professions = ["doctor", "nurse", "engineer", "teacher", "scientist", "assistant"]
traits = ["intelligent", "emotional", "rational", "ambitious", "caring", "aggressive"]
activities = ["sports", "shopping", "reading", "cooking", "gaming", "art"]

def get_mask_predictions(template, replacements):
    results = []

    for gender_pair in gender_terms:
        row = {"gender_pair": f"{gender_pair[0]}/{gender_pair[1]}"}

        for term in replacements:
            # Create sentences with mask token
            male_sent = template.format(gender=gender_pair[0], **{template.split("{")[2].split("}")[0]: "[MASK]"})
            female_sent = template.format(gender=gender_pair[1], **{template.split("{")[2].split("}")[0]: "[MASK]"})

            # Replace the term placeholder with [MASK]
            male_sent = male_sent.replace("[MASK]", term)
            female_sent = female_sent.replace("[MASK]", term)

            # Get token IDs for male sentence
            male_inputs = tokenizer(male_sent, return_tensors="pt")
            male_token_ids = male_inputs["input_ids"]
            male_outputs = model(**male_inputs)
            male_predictions = male_outputs.logits

            # Get token IDs for female sentence
            female_inputs = tokenizer(female_sent, return_tensors="pt")
            female_token_ids = female_inputs["input_ids"]
            female_outputs = model(**female_inputs)
            female_predictions = female_outputs.logits

            # Get probabilities
            male_probs = torch.softmax(male_predictions[0], dim=-1)
            female_probs = torch.softmax(female_predictions[0], dim=-1)

            # Get probability for the target term
            term_id = tokenizer.encode(term, add_special_tokens=False)[0]
            male_prob = male_probs[male_token_ids[0] == term_id, term_id].item()
            female_prob = female_probs[female_token_ids[0] == term_id, term_id].item()

            # Calculate bias score (difference in probabilities)
            bias_score = male_prob - female_prob

            # Add to results
            row[term] = bias_score

        results.append(row)

    return pd.DataFrame(results)

# Analyze profession bias
template = templates[0]
profession_bias = get_mask_predictions(template, professions)
print("Profession Bias:")
print(profession_bias)

# Analyze trait bias
template = templates[1]
trait_bias = get_mask_predictions(template, traits)
print("\nTrait Bias:")
print(trait_bias)

# Visualize profession bias
plt.figure(figsize=(12, 8))
data = profession_bias.melt(id_vars=["gender_pair"], var_name="profession", value_name="bias")
sns.barplot(x="profession", y="bias", hue="gender_pair", data=data)
plt.title("Gender Bias in Profession Associations")
plt.xlabel("Profession")
plt.ylabel("Bias Score (positive = male bias, negative = female bias)")
plt.axhline(y=0, color='r', linestyle='-', alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Question Generation

Automatically generating questions from text passages

from transformers import T5ForConditionalGeneration, T5Tokenizer
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd

nltk.download('punkt')

# Load model and tokenizer
model_name = "mrm8488/t5-base-finetuned-question-generation-ap"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Sample passages
passages = [
    """Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction
    between computers and humans through natural language. The ultimate goal of NLP is to enable computers
    to understand, interpret and generate human language in a valuable way.""",
    
    """The solar system consists of the Sun and everything that orbits around it, including planets,
    moons, asteroids, comets and meteoroids. There are eight planets in our solar system: Mercury, Venus,
    Earth, Mars, Jupiter, Saturn, Uranus and Neptune."""
]

def generate_questions(passage, num_questions=3):
    # Split into sentences
    sentences = sent_tokenize(passage)
    
    questions = []
    for sentence in sentences[:num_questions]:  # Limit to avoid too many questions
        # Prepare input for T5
        input_text = f"generate question: {sentence}"
        
        # Tokenize input
        encoding = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        
        # Generate question
        outputs = model.generate(
            encoding.input_ids,
            max_length=64,
            num_beams=4,
            early_stopping=True
        )
        
        # Decode output
        question = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        questions.append({
            "context": sentence,
            "generated_question": question
        })
    
    return questions

# Generate questions for each passage
all_questions = []
for i, passage in enumerate(passages):
    print(f"\nPassage {i+1}:")
    print(passage)
    print("\nGenerated Questions:")
    
    questions = generate_questions(passage)
    for q in questions:
        print(f"Context: {q['context']}")
        print(f"Question: {q['generated_question']}")
        print()
        
        # Add to collection
        q["passage_id"] = i + 1
        all_questions.append(q)

# Display as DataFrame
df_questions = pd.DataFrame(all_questions)
print(df_questions)