## 1. Language Modeling with N-grams

In [6]:
import random
from nltk import download
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import gutenberg

In [8]:
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
# Download Shakespeare texts from NLTK
download('gutenberg')
shakespeare_texts = ['shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt']
shakespeare_corpus = [gutenberg.words(text) for text in shakespeare_texts]
shakespeare_training_text = ' '.join([' '.join(text) for text in shakespeare_corpus])

# Tokenize the training text
tokens = word_tokenize(shakespeare_training_text)


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [10]:
# Function to generate n-grams
def generate_ngrams(text, n):
    ngrams_list = ngrams(text, n)
    return list(ngrams_list)


In [45]:
# Generate trigrams from the training text
trigrams = generate_ngrams(tokens, 3)


In [46]:
# Create a frequency distribution of trigrams
freq_dist = FreqDist(trigrams)
freq_dist

FreqDist({("'", 'd', ','): 115, ('.', 'Exeunt', '.'): 71, ('?', 'Ham', '.'): 62, ('my', 'Lord', ','): 60, (',', 'my', 'Lord'): 59, ("'", 'th', "'"): 55, ('Lord', 'Ham', '.'): 44, (',', 'and', 'the'): 43, ('Ham', '.', 'I'): 42, ('my', 'Lord', 'Ham'): 40, ...})

In [48]:
# Sample testing text
testing_text = "To be or not to be, that is the question."

# Tokenize the testing text
test_tokens = word_tokenize(testing_text)
test_tokens

['To',
 'be',
 'or',
 'not',
 'to',
 'be',
 ',',
 'that',
 'is',
 'the',
 'question',
 '.']

In [49]:
# Function to predict the next word using the trained N-gram model
def predict_next_word(context, n):
    context_ngram = tuple(context[-n:])
    possible_next_words = [ngram[-1] for ngram in freq_dist if ngram[:-1] == context_ngram]
    if not possible_next_words:
        return None
    return random.choice(possible_next_words)


In [55]:
# Test the N-gram model
context = ['To', 'your']  # Using the first two words as context
ngram_order = 3  # Using trigrams

for _ in range(5):  # Generate 5 words
    next_word = predict_next_word(context, ngram_order)
    if next_word:
        print(" ".join(context), "->", next_word)
        context.append(next_word)
    else:
        break

To your -> proceeding


## 2. Text Classification using Naive Bayes & Logistic Regression

In [61]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [62]:
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))


In [64]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

# Convert the raw text into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)


In [59]:
# Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_counts, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_counts)


In [60]:
# Evaluate the performance of the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.62


In [66]:
# Train the Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_counts, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_counts)

In [67]:
# Evaluate the performance of the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.67


## 3. TF-IDF for Information Retrieval

In [68]:
import nltk
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

In [69]:
# Download the Reuters corpus from nltk
nltk.download('reuters')

# Load Reuters news articles as documents
documents = [reuters.raw(file_id) for file_id in reuters.fileids()]

# Tokenize the documents into sentences
tokenized_documents = [sent_tokenize(doc) for doc in documents]

# Flatten the list of sentences to create chunks
chunks = [sentence for doc in tokenized_documents for sentence in doc]


[nltk_data] Downloading package reuters to /root/nltk_data...


In [70]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(chunks)

In [71]:
def retrieve_most_similar(input_text, tfidf_matrix, tfidf_vectorizer, documents):
    # Transform the input text using the same vectorizer
    input_vector = tfidf_vectorizer.transform([input_text])

    # Calculate cosine similarity between the input vector and the document vectors
    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)

    # Get the index of the most similar chunk
    most_similar_index = similarity_scores.argmax()

    # Retrieve the most similar chunk of text from the documents
    most_similar_text = documents[most_similar_index]

    return most_similar_text


In [72]:
# Example input text
input_text = "Economic growth and trade policies impact global markets."

# Retrieve the most similar chunk of text
result = retrieve_most_similar(input_text, tfidf_matrix, tfidf_vectorizer, chunks)

# Print the result
print("Input Text:")
print(input_text)
print("\nMost Similar Chunk:")
print(result)

Input Text:
Economic growth and trade policies impact global markets.

Most Similar Chunk:
Conable said the World Bank has been pressing developing
  countries to open their markets, arguing that a free trading
  environment increased the possibility of global economic
  growth.


## 3. CRF for Named Entity Recognition

In [74]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.3.6


In [75]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [76]:
# Download the CoNLL 2003 dataset for NER
nltk.download('conll2002')

# Load the CoNLL 2002 dataset
from nltk.corpus import conll2002


[nltk_data] Downloading package conll2002 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2002.zip.


In [77]:
# Function to extract features from a sentence
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features


In [82]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for word, pos, label in sent]


In [83]:
# Load the CoNLL 2002 dataset
train_sents = list(conll2002.iob_sents('esp.train'))
test_sents = list(conll2002.iob_sents('esp.testb'))

# Extract features and labels
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


In [None]:
# Train the CRF model
crf = sklearn_crfsuite.CRF()
crf.fit(X_train, y_train)


In [93]:
# Make predictions on the test set
y_pred = crf.predict(X_test)

# Flatten the true labels and predicted labels
y_test_flat = [label for sent_labels in y_test for label in sent_labels]
y_pred_flat = [label for sent_labels in y_pred for label in sent_labels]

# Generate and print classification report
report = classification_report(y_test_flat, y_pred_flat)
print(report)

              precision    recall  f1-score   support

       B-LOC       0.79      0.76      0.77      1084
      B-MISC       0.71      0.47      0.57       339
       B-ORG       0.79      0.81      0.80      1400
       B-PER       0.82      0.84      0.83       735
       I-LOC       0.68      0.64      0.66       325
      I-MISC       0.62      0.55      0.58       557
       I-ORG       0.83      0.79      0.81      1104
       I-PER       0.89      0.94      0.92       634
           O       0.99      1.00      0.99     45355

    accuracy                           0.97     51533
   macro avg       0.79      0.75      0.77     51533
weighted avg       0.97      0.97      0.97     51533

