In [1]:
pip install nltk



In [2]:
import nltk
nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [3]:
from nltk.corpus import brown
news_text = brown.words(categories='news')
print(news_text)


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [4]:
print(type(news_text))

<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>


In [5]:
import re
from nltk.corpus.reader.util import ConcatenatedCorpusView


if isinstance(news_text, ConcatenatedCorpusView):

    news_text_string = ' '.join(news_text)
else:
    print("news_text is not a ConcatenatedCorpusView.")

In [6]:
news_text_string



Q1 . Regular Expressions

In [7]:
import re

text = """
Hi there,
I wanted to reach out regarding our upcoming project on land restoration. You can contact me anytime at Earthrenewal.AI@gmail.com for more details. Additionally, feel free to share any documents or resources related to soil erosion by sending them to the same email address.
Best regards,
Nizamudin
"""


email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
emails = re.findall(email_pattern, text)


print(emails)


['Earthrenewal.AI@gmail.com']


In [8]:
import re


text = """
Hi there,
I wanted to reach out regarding our upcoming project on land restoration. You can contact me anytime at Earthrenewal.AI@gmail.com or call me at 03133362105 for more details. Additionally, you can visit our website at Earthrenewal.com for more information. Feel free to share any documents or resources related to soil erosion by sending them to the provided email address.
Best regards,
Nizamudin
"""


phone_pattern = r'\b\d{11}\b'
phones = re.findall(phone_pattern, text)


print(phones)


['03133362105']


In [9]:

url_pattern = r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,6}\b'
urls = re.findall(url_pattern, text)

print(urls)


['Earthrenewal.AI', 'gmail.com', 'Earthrenewal.com']


In [10]:
import re


text = """
Hi there,
I wanted to reach out regarding our upcoming project on land restoration. You can contact me anytime at Earthrenewal.AI@gmail.com or call me at 03133362105 for more details. Additionally, you can visit our website at Earthrenewal.com for more information. The project is expected to start on 15/09/2024 and should be completed by 12/12/2024. We also had a meeting on March 3, 2024, which was very productive.
Best regards,
Nizamudin
"""


date_pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b|\b\w+\s\d{1,2},\s\d{4}\b'


dates = re.findall(date_pattern, text)


print(dates)


['15/09/2024', '12/12/2024', 'March 3, 2024']


In [12]:
import re


text = """
Hi there,
I wanted to reach out regarding our upcoming project on land restoration. You can contact me anytime at Earthrenewal.AI@gmail.com or call me at 03133362105 for more details. Additionally, you can visit our website at Earthrenewal.com for more information. Don't forget to follow our updates on #EarthRestoration and #SoilHealth. Best regards, Nizamudin
"""


url_pattern = r'\b(?:https?://)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,6}\b'


text_with_placeholder = re.sub(url_pattern, '[URL]', text)

print(text_with_placeholder)



Hi there,
I wanted to reach out regarding our upcoming project on land restoration. You can contact me anytime at [URL]@[URL] or call me at 03133362105 for more details. Additionally, you can visit our website at [URL] for more information. Don't forget to follow our updates on #EarthRestoration and #SoilHealth. Best regards, Nizamudin



In [13]:

split_text = re.split(r'[,\.\n]+', text)


split_text = [s.strip() for s in split_text if s.strip()]

print(split_text)


['Hi there', 'I wanted to reach out regarding our upcoming project on land restoration', 'You can contact me anytime at Earthrenewal', 'AI@gmail', 'com or call me at 03133362105 for more details', 'Additionally', 'you can visit our website at Earthrenewal', 'com for more information', "Don't forget to follow our updates on #EarthRestoration and #SoilHealth", 'Best regards', 'Nizamudin']


In [14]:

email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'


emails = re.findall(email_pattern, text)


print(emails)


['Earthrenewal.AI@gmail.com']


In [15]:

hashtag_pattern = r'#[A-Za-z0-9_]+'


hashtags = re.findall(hashtag_pattern, text)


print(hashtags)


['#EarthRestoration', '#SoilHealth']


Q.2 Text Normalization

In [16]:
news_text_string



In [17]:
lowercase_text = news_text_string.lower()

print(lowercase_text)



In [18]:
import string


text_no_punctuation = lowercase_text.translate(str.maketrans('', '', string.punctuation))

print(text_no_punctuation)




In [19]:
tokens = text_no_punctuation.split()

print(tokens)




Q.3 Edit Distance

In [20]:
pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: r

In [21]:
import Levenshtein


s1 = "kitten"
s2 = "sitting"


distance = Levenshtein.distance(s1, s2)

print(f"The Levenshtein distance between '{s1}' and '{s2}' is {distance}.")


The Levenshtein distance between 'kitten' and 'sitting' is 3.


Q.4 N-gram Language Models and Smoothing

In [22]:
from collections import defaultdict

def generate_ngrams(text, n):

    words = text.split()


    ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]

    return ngrams


text = "I love natural language processing and AI."
bigrams = generate_ngrams(text, 2)
print("Bigrams:", bigrams)

trigrams = generate_ngrams(text, 3)
print("Trigrams:", trigrams)


Bigrams: [('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing'), ('processing', 'and'), ('and', 'AI.')]
Trigrams: [('I', 'love', 'natural'), ('love', 'natural', 'language'), ('natural', 'language', 'processing'), ('language', 'processing', 'and'), ('processing', 'and', 'AI.')]


In [23]:
class NgramLanguageModel:
    def __init__(self, n):
        self.n = n
        self.ngrams = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocabulary = set()

    def train(self, text):

        ngrams = generate_ngrams(text, self.n)

        for ngram in ngrams:
            self.ngrams[ngram] += 1
            context = ngram[:-1]
            self.context_counts[context] += 1
            self.vocabulary.update(ngram)

    def probability(self, ngram):
        context = ngram[:-1]
        count_ngram = self.ngrams[ngram] + 1
        count_context = self.context_counts[context] + len(self.vocabulary)

        return count_ngram / count_context

def sentence_probability(model, sentence):
    ngrams = generate_ngrams(sentence, model.n)
    prob = 1.0

    for ngram in ngrams:
        prob *= model.probability(ngram)

    return prob


text = "I love natural language processing and AI."
model = NgramLanguageModel(n=2)
model.train(text)

sentence = "I love AI."
prob = sentence_probability(model, sentence)
print(f"Probability of the sentence '{sentence}': {prob}")


Probability of the sentence 'I love AI.': 0.03125
