In [None]:
# Prints a fixed string; presumably a quick check that the kernel executes cells.
print("Soham")

Soham


In [None]:
!pip install spacy rapidfuzz nltk scikit-learn



In [None]:
# Download spaCy's small English pipeline; a later cell loads it with
# spacy.load("en_core_web_sm"). The installer output advises restarting the
# kernel/runtime afterwards if the package cannot be loaded.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
import spacy
import nltk
from collections import Counter
from rapidfuzz.distance import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer

# Initialize SpaCy model
nlp = spacy.load("en_core_web_sm")

# Download NLTK tokenizer data for nltk.word_tokenize.
# Newer NLTK releases read the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both here; tokenization then works on either
# version without relying on a download hidden in a later cell.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Four-line sample post shared by the demo cells. It contains a mention, a URL,
# a dash-separated date, a phone number, an email address, two hashtags and an
# emoji, so each extractor has something to find.
SAMPLE_TEXT = (
    "Hey @john_doe, check out https://example.com/event on 15-08-2025!\n"
    "Call me at +1-202-555-0183 or email jane.doe1990@mail.org ASAP.\n"
    "This product is damn awful!! #fail #WorstDayEver\n"
    "Big news: OpenAI opens new office in Paris. 🚀"
)


In [None]:
class InfoExtractor:
    """Regex-based extraction of common entities from a piece of text.

    Every ``extract_*`` method scans ``self.text`` and returns a list of the
    matched strings in order of appearance (duplicates are kept).
    """

    # Words reported by extract_offensive_words() when no list is supplied.
    DEFAULT_OFFENSIVE_WORDS = ('damn', 'awful')  # Add more as needed

    # Punctuation trimmed from both ends of a token before vocabulary lookup.
    _EDGE_PUNCTUATION = '.,;:!?"\'()[]'

    def __init__(self, text):
        """Store the text that all extractors operate on."""
        self.text = text

    def extract_urls(self):
        """Return http/https URLs without trailing sentence punctuation.

        ``\\S+`` greedily swallows a full stop or comma that ends the sentence,
        so those characters are stripped from the right of each match.
        """
        return [url.rstrip('.,;:!?')
                for url in re.findall(r'https?://\S+', self.text)]

    def extract_dates(self):
        """Return dates written as two digits, two digits, four digits, dash-separated."""
        return re.findall(r'\b\d{2}-\d{2}-\d{4}\b', self.text)

    def extract_emails(self):
        """Return email-like strings of the form ``local@domain.tld``."""
        return re.findall(r'\b[\w.-]+@[\w.-]+\.\w+\b', self.text)

    def extract_phone_numbers(self):
        """Return phone numbers shaped like ``+C-AAA-BBB-DDDD`` (``+`` optional)."""
        return re.findall(r'\+?\d{1,3}-\d{3}-\d{3}-\d{4}', self.text)

    def extract_mentions(self):
        """Return ``@handle`` mentions, ignoring the ``@domain`` part of emails.

        The negative lookbehind rejects an ``@`` that directly follows a
        character allowed in an email local part, so ``jane@mail.org`` no
        longer yields ``@mail``.
        """
        return re.findall(r'(?<![\w.+-])@\w+', self.text)

    def extract_hashtags(self):
        """Return ``#tag`` hashtags, ignoring URL fragments and HTML entities.

        The negative lookbehind rejects a ``#`` glued to a preceding word
        character, ``/`` or ``&`` (e.g. ``page#top``, ``/#top``, ``&#39;``).
        """
        return re.findall(r'(?<![\w/&])#\w+', self.text)

    def extract_offensive_words(self, offensive_words=None):
        """Return the offensive words found in the text, lower-cased and clean.

        Args:
            offensive_words: Optional iterable of words to look for
                (case-insensitive). Defaults to ``DEFAULT_OFFENSIVE_WORDS``.

        Returns:
            The matching words in order of appearance, with surrounding
            punctuation removed (``"awful!!"`` is reported as ``"awful"``).
        """
        if offensive_words is None:
            offensive_words = self.DEFAULT_OFFENSIVE_WORDS
        vocabulary = {word.lower() for word in offensive_words}
        cleaned_tokens = (token.strip(self._EDGE_PUNCTUATION)
                          for token in self.text.lower().split())
        return [token for token in cleaned_tokens if token in vocabulary]


In [None]:
# Run every regex extractor over SAMPLE_TEXT and print what each one found.
extractor = InfoExtractor(SAMPLE_TEXT)
print("✅ URLs:", extractor.extract_urls())
print("✅ Dates:", extractor.extract_dates())
print("✅ Emails:", extractor.extract_emails())
print("✅ Phone Numbers:", extractor.extract_phone_numbers())
print("✅ Mentions:", extractor.extract_mentions())
print("✅ Hashtags:", extractor.extract_hashtags())
print("✅ Offensive Words:", extractor.extract_offensive_words())


✅ URLs: ['https://example.com/event']
✅ Dates: ['15-08-2025']
✅ Emails: ['jane.doe1990@mail.org']
✅ Phone Numbers: ['+1-202-555-0183']
✅ Mentions: ['@john_doe', '@mail']
✅ Hashtags: ['#fail', '#WorstDayEver']
✅ Offensive Words: ['damn', 'awful!!']


In [None]:
class NgramModel:
    """N-gram counts and maximum-likelihood estimates over a lower-cased text.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        tokens: Tokens produced by ``nltk.word_tokenize`` on the lower-cased text.
        ngrams: Every n-gram tuple in the text, in order of occurrence.
        freq_dist: Counter mapping each n-gram tuple to its number of occurrences.
        prefix_counts: Counter mapping each (n-1)-token prefix to the number of
            n-grams that start with it.
    """

    def __init__(self, text, n):
        """Tokenize ``text`` and count its n-grams of order ``n``."""
        self.n = n
        self.tokens = nltk.word_tokenize(text.lower())
        self.ngrams = list(nltk.ngrams(self.tokens, n))
        self.freq_dist = Counter(self.ngrams)
        # Counted once here so each probability query is a dictionary lookup
        # instead of a scan over every n-gram.
        self.prefix_counts = Counter(ngram[:-1] for ngram in self.ngrams)

    def get_ngram_count(self, ngram):
        """Return how many times the n-gram tuple occurs (0 if unseen)."""
        return self.freq_dist[ngram]

    def get_mle_probability(self, ngram):
        """Return the MLE of the last token given the preceding ones.

        Computed as count(ngram) / count(n-grams sharing ngram's prefix).
        Returns 0.0 when the prefix never occurs, so the result is always a
        float.
        """
        prefix_count = self.prefix_counts[ngram[:-1]]
        if prefix_count == 0:
            return 0.0
        return self.freq_dist[ngram] / prefix_count


In [None]:
# Fetch the 'punkt_tab' resource before tokenizing; presumably the installed
# NLTK version needs it for nltk.word_tokenize (confirm against NLTK release).
import nltk
nltk.download('punkt_tab')

# Build unigram, bigram and trigram models over the same sample text.
uni = NgramModel(SAMPLE_TEXT, 1)
bi = NgramModel(SAMPLE_TEXT, 2)
tri = NgramModel(SAMPLE_TEXT, 3)

# Query tuples are lower-case because NgramModel lower-cases the text first.
print("Unigram count of ('the',):", uni.get_ngram_count(('the',)))
print("Bigram count of ('check', 'out'):", bi.get_ngram_count(('check', 'out')))
print("Trigram MLE P(('check', 'out', 'https')):", tri.get_mle_probability(('check', 'out', 'https')))


Unigram count of ('the',): 0
Bigram count of ('check', 'out'): 1
Trigram MLE P(('check', 'out', 'https')): 1.0


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def compute_levenshtein(str1, str2):
    """Return the Levenshtein distance between ``str1`` and ``str2``.

    Thin wrapper around ``rapidfuzz.distance.Levenshtein.distance``.
    """
    edit_distance = Levenshtein.distance(str1, str2)
    return edit_distance

# 🧪 Example
# The two posts differ only by the word "out" and one space (four characters).
post1 = "Hey @john_doe, check out the event!"
post2 = "Hey @john_doe, check the event!"

print("🔁 Levenshtein Distance:", compute_levenshtein(post1, post2))


🔁 Levenshtein Distance: 4


In [None]:
class LinguisticAnalyzer:
    """Expose noun chunks, verbs and named entities of a spaCy-parsed text."""

    def __init__(self, text):
        """Parse ``text`` with the module-level ``nlp`` pipeline and keep the result."""
        parsed = nlp(text)
        self.doc = parsed

    def get_noun_chunks(self):
        """Return the text of every noun chunk, in document order."""
        chunk_texts = []
        for span in self.doc.noun_chunks:
            chunk_texts.append(span.text)
        return chunk_texts

    def get_verbs(self):
        """Return the text of every token whose coarse POS tag is ``VERB``."""
        verb_texts = []
        for tok in self.doc:
            if tok.pos_ != "VERB":
                continue
            verb_texts.append(tok.text)
        return verb_texts

    def get_named_entities(self):
        """Return ``(entity text, entity label)`` pairs, in document order."""
        labelled = []
        for entity in self.doc.ents:
            labelled.append((entity.text, entity.label_))
        return labelled


In [None]:
# Parse SAMPLE_TEXT once with spaCy, then print the three linguistic views of it.
analyzer = LinguisticAnalyzer(SAMPLE_TEXT)

print("📌 Noun Chunks:", analyzer.get_noun_chunks())
print("📌 Verbs:", analyzer.get_verbs())
print("📌 Named Entities (NER):", analyzer.get_named_entities())


📌 Noun Chunks: ['me', '+1-202-555-0183 or email jane.doe1990@mail.org ASAP', 'This product', '#WorstDayEver\nBig news', 'OpenAI', 'new office', 'Paris']
📌 Verbs: ['check', 'Call', 'email', 'fail', 'opens']
📌 Named Entities (NER): [('15-08-2025', 'DATE'), ('jane.doe1990@mail.org ASAP', 'PERSON'), ('#fail #', 'MONEY'), ('Paris', 'GPE'), ('🚀', 'CARDINAL')]


In [None]:
# Recap of selected results; reuses the `extractor` and `analyzer` objects
# created in earlier cells, so those cells must have been run first.
print("\n=== Summary ===")
print("✅ URLs:", extractor.extract_urls())
print("✅ Emails:", extractor.extract_emails())
print("✅ Phone Numbers:", extractor.extract_phone_numbers())
print("✅ Hashtags:", extractor.extract_hashtags())
print("✅ NER:", analyzer.get_named_entities())
print("✅ Offensive:", extractor.extract_offensive_words())
print("✅ Levenshtein ('event', 'event!'):", compute_levenshtein("event", "event!"))



=== Summary ===
✅ URLs: ['https://example.com/event']
✅ Emails: ['jane.doe1990@mail.org']
✅ Phone Numbers: ['+1-202-555-0183']
✅ Hashtags: ['#fail', '#WorstDayEver']
✅ NER: [('15-08-2025', 'DATE'), ('jane.doe1990@mail.org ASAP', 'PERSON'), ('#fail #', 'MONEY'), ('Paris', 'GPE'), ('🚀', 'CARDINAL')]
✅ Offensive: ['damn', 'awful!!']
✅ Levenshtein ('event', 'event!'): 1
