<a href="https://colab.research.google.com/github/Chanda-Bhavesh/NLP-Assignment/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1) Correct the Search Query
import re
import json
import zlib
from difflib import get_close_matches

def load_corpus():
    """Build the built-in vocabulary and return it in compressed form.

    The vocabulary is the union of a fixed list of query words and a fixed
    list of country names. It is serialised as a JSON array and compressed
    with zlib.

    Returns:
        bytes: zlib-compressed, encoded JSON array of the vocabulary words.
    """
    query_words = (
        "going", "to", "china", "who", "was", "the", "first", "president",
        "of", "india", "winner", "match", "food", "in", "america",
    )
    countries = ("india", "china", "usa", "america")

    # A set collapses the words that appear in both lists.
    vocabulary = {*query_words, *countries}

    serialised = json.dumps(list(vocabulary))
    return zlib.compress(serialised.encode())

def decompress_corpus(compressed_corpus):
    """Turn a compressed corpus back into a set of words.

    Args:
        compressed_corpus: zlib-compressed bytes holding an encoded JSON array
            of words, such as the value returned by ``load_corpus``.

    Returns:
        set: The words contained in the JSON array.
    """
    raw_json = zlib.decompress(compressed_corpus).decode()
    word_list = json.loads(raw_json)
    return set(word_list)

def correct_word(word, corpus):
    """Return the closest corpus word to ``word``, or ``word`` itself.

    Args:
        word: The token to correct.
        corpus: Iterable of known words to match against.

    Returns:
        str: The single best fuzzy match with a similarity of at least 0.8,
        or the unchanged ``word`` when nothing is that close.
    """
    candidates = get_close_matches(word, corpus, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

def segment_text(text, corpus):
    """Re-insert missing spaces so that ``text`` becomes a run of corpus words.

    The segmentation with the fewest words is returned. Among two-word
    splits the one with the shortest first word wins, which matches what the
    earlier two-word-only implementation produced.

    Args:
        text: A single token that may be several words glued together.
        corpus: Container of known words supporting ``in`` (e.g. a set).

    Returns:
        str: The words separated by single spaces, or ``text`` unchanged when
        it is empty, is already a corpus word, or cannot be fully segmented.
    """
    # A token that is already a valid word must not be broken apart
    # (e.g. "into" must not become "in to").
    if not text or text in corpus:
        return text

    length = len(text)
    # best[end] holds the shortest word list covering text[:end], or None
    # when that prefix cannot be covered by corpus words.
    best = [None] * (length + 1)
    best[0] = []

    for end in range(1, length + 1):
        for start in range(end):
            prefix_words = best[start]
            if prefix_words is None:
                continue
            piece = text[start:end]
            if piece not in corpus:
                continue
            # Strict comparison keeps the earliest split point on ties.
            if best[end] is None or len(prefix_words) + 1 < len(best[end]):
                best[end] = prefix_words + [piece]

    if not best[length]:
        return text
    return " ".join(best[length])

def correct_query(query, corpus):
    """Correct every whitespace-separated token of ``query``.

    Each token is first passed to ``correct_word``. If that leaves the token
    unchanged, ``segment_text`` is tried on it instead.

    Args:
        query: The raw search query.
        corpus: Collection of known words, forwarded to both helpers.

    Returns:
        str: The corrected tokens joined by single spaces.
    """
    def repair(token):
        candidate = correct_word(token, corpus)
        if candidate != token:
            return candidate
        # Fuzzy matching changed nothing, so look for missing spaces.
        return segment_text(token, corpus)

    return " ".join(repair(token) for token in query.split())

def main():
    """Read a count and that many queries from stdin; print the corrections.

    All queries are read before any of them is corrected, and the corrected
    queries are written in one newline-joined ``print`` call.
    """
    vocabulary = decompress_corpus(load_corpus())

    total = int(input())
    pending = []
    for _ in range(total):
        pending.append(input().strip())

    print("\n".join(correct_query(line, vocabulary) for line in pending))


if __name__ == "__main__":
    main()

In [None]:
# 2) Deterministic Url and HashTag Segmentation
import re

def load_words(file_path):
    """Load the dictionary words stored one per line in ``file_path``.

    Each line is stripped of surrounding whitespace and lower-cased. Blank
    lines are skipped, so a file holding only whitespace yields an empty set
    instead of a set containing the empty string.

    Args:
        file_path: Path of the word list, read as UTF-8 text.

    Returns:
        set: The distinct words. An empty set is returned when the file does
        not exist, after an error message has been printed.
    """
    try:
        # An explicit encoding keeps the result independent of the platform's
        # default locale; iterating the handle avoids loading a list of lines.
        with open(file_path, "r", encoding="utf-8") as file:
            entries = (line.strip().lower() for line in file)
            return {entry for entry in entries if entry}
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return set()

def segment_string(input_string, words):
    """Segment ``input_string`` into dictionary words and numbers.

    At every position the longest matching prefix is tried first; shorter
    prefixes are tried only when the remainder cannot be segmented. A prefix
    is accepted when it is in ``words`` or is an integer/decimal number.

    Args:
        input_string: The text to split (no separators expected).
        words: Container of valid words supporting ``in``.

    Returns:
        list: The segments in order, or ``[input_string]`` when the string is
        empty or cannot be segmented completely.
    """
    number = re.compile(r"^\d+(\.\d+)?$")
    total = len(input_string)
    # Start offsets already known to have no valid segmentation. Recording
    # them means each offset is explored at most once, which removes the
    # exponential blow-up of plain backtracking on unsegmentable input.
    dead_ends = set()

    def helper(start):
        if start == total:
            return []
        if start in dead_ends:
            return None
        for end in range(total, start, -1):
            prefix = input_string[start:end]
            if prefix in words or number.match(prefix):
                rest = helper(end)
                if rest is not None:
                    return [prefix] + rest
        dead_ends.add(start)
        return None

    result = helper(0)
    return result if result else [input_string]

def clean_input(input_string):
    """Strip URL and hashtag decoration from ``input_string``.

    Removes, in this order: a leading ``www.``, one trailing domain suffix
    from a fixed list (``.com``, ``.co.uk``, ``.in``, ...), and a leading
    ``#``. The ``www.`` prefix and the suffix are matched case-insensitively
    because callers lower-case the text only after cleaning.

    Args:
        input_string: A domain name or hashtag.

    Returns:
        str: The input without the decoration; letter case is left as given.
    """
    if input_string[:4].lower() == "www.":
        input_string = input_string[4:]

    # The dot inside "co.uk" is escaped so that only a literal ".co.uk" is
    # removed, not any character between "co" and "uk".
    input_string = re.sub(
        r"\.(com|net|org|edu|gov|us|ru|cn|tv|co\.uk|co|io|in)$",
        "",
        input_string,
        flags=re.IGNORECASE,
    )

    if input_string.startswith("#"):
        input_string = input_string[1:]
    return input_string

def main():
    """Segment every input line using the dictionary in ``words.txt``.

    Nothing is read from stdin when the dictionary comes back empty.
    Otherwise a count and that many lines are read, each line is cleaned,
    lower-cased and segmented, and the results are printed newline-joined.
    """
    dictionary = load_words("words.txt")
    if not dictionary:
        return

    count = int(input())
    raw_lines = [input().strip() for _ in range(count)]

    print("\n".join(
        " ".join(segment_string(clean_input(raw).lower(), dictionary))
        for raw in raw_lines
    ))


if __name__ == "__main__":
    main()

In [None]:
# 3) Disambiguation: Mouse vs Mouse
import re

def disambiguate_mouse(sentence):
    """Classify which sense of "mouse" a sentence is about.

    The sentence is lower-cased and scanned for cue words. Animal cues are
    checked first and win whenever one is present; computer cues are checked
    next. A sentence without any cue is classified as the animal.

    A cue must start at a word boundary, so inflected forms such as
    "clicked" or "tails" still count while unrelated words that merely
    contain a cue ("details", "retail") do not.

    Args:
        sentence: The sentence to classify.

    Returns:
        str: ``"animal"`` or ``"computer-mouse"``.
    """
    animal_keywords = ['tail', 'squeak', 'genome', 'postnatal', 'environmental', 'temperature', 'scurried', 'rodent']
    computer_mouse_keywords = ['input device', 'click', 'cursor', 'select', 'pointer', 'scroll', 'drag']

    sentence = sentence.lower()

    def mentions_any(keywords):
        # Anchor only the start of the keyword so suffixes remain allowed.
        return any(
            re.search(r"\b" + re.escape(keyword), sentence)
            for keyword in keywords
        )

    if mentions_any(animal_keywords):
        return "animal"
    if mentions_any(computer_mouse_keywords):
        return "computer-mouse"
    return "animal"

def main():
    """Read a count and that many sentences; print one label per sentence.

    Each label is printed as soon as its sentence has been read.
    """
    remaining = int(input())
    while remaining > 0:
        print(disambiguate_mouse(input().strip()))
        remaining -= 1


if __name__ == "__main__":
    main()

In [None]:
# 4) Language Detection
import re
from collections import Counter

# Frequent function words for each supported language, keyed by language name.
# detect_language adds one point per input token found in a language's list.
language_stopwords = {
    "English": ["the", "and", "is", "in", "it", "of", "to", "a", "that", "with"],
    "French": ["le", "la", "les", "et", "de", "un", "une", "à", "que", "pour"],
    "German": ["der", "die", "das", "und", "in", "zu", "mit", "von", "ist", "das"],
    "Spanish": ["el", "la", "los", "las", "y", "en", "de", "que", "con", "para"]
}

# Frequent character sequences (two or three letters long) for each language,
# keyed by the same language names as language_stopwords.
language_ngrams = {
    "English": ["th", "he", "in", "an", "nd", "on", "es", "is", "at", "or"],
    "French": ["le", "de", "on", "ent", "es", "que", "un", "re", "nt", "en"],
    "German": ["er", "en", "ch", "die", "der", "zu", "sch", "ist", "te", "und"],
    "Spanish": ["la", "el", "de", "en", "es", "que", "se", "un", "con", "los"]
}

def clean_text(text):
    """Drop every non-ASCII character from ``text`` and lower-case the rest.

    Args:
        text: Arbitrary input text.

    Returns:
        str: The remaining ASCII characters (code points 0-127), lower-cased.
    """
    ascii_only = "".join(ch for ch in text if ord(ch) <= 0x7F)
    return ascii_only.lower()

def get_ngrams(text, n=2):
    """List every run of ``n`` consecutive characters in ``text``.

    Args:
        text: The string to slide over.
        n: Length of each character n-gram (default 2).

    Returns:
        list: The n-grams in order of appearance; empty when ``text`` is
        shorter than ``n``.
    """
    grams = []
    for start in range(len(text) - n + 1):
        grams.append(text[start:start + n])
    return grams

def compare_ngrams(text_ngrams, language_ngrams):
    """Score how strongly the text's n-grams overlap a language profile.

    Args:
        text_ngrams: The n-grams extracted from the text (with repeats).
        language_ngrams: The n-grams that characterise one language.

    Returns:
        int: For every n-gram present on both sides, the product of its
        number of occurrences in each, summed over all shared n-grams.
    """
    observed = Counter(text_ngrams)
    reference = Counter(language_ngrams)
    return sum(
        occurrences * reference[gram]
        for gram, occurrences in observed.items()
        if gram in reference
    )

def detect_language(text):
    """Guess which supported language ``text`` is written in.

    Each language's score is its n-gram overlap (``compare_ngrams``) plus
    one point per token that appears in the language's stop-word list.

    The text is cut into n-grams of every length that occurs in
    ``language_ngrams``, so the three-letter profile entries contribute to
    the score as well as the two-letter ones.

    Args:
        text: The text to classify.

    Returns:
        str: The name of the highest-scoring language. On a tie the language
        listed first in ``language_ngrams`` is returned.
    """
    cleaned_text = clean_text(text)
    tokens = cleaned_text.split()

    # Extracting only bigrams would leave the trigram entries unmatchable.
    gram_sizes = sorted({
        len(gram)
        for profile in language_ngrams.values()
        for gram in profile
    })
    text_ngrams = [
        gram
        for size in gram_sizes
        for gram in get_ngrams(cleaned_text, size)
    ]

    scores = {}
    for language, ngrams in language_ngrams.items():
        stopwords = set(language_stopwords.get(language, []))
        score = compare_ngrams(text_ngrams, ngrams)
        score += sum(1 for word in tokens if word in stopwords)
        scores[language] = score

    return max(scores, key=scores.get)

def main():
    """Read text from stdin and print the detected language.

    Lines are collected until the first blank line or end of input. Each
    collected line is followed by one space in the text that is classified.
    """
    collected = []
    try:
        while True:
            current = input().strip()
            if not current:
                break
            collected.append(current)
    except EOFError:
        pass

    text = "".join(piece + " " for piece in collected)
    print(detect_language(text))


if __name__ == "__main__":
    main()

In [None]:
# 5) The Missing Apostrophes
import re

def fix_apostrophes(text):
    """Restore apostrophes that were dropped from contractions and possessives.

    Whole words found in a fixed lookup table are replaced by their
    apostrophised form in a single pass. A word is replaced when it is
    written in lower case or with a leading capital ("dont", "Dont"); a
    leading capital is carried over to the replacement. All-caps words such
    as "ID" are left alone.

    Args:
        text: The text to repair.

    Returns:
        str: ``text`` with the known words rewritten; everything else is
        returned unchanged.
    """
    # NOTE: "its", "ill", "id", "wed" and "lets" are also ordinary English
    # words; they are rewritten unconditionally, as before.
    contractions = {
        "dont": "don't", "cant": "can't", "wont": "won't", "isnt": "isn't",
        "arent": "aren't", "wasnt": "wasn't", "werent": "weren't",
        "hasnt": "hasn't", "havent": "haven't", "hadnt": "hadn't",
        "couldnt": "couldn't", "shouldnt": "shouldn't",
        "wouldnt": "wouldn't", "didnt": "didn't", "doesnt": "doesn't",
        "im": "I'm", "ive": "I've", "id": "I'd", "ill": "I'll",
        "youve": "you've", "youd": "you'd", "youre": "you're",
        "youll": "you'll", "theyve": "they've", "theyd": "they'd",
        "theyre": "they're", "theyll": "they'll", "weve": "we've",
        "wed": "we'd", "shes": "she's", "hes": "he's", "hed": "he'd",
        "its": "it's", "theres": "there's", "whats": "what's",
        "heres": "here's", "lets": "let's", "thats": "that's",
    }
    # Only forms that are never a valid plural are listed here. Ambiguous
    # plurals such as "voters" or "candidates" are left untouched.
    possessives = {"partys": "party's"}
    replacements = {**contractions, **possessives}

    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in replacements) + r")\b",
        re.IGNORECASE,
    )

    def restore(match):
        found = match.group(0)
        key = found.lower()
        fixed = replacements.get(key)
        # Skip odd casings (e.g. "ID") and exotic case-insensitive matches.
        if fixed is None or found not in (key, key.capitalize()):
            return found
        if found[0].isupper():
            fixed = fixed[0].upper() + fixed[1:]
        return fixed

    return pattern.sub(restore, text)

# Sample passage whose contractions are written without apostrophes
# (for example "hed", "Ive" and "wont").
text = """At a news conference Thursday at the Russian manned-space facility in Baikonur, Kazakhstan, Kornienko said "we will be missing nature, we will be missing landscapes, woods." He admitted that on his previous trip into space in 2010 "I even asked our psychological support folks to send me a calendar with photographs of nature, of rivers, of woods, of lakes."
Kelly was asked if hed miss his twin brother Mark, who also was an astronaut.
"Were used to this kind of thing," he said. "Ive gone longer without seeing him and it was great."
The mission wont be the longest time that a human has spent in space - four Russians spent a year or more aboard the Soviet-built Mir space station in the 1990s."""

# Run the apostrophe repair on the sample passage.
fixed_text = fix_apostrophes(text)

# Print the repaired passage to stdout.
print(fixed_text)

In [None]:
# 6) Segment the Twitter Hashtags

import re


# Vocabulary that hashtag pieces are checked against (all lower case).
COMMON_WORDS = {
    "we", "are", "the", "people", "mention", "your", "faves", "now",
    "playing", "walking", "dead", "follow", "me",
}


def is_word(word):
    """Return True when ``word``, lower-cased, is in ``COMMON_WORDS``."""
    lowered = word.lower()
    return lowered in COMMON_WORDS


def split_hashtag(hashtag):
    """Split ``hashtag`` into known words separated by spaces.

    Prefixes are processed from shortest to longest. For each prefix the
    earliest start offset is kept such that the text before it is already
    segmentable and the text from it to the prefix end satisfies ``is_word``.

    Args:
        hashtag: The hashtag text to segment.

    Returns:
        str: The space-joined words, or ``hashtag`` unchanged when it cannot
        be segmented completely. An empty hashtag yields an empty string.
    """
    length = len(hashtag)
    # previous_cut[end] is where the last word of hashtag[:end] starts, or
    # None when that prefix has no segmentation.
    previous_cut = [None] * (length + 1)
    previous_cut[0] = 0

    for end in range(1, length + 1):
        previous_cut[end] = next(
            (
                start
                for start in range(end)
                if previous_cut[start] is not None
                and is_word(hashtag[start:end])
            ),
            None,
        )

    if previous_cut[length] is None:
        return hashtag

    # Walk the recorded cut points from the end back to the beginning.
    pieces = []
    end = length
    while end > 0:
        start = previous_cut[end]
        pieces.insert(0, hashtag[start:end])
        end = start
    return " ".join(pieces)


def main():
    """Read a count and that many hashtags; print each one segmented.

    All hashtags are read before the first result is printed.
    """
    total = int(input())
    tags = []
    for _ in range(total):
        tags.append(input().strip())

    for tag in tags:
        print(split_hashtag(tag))


if __name__ == "__main__":
    main()

In [None]:
# 7) Expand the Acronyms
import re

# Short words that may sit inside an expansion without contributing a letter
# to the acronym (e.g. the "and" in "National Aeronautics and Space ...").
_CONNECTOR_WORDS = frozenset(
    {"a", "an", "and", "for", "in", "of", "on", "the", "to"}
)


def _trim_to_initials(candidate, acronym):
    """Return the trailing words of ``candidate`` whose initials spell ``acronym``.

    Digits in the acronym are ignored. Returns None when no such run of
    trailing words exists.
    """
    words = candidate.split()
    letters = [char for char in acronym.upper() if char.isalpha()]

    def find_start(word_index, letter_index):
        # Walk backwards through the words, consuming acronym letters from
        # the last one to the first.
        if letter_index < 0:
            return word_index + 1
        if word_index < 0:
            return None
        word = words[word_index]
        if word[0].upper() == letters[letter_index]:
            start = find_start(word_index - 1, letter_index - 1)
            if start is not None:
                return start
        if word.lower() in _CONNECTOR_WORDS:
            return find_start(word_index - 1, letter_index)
        return None

    start = find_start(len(words) - 1, len(letters) - 1)
    if start is None:
        return None
    return " ".join(words[start:])


def extract_acronyms_and_expansions(snippets):
    """Extract acronyms and their expansions from the provided snippets.

    An acronym is an upper-case letter followed by one or more upper-case
    letters or digits. For each acronym, the words directly in front of it
    (optionally with the acronym in parentheses) are captured. When the tail
    of those words has initials that spell the acronym, only that tail is
    kept as the expansion; otherwise the whole captured text is kept.

    An expansion confirmed by its initials is never replaced by an
    unconfirmed one from a later snippet.

    Args:
        snippets: Iterable of text snippets.

    Returns:
        dict: Maps each acronym that has preceding words to its expansion.
    """
    acronym_dict = {}
    confirmed = set()

    for snippet in snippets:
        matches = re.findall(r'\b([A-Z][A-Z0-9]+)\b|\(([A-Z][A-Z0-9]+)\)', snippet)
        # dict.fromkeys removes repeats while keeping first-seen order.
        for acronym in dict.fromkeys(first or second for first, second in matches):
            pattern = rf"([\w\s]+)\s+\(?{re.escape(acronym)}\)?"
            expansion_match = re.search(pattern, snippet)
            if not expansion_match:
                continue

            candidate = expansion_match.group(1).strip()
            trimmed = _trim_to_initials(candidate, acronym)
            if trimmed is not None:
                acronym_dict[acronym] = trimmed
                confirmed.add(acronym)
            elif acronym not in confirmed:
                acronym_dict[acronym] = candidate

    return acronym_dict

def main():
    """Read snippets and queries from stdin and print each query's expansion.

    Input is a count ``n``, then ``n`` snippet lines, then ``n`` query lines.
    A query without a known expansion prints ``Not Found``.
    """
    total = int(input().strip())

    snippets = []
    for _ in range(total):
        snippets.append(input().strip())

    queries = []
    for _ in range(total):
        queries.append(input().strip())

    expansions = extract_acronyms_and_expansions(snippets)
    for query in queries:
        print(expansions.get(query, "Not Found"))


if __name__ == "__main__":
    main()

In [None]:
# 9) A Text-Processing Warmup
import re

def count_articles_and_dates(text):
    """Count the articles "a", "an", "the" and the dates in ``text``.

    Articles are matched as whole words, ignoring case. Dates are counted in
    one left-to-right pass, so no part of the text is counted twice.
    Recognised date forms:

    * numeric: ``15/11/2012``, ``15/11/12``
    * day first: ``15th March 1999``, ``15 Mar 99``, ``20th of March, 1999``
    * month first: ``March 15th, 1999`` (four-digit year)

    The month must be an English month name or a common abbreviation, so
    phrases such as "3 apples 12" are not counted as dates.

    Args:
        text: The text fragment to analyse.

    Returns:
        tuple: ``(count_a, count_an, count_the, count_dates)``.
    """
    month = (
        r"(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
        r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?)"
    )
    ordinal = r"(?:st|nd|rd|th)?"

    numeric_date = r"\d{1,2}/\d{1,2}/\d{2,4}"
    day_first = r"\d{1,2}" + ordinal + r"\s+(?:of\s+)?" + month + r",?\s+\d{2,4}"
    month_first = month + r"\s+\d{1,2}" + ordinal + r",?\s+\d{4}"

    date_regex = re.compile(
        r"\b(?:" + numeric_date + "|" + day_first + "|" + month_first + r")\b",
        re.IGNORECASE,
    )

    def count_word(word):
        return len(re.findall(r"\b" + word + r"\b", text, re.IGNORECASE))

    count_a = count_word("a")
    count_an = count_word("an")
    count_the = count_word("the")
    count_dates = sum(1 for _ in date_regex.finditer(text))

    return count_a, count_an, count_the, count_dates

def main():
    """Read text fragments from stdin and print four counts per fragment.

    The first line is the number of fragments. Each fragment line is followed
    by one more line that is read and discarded. Reading stops early at end
    of input. A fragment already read is still counted. For every fragment
    the counts of "a", "an", "the" and dates are printed, one per line.
    """
    try:
        total = int(input().strip())
    except EOFError:
        return

    fragments = []
    try:
        for _ in range(total):
            fragments.append(input().strip())
            input()  # read and discard the line that follows each fragment
    except EOFError:
        pass

    counts = []
    for fragment in fragments:
        articles_a, articles_an, articles_the, dates = count_articles_and_dates(fragment)
        counts += [articles_a, articles_an, articles_the, dates]

    for value in counts:
        print(value)


if __name__ == "__main__":
    main()

In [None]:
# 10) Who is it?
import re

def resolve_pronouns(text, entities):
    """
    Resolves highlighted pronouns in the text to corresponding entities.

    The text is split into sentences at '.', '!' and '?'. Every entity found
    in a sentence receives a fresh, strictly increasing mention number,
    ordered by where it last appears in that sentence. Each pronoun written
    as ``**pronoun**`` is then resolved to the candidate with the highest
    mention number, i.e. the most recently mentioned one.

    Candidates depend on the pronoun:
        * he/his/him/she/her: mentioned entities containing a capitalised
          word (an upper-case letter followed by lower-case letters).
        * it/its: mentioned entities without such a word.
        * anything else: all entities, mentioned or not.

    Entities of the current sentence are registered before its pronouns are
    resolved, so an entity that follows the pronoun in the same sentence is
    also a candidate.

    Parameters:
        text (str): The input text containing highlighted pronouns.
        entities (list): List of possible entities.

    Returns:
        list: Resolved entities for each highlighted pronoun, in text order;
        "Unknown" where no candidate exists.
    """
    pronoun_pattern = r'\*\*(.*?)\*\*'
    capitalised_word = re.compile(r'\b[A-Z][a-z]+\b')
    personal_pronouns = {'he', 'his', 'him', 'she', 'her'}
    impersonal_pronouns = {'it', 'its'}

    last_mention = {}
    # A running counter keeps mention numbers unique. Reusing the dict size
    # would hand the same number to different entities once one of them is
    # mentioned again.
    mention_counter = 0
    resolved_entities = []

    for sentence in re.split(r'[.!?]', text):
        present = [entity for entity in entities if entity in sentence]
        for entity in sorted(present, key=sentence.rfind):
            last_mention[entity] = mention_counter
            mention_counter += 1

        for match in re.finditer(pronoun_pattern, sentence):
            pronoun = match.group(1).lower()

            if pronoun in personal_pronouns:
                candidates = [e for e in last_mention if capitalised_word.search(e)]
            elif pronoun in impersonal_pronouns:
                candidates = [e for e in last_mention if not capitalised_word.search(e)]
            else:
                candidates = entities

            if candidates:
                resolved_entities.append(
                    max(candidates, key=lambda e: last_mention.get(e, -1))
                )
            else:
                resolved_entities.append("Unknown")

    return resolved_entities


def main():
    """Read a text and its candidate entities; print one entity per pronoun.

    Input is a line count ``n``, then ``n`` lines of text, then one line of
    entities separated by ';'. The text lines are joined with single spaces
    before the highlighted pronouns are resolved.
    """
    n = int(input().strip())
    text_lines = [input().strip() for _ in range(n)]

    # Whitespace around each entity is removed so that "Alice; Bob" yields
    # "Bob" rather than " Bob". Empty entries (e.g. after a trailing ';') are
    # dropped because an empty string is contained in every sentence.
    entities = [
        entity.strip()
        for entity in input().strip().split(';')
        if entity.strip()
    ]

    text = " ".join(text_lines)

    for result in resolve_pronouns(text, entities):
        print(result)


if __name__ == "__main__":
    main()