In [1]:
!pip install nltk gensim scikit-learn matplotlib pandas numpy


Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# ============================================================================
# TASK 1: CUSTOM STOPWORD REMOVAL
# ============================================================================

import re
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Sample text for stopword removal
sample_text = """
Natural language processing is a fascinating field. It involves many techniques
like tokenization, stemming, and classification. The results can be very useful
for many applications in the real world.
"""

# Create custom stopword list (different from NLTK)
custom_stopwords = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with', 'can', 'very', 'like', 'many'
}

def preprocess_text(text, remove_stopwords=True):
    """Preprocess text with optional stopword removal"""
    # Convert to lowercase and remove punctuation
    text = re.sub(r'[^\w\s]', '', text.lower())
    words = text.split()

    if remove_stopwords:
        words = [word for word in words if word not in custom_stopwords]

    return words

# Compare with and without stopwords
print("TASK 1: Custom Stopword Removal")
print("="*50)
words_with_stopwords = preprocess_text(sample_text, remove_stopwords=False)
words_without_stopwords = preprocess_text(sample_text, remove_stopwords=True)

print("Original text:")
print(sample_text)
print(f"\nWith stopwords ({len(words_with_stopwords)} words):")
print(" ".join(words_with_stopwords))
print(f"\nWithout stopwords ({len(words_without_stopwords)} words):")
print(" ".join(words_without_stopwords))
print(f"Reduction: {len(words_with_stopwords) - len(words_without_stopwords)} words removed")


TASK 1: Custom Stopword Removal
Original text:

Natural language processing is a fascinating field. It involves many techniques 
like tokenization, stemming, and classification. The results can be very useful 
for many applications in the real world.


With stopwords (29 words):
natural language processing is a fascinating field it involves many techniques like tokenization stemming and classification the results can be very useful for many applications in the real world

Without stopwords (15 words):
natural language processing fascinating field involves techniques tokenization stemming classification results useful applications real world
Reduction: 14 words removed


In [2]:
# ============================================================================
# TASK 2: VOCABULARY BUILDER - TERM-DOCUMENT MATRIX
# ============================================================================

print("\n\nTASK 2: Vocabulary Builder - Term-Document Matrix")
print("="*50)

# 10 short sentences for vocabulary building
sentences = [
    "I love programming with Python",
    "Machine learning is very interesting",
    "Python is great for data science",
    "I enjoy working with data",
    "Machine learning uses Python frequently",
    "Data science requires programming skills",
    "Python programming is fun and useful",
    "I love data analysis and visualization",
    "Machine learning algorithms are powerful",
    "Programming with Python is efficient"
]

# Build term-document matrix
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
term_doc_matrix = vectorizer.fit_transform(sentences)

# Get feature names (vocabulary)
vocab = vectorizer.get_feature_names_out()

# Create DataFrame for better visualization
td_matrix_df = pd.DataFrame(
    term_doc_matrix.toarray(),
    columns=vocab,
    index=[f"Doc_{i+1}" for i in range(len(sentences))]
)

print("Term-Document Matrix:")
print(td_matrix_df)

print(f"\nVocabulary size: {len(vocab)}")
print(f"Total documents: {len(sentences)}")

# Word distribution analysis
word_frequencies = np.sum(term_doc_matrix.toarray(), axis=0)
word_freq_df = pd.DataFrame({
    'Word': vocab,
    'Frequency': word_frequencies
}).sort_values('Frequency', ascending=False)

print("\nTop 10 most frequent words:")
print(word_freq_df.head(10))




TASK 2: Vocabulary Builder - Term-Document Matrix
Term-Document Matrix:
        algorithms  analysis  data  efficient  enjoy  frequently  fun  great  \
Doc_1            0         0     0          0      0           0    0      0   
Doc_2            0         0     0          0      0           0    0      0   
Doc_3            0         0     1          0      0           0    0      1   
Doc_4            0         0     1          0      1           0    0      0   
Doc_5            0         0     0          0      0           1    0      0   
Doc_6            0         0     1          0      0           0    0      0   
Doc_7            0         0     0          0      0           0    1      0   
Doc_8            0         1     1          0      0           0    0      0   
Doc_9            1         0     0          0      0           0    0      0   
Doc_10           0         0     0          1      0           0    0      0   

        interesting  learning  ...  powerful 

In [3]:
# ============================================================================
# TASK 3: MINI SENTIMENT CLASSIFIER (NAIVE BAYES)
# ============================================================================

print("\n\nTASK 3: Mini Sentiment Classifier (Naive Bayes)")
print("="*50)

# Dataset of positive and negative reviews
positive_reviews = [
    "This movie is absolutely fantastic and amazing",
    "I loved every moment of this wonderful film",
    "Brilliant acting and excellent storyline",
    "Outstanding performance by all actors",
    "This is the best movie I have ever seen",
    "Incredible cinematography and great direction",
    "Perfect blend of action and emotion",
    "Highly recommend this masterpiece",
    "Exceptional quality and superb entertainment",
    "Amazing experience and wonderful journey"
]

negative_reviews = [
    "This movie is terrible and boring",
    "Worst film I have ever watched",
    "Poor acting and weak storyline",
    "Completely disappointed with this movie",
    "Waste of time and money",
    "Horrible direction and bad script",
    "Terrible performance by actors",
    "Boring and predictable plot",
    "Very disappointing and poorly made",
    "Awful movie with no entertainment value"
]

# Prepare training data
X_train = positive_reviews + negative_reviews
y_train = [1] * len(positive_reviews) + [0] * len(negative_reviews)  # 1 = positive, 0 = negative

# Vectorize the text
sentiment_vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train_vec = sentiment_vectorizer.fit_transform(X_train)

# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Test on unseen sentences
test_sentences = [
    "This is an excellent and wonderful movie",
    "Terrible acting and boring plot",
    "I really enjoyed this film",
    "Complete waste of time"
]

X_test_vec = sentiment_vectorizer.transform(test_sentences)
predictions = nb_classifier.predict(X_test_vec)
probabilities = nb_classifier.predict_proba(X_test_vec)

print("Sentiment Classification Results:")
for i, sentence in enumerate(test_sentences):
    sentiment = "Positive" if predictions[i] == 1 else "Negative"
    confidence = max(probabilities[i]) * 100
    print(f"'{sentence}' → {sentiment} (confidence: {confidence:.1f}%)")




TASK 3: Mini Sentiment Classifier (Naive Bayes)
Sentiment Classification Results:
'This is an excellent and wonderful movie' → Positive (confidence: 79.2%)
'Terrible acting and boring plot' → Negative (confidence: 95.8%)
'I really enjoyed this film' → Negative (confidence: 51.4%)
'Complete waste of time' → Negative (confidence: 81.7%)


In [5]:
# ============================================================================
# TASK 4: WORD2VEC MODEL TRAINING (Improved Version)
# ============================================================================

print("\n\nTASK 4: Word2Vec Model Training (Improved Version)")
print("="*60)

from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

# -------------------------
# Initial toy dataset
# -------------------------
initial_sentences = [
    ["dog", "is", "a", "loyal", "pet", "animal"],
    ["cat", "likes", "to", "sleep", "and", "play"],
    ["dog", "loves", "playing", "fetch", "outside"],
    ["cat", "enjoys", "hunting", "small", "mice"],
    ["pet", "animals", "bring", "joy", "to", "families"],
    ["dog", "barks", "when", "strangers", "approach"],
    ["cat", "purrs", "when", "happy", "and", "content"],
    ["animals", "need", "care", "and", "attention"]
]

# -------------------------
# Train initial model
# -------------------------
print("Training initial Word2Vec model...")
initial_model = Word2Vec(
    sentences=initial_sentences,
    vector_size=20,   # smaller size for tiny dataset
    window=5,         # larger context window
    min_count=1,
    workers=1,
    epochs=200        # more training passes
)

print("Most similar words to 'dog':")
try:
    dog_similar = initial_model.wv.most_similar('dog', topn=3)
    for word, score in dog_similar:
        print(f"  {word}: {score:.3f}")
except KeyError:
    print("  'dog' not found in vocabulary")

print("\nMost similar words to 'cat':")
try:
    cat_similar = initial_model.wv.most_similar('cat', topn=3)
    for word, score in cat_similar:
        print(f"  {word}: {score:.3f}")
except KeyError:
    print("  'cat' not found in vocabulary")

# -------------------------
# Additional sentences with puppy/kitten (extra reinforcement)
# -------------------------
additional_sentences = [
    ["puppy", "is", "a", "young", "dog"],
    ["kitten", "is", "a", "baby", "cat"],
    ["puppy", "loves", "to", "play", "and", "learn"],
    ["kitten", "enjoys", "milk", "and", "soft", "toys"],
    ["pet", "puppy", "brings", "happiness", "home"],
    ["animal", "kitten", "is", "very", "cute"],
    ["dog", "and", "puppy", "are", "best", "friends"],
    ["cat", "teaches", "kitten", "to", "hunt"],

    # Reinforcement sentences for stronger similarity
    ["a", "puppy", "grows", "into", "a", "dog"],
    ["a", "kitten", "grows", "into", "a", "cat"],
    ["puppy", "and", "dog", "play", "together"],
    ["kitten", "and", "cat", "sleep", "together"]
]

# -------------------------
# Incremental training on top of initial model
# -------------------------
print("\nUpdating model with puppy/kitten sentences...")
initial_model.build_vocab(additional_sentences, update=True)
initial_model.train(additional_sentences, total_examples=len(additional_sentences), epochs=200)

# -------------------------
# Results after expansion
# -------------------------
print("\nAfter adding puppy/kitten sentences:")
print("Most similar words to 'dog':")
try:
    dog_similar_new = initial_model.wv.most_similar('dog', topn=3)
    for word, score in dog_similar_new:
        print(f"  {word}: {score:.3f}")
except KeyError:
    print("  'dog' not found in vocabulary")

print("\nMost similar words to 'cat':")
try:
    cat_similar_new = initial_model.wv.most_similar('cat', topn=3)
    for word, score in cat_similar_new:
        print(f"  {word}: {score:.3f}")
except KeyError:
    print("  'cat' not found in vocabulary")

# -------------------------
# Check puppy-dog and kitten-cat similarity
# -------------------------
try:
    puppy_dog_sim = initial_model.wv.similarity('dog', 'puppy')
    kitten_cat_sim = initial_model.wv.similarity('cat', 'kitten')
    print(f"\nSimilarity between 'dog' and 'puppy': {puppy_dog_sim:.3f}")
    print(f"Similarity between 'cat' and 'kitten': {kitten_cat_sim:.3f}")
except KeyError as e:
    print(f"Error calculating similarity: {e}")




TASK 4: Word2Vec Model Training (Improved Version)
Training initial Word2Vec model...




Most similar words to 'dog':
  happy: 0.577
  when: 0.515
  attention: 0.512

Most similar words to 'cat':
  care: 0.508
  strangers: 0.368
  play: 0.328

Updating model with puppy/kitten sentences...

After adding puppy/kitten sentences:
Most similar words to 'dog':
  brings: 0.800
  is: 0.772
  puppy: 0.770

Most similar words to 'cat':
  brings: 0.811
  learn: 0.725
  pet: 0.718

Similarity between 'dog' and 'puppy': 0.770
Similarity between 'cat' and 'kitten': 0.669


In [6]:
# ============================================================================
# TASK 5: TEXT NORMALIZATION CHALLENGE
# ============================================================================

print("\n\nTASK 5: Text Normalization Challenge")
print("="*50)

# Install nltk for stemming/lemmatization (uncomment if needed)
# !pip install nltk
import nltk
try:
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.corpus import wordnet
    # Download required NLTK data (uncomment if running for first time)
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk_available = True
except:
    nltk_available = False
    print("NLTK not available, using basic normalization")

# Messy sentences with typos, punctuation, and mixed casing
messy_sentences = [
    "ThIs iS a VeRy MeSSy SeNTeNcE!!! with lots of PuNcTuAtIoN???",
    "programming   IS    really  INTERESTING,,,, but can be challenging...",
    "I LOvE   data science & machine learning!!!",
    "WHY are there SO many  SPACEs   and CAPS????"
]

def clean_text(text):
    """Comprehensive text cleaning function"""
    # Convert to lowercase
    text = text.lower()

    # Remove extra punctuation (keep only letters, numbers, spaces)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing spaces
    text = text.strip()

    return text

def tokenize_text(text):
    """Simple tokenization"""
    return text.split()

def stem_words(words):
    """Apply stemming to words"""
    if nltk_available:
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in words]
    else:
        # Simple suffix removal as fallback
        stemmed = []
        for word in words:
            if word.endswith('ing'):
                stemmed.append(word[:-3])
            elif word.endswith('ed'):
                stemmed.append(word[:-2])
            elif word.endswith('s') and len(word) > 3:
                stemmed.append(word[:-1])
            else:
                stemmed.append(word)
        return stemmed

def normalize_text_pipeline(text):
    """Complete text normalization pipeline"""
    print(f"Original: {text}")

    # Step 1: Clean text
    cleaned = clean_text(text)
    print(f"Cleaned: {cleaned}")

    # Step 2: Tokenize
    tokens = tokenize_text(cleaned)
    print(f"Tokenized: {tokens}")

    # Step 3: Stem
    stemmed = stem_words(tokens)
    print(f"Stemmed: {stemmed}")

    # Step 4: Reconstruct
    normalized = ' '.join(stemmed)
    print(f"Final: {normalized}")
    print("-" * 60)

    return normalized

print("Text Normalization Results:")
print("=" * 60)

normalized_texts = []
for sentence in messy_sentences:
    normalized = normalize_text_pipeline(sentence)
    normalized_texts.append(normalized)




TASK 5: Text Normalization Challenge


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Text Normalization Results:
Original: ThIs iS a VeRy MeSSy SeNTeNcE!!! with lots of PuNcTuAtIoN???
Cleaned: this is a very messy sentence with lots of punctuation
Tokenized: ['this', 'is', 'a', 'very', 'messy', 'sentence', 'with', 'lots', 'of', 'punctuation']
Stemmed: ['thi', 'is', 'a', 'veri', 'messi', 'sentenc', 'with', 'lot', 'of', 'punctuat']
Final: thi is a veri messi sentenc with lot of punctuat
------------------------------------------------------------
Original: programming   IS    really  INTERESTING,,,, but can be challenging...
Cleaned: programming is really interesting but can be challenging
Tokenized: ['programming', 'is', 'really', 'interesting', 'but', 'can', 'be', 'challenging']
Stemmed: ['program', 'is', 'realli', 'interest', 'but', 'can', 'be', 'challeng']
Final: program is realli interest but can be challeng
------------------------------------------------------------
Original: I LOvE   data science & machine learning!!!
Cleaned: i love data science machine learning

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [7]:
# ============================================================================
# TASK 6: FAKE NEWS HEADLINE DETECTOR (LOGISTIC REGRESSION)
# ============================================================================

print("\n\nTASK 6: Fake News Headline Detector")
print("="*50)

# Dataset of headlines (10 true, 10 fake)
true_headlines = [
    "Scientists discover new species of deep-sea fish",
    "Stock market closes higher after economic report",
    "New vaccine shows promising results in clinical trials",
    "City council approves budget for infrastructure improvements",
    "University researchers develop more efficient solar panels",
    "International trade agreement signed between two countries",
    "Weather service issues flood warning for coastal areas",
    "Technology company announces quarterly earnings report",
    "Archaeological team uncovers ancient artifacts in excavation",
    "Government releases new guidelines for public health"
]

fake_headlines = [
    "Aliens secretly control world governments reveals insider",
    "Miracle cure for all diseases hidden by pharmaceutical companies",
    "Time traveler from 2050 warns about upcoming disasters",
    "Scientists prove earth is actually flat using new technology",
    "Celebrities are actually reptilian shapeshifters says expert",
    "Government plans to replace all birds with surveillance drones",
    "Ancient pyramid discovered to be giant alien spaceship",
    "Drinking water from specific location grants immortality",
    "Secret society controls all world events through mind control",
    "Billionaire admits to being vampire in leaked recording"
]

# Prepare training data for fake news detection
X_news = true_headlines + fake_headlines
y_news = [1] * len(true_headlines) + [0] * len(fake_headlines)  # 1 = true, 0 = fake

# Vectorize headlines using TF-IDF
news_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=100)
X_news_vec = news_vectorizer.fit_transform(X_news)

# Split data for training and testing
X_train_news, X_test_news, y_train_news, y_test_news = train_test_split(
    X_news_vec, y_news, test_size=0.3, random_state=42, stratify=y_news
)

# Train Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train_news, y_train_news)

# Evaluate on test set
y_pred_news = lr_classifier.predict(X_test_news)
accuracy = accuracy_score(y_test_news, y_pred_news)

print(f"Fake News Detector Accuracy: {accuracy:.2f}")

# Test on new headlines
new_headlines = [
    "Researchers develop new method for cancer treatment",
    "Unicorns found living in government secret facility",
    "Company releases updated software with security improvements",
    "Moon landing was staged on Hollywood movie set"
]

X_new_vec = news_vectorizer.transform(new_headlines)
new_predictions = lr_classifier.predict(X_new_vec)
new_probabilities = lr_classifier.predict_proba(X_new_vec)

print("\nPredictions for new headlines:")
for i, headline in enumerate(new_headlines):
    label = "TRUE" if new_predictions[i] == 1 else "FAKE"
    confidence = max(new_probabilities[i]) * 100
    print(f"'{headline[:50]}...' → {label} (confidence: {confidence:.1f}%)")




TASK 6: Fake News Headline Detector
Fake News Detector Accuracy: 0.33

Predictions for new headlines:
'Researchers develop new method for cancer treatmen...' → TRUE (confidence: 57.2%)
'Unicorns found living in government secret facilit...' → FAKE (confidence: 55.5%)
'Company releases updated software with security im...' → TRUE (confidence: 55.1%)
'Moon landing was staged on Hollywood movie set...' → TRUE (confidence: 50.1%)


In [8]:
# ============================================================================
# TASK 7: BUILD A TINY SEARCH ENGINE
# ============================================================================

print("\n\nTASK 7: Tiny Search Engine with TF-IDF")
print("="*50)

# Small document collection
documents = [
    "Python is a powerful programming language for data science",
    "Machine learning algorithms can solve complex problems",
    "Data visualization helps understand patterns in datasets",
    "Natural language processing analyzes text and speech",
    "Deep learning models require large amounts of training data",
    "Statistical analysis provides insights from numerical data",
    "Computer vision processes and analyzes digital images",
    "Big data technologies handle massive information volumes"
]

class SimpleSearchEngine:
    def __init__(self, documents):
        self.documents = documents
        self.vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
        self.tfidf_matrix = self.vectorizer.fit_transform(documents)

    def search(self, query, top_k=3):
        """Search for most relevant documents"""
        # Transform query using the same vectorizer
        query_vec = self.vectorizer.transform([query])

        # Calculate cosine similarity
        from sklearn.metrics.pairwise import cosine_similarity
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()

        # Get top k most similar documents
        top_indices = similarities.argsort()[-top_k:][::-1]

        results = []
        for idx in top_indices:
            if similarities[idx] > 0:  # Only return relevant results
                results.append({
                    'document': self.documents[idx],
                    'score': similarities[idx],
                    'index': idx
                })

        return results

# Create search engine
search_engine = SimpleSearchEngine(documents)

# Test queries
test_queries = [
    "machine learning data",
    "image processing vision",
    "python programming",
    "statistical analysis"
]

print("Search Engine Results:")
print("=" * 60)

for query in test_queries:
    print(f"Query: '{query}'")
    results = search_engine.search(query)

    if results:
        for i, result in enumerate(results):
            print(f"  {i+1}. Score: {result['score']:.3f}")
            print(f"     Document: {result['document']}")
    else:
        print("  No relevant documents found")
    print("-" * 40)




TASK 7: Tiny Search Engine with TF-IDF
Search Engine Results:
Query: 'machine learning data'
  1. Score: 0.502
     Document: Machine learning algorithms can solve complex problems
  2. Score: 0.270
     Document: Deep learning models require large amounts of training data
  3. Score: 0.099
     Document: Python is a powerful programming language for data science
----------------------------------------
Query: 'image processing vision'
  1. Score: 0.304
     Document: Natural language processing analyzes text and speech
  2. Score: 0.296
     Document: Computer vision processes and analyzes digital images
----------------------------------------
Query: 'python programming'
  1. Score: 0.631
     Document: Python is a powerful programming language for data science
----------------------------------------
Query: 'statistical analysis'
  1. Score: 0.613
     Document: Statistical analysis provides insights from numerical data
----------------------------------------


In [9]:
# ============================================================================
# TASK 8: EMOJI PREDICTOR
# ============================================================================

print("\n\nTASK 8: Emoji Predictor")
print("="*50)

# Training data with sentences and corresponding emojis
emoji_training_data = [
    ("I am very happy today", "😊"),
    ("This is so exciting and amazing", "🎉"),
    ("I love this wonderful day", "❤️"),
    ("That was really scary", "😱"),
    ("I am so angry about this", "😠"),
    ("This makes me very sad", "😢"),
    ("I am feeling tired", "😴"),
    ("This is absolutely hilarious", "😂"),
    ("I am surprised by this news", "😮"),
    ("This food looks delicious", "🤤"),
    ("I am confused about this", "😕"),
    ("This weather is perfect", "☀️"),
    ("I am worried about tomorrow", "😟"),
    ("This party was fantastic", "🎊"),
    ("I feel proud of my achievement", "😌"),
    ("This situation is frustrating", "😤"),
    ("I am grateful for everything", "🙏"),
    ("This movie was boring", "😑"),
    ("I am excited for vacation", "✈️"),
    ("This music is great", "🎵")
]

# Prepare data
emoji_sentences = [item[0] for item in emoji_training_data]
emoji_labels = [item[1] for item in emoji_training_data]

# Create label encoder for emojis
from sklearn.preprocessing import LabelEncoder
emoji_encoder = LabelEncoder()
encoded_labels = emoji_encoder.fit_transform(emoji_labels)

# Vectorize sentences
emoji_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_emoji = emoji_vectorizer.fit_transform(emoji_sentences)

# Train classifier
emoji_classifier = MultinomialNB()
emoji_classifier.fit(X_emoji, encoded_labels)

# Test on new sentences
test_sentences_emoji = [
    "I feel extremely joyful",
    "This situation terrifies me",
    "I am very sleepy right now",
    "This joke is so funny",
    "The weather is beautiful today"
]

X_test_emoji = emoji_vectorizer.transform(test_sentences_emoji)
emoji_predictions = emoji_classifier.predict(X_test_emoji)
predicted_emojis = emoji_encoder.inverse_transform(emoji_predictions)

print("Emoji Prediction Results:")
for i, sentence in enumerate(test_sentences_emoji):
    print(f"'{sentence}' → {predicted_emojis[i]}")




TASK 8: Emoji Predictor
Emoji Prediction Results:
'I feel extremely joyful' → 😌
'This situation terrifies me' → 😤
'I am very sleepy right now' → ☀️
'This joke is so funny' → ☀️
'The weather is beautiful today' → ☀️


In [10]:
# ============================================================================
# TASK 9: AUTHOR STYLE DETECTION
# ============================================================================

print("\n\nTASK 9: Author Style Detection")
print("="*50)

# Shakespeare samples (simplified/modernized for demo)
shakespeare_samples = [
    "to be or not to be that is the question",
    "all the world is a stage and all men and women merely players",
    "what light through yonder window breaks it is the east",
    "fair is foul and foul is fair hover through fog and filthy air",
    "now is the winter of our discontent made glorious summer",
    "double double toil and trouble fire burn and cauldron bubble",
    "out out brief candle life is but walking shadow",
    "friends romans countrymen lend me your ears",
    "cowards die many times before their deaths valiant never taste death",
    "lord what fools these mortals be"
]

# Jane Austen samples (simplified)
austen_samples = [
    "it is truth universally acknowledged that single man in possession good fortune",
    "pride relates more to our opinion of ourselves vanity to what we would have others think",
    "happiness in marriage is entirely matter of chance",
    "there is nothing like staying at home for real comfort",
    "silly things do cease to be silly if they are done by sensible people",
    "one half of the world cannot understand pleasures of the other",
    "woman especially if she have misfortune of knowing anything should conceal it",
    "friendship is certainly finest balm for pangs of disappointed love",
    "nothing is more deceitful than appearance of humility",
    "angry people are not always wise"
]

# Prepare author classification data
author_texts = shakespeare_samples + austen_samples
author_labels = [0] * len(shakespeare_samples) + [1] * len(austen_samples)  # 0 = Shakespeare, 1 = Austen

# Vectorize the texts
author_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, max_features=100)
X_authors = author_vectorizer.fit_transform(author_texts)

# Split data
X_train_authors, X_test_authors, y_train_authors, y_test_authors = train_test_split(
    X_authors, author_labels, test_size=0.3, random_state=42, stratify=author_labels
)

# Train classifier
author_classifier = LogisticRegression(random_state=42)
author_classifier.fit(X_train_authors, y_train_authors)

# Evaluate
y_pred_authors = author_classifier.predict(X_test_authors)
author_accuracy = accuracy_score(y_test_authors, y_pred_authors)

print(f"Author Classification Accuracy: {author_accuracy:.2f}")

# Test on new texts
new_texts = [
    "something is rotten in state of denmark",  # Shakespeare style
    "young lady of seventeen cannot be really serious yet",  # Austen style
    "brevity is soul of wit",  # Shakespeare
    "nothing ever fatigues me but doing what I do not like"  # Austen
]

X_new_authors = author_vectorizer.transform(new_texts)
author_predictions = author_classifier.predict(X_new_authors)
author_probabilities = author_classifier.predict_proba(X_new_authors)

print("\nAuthor Prediction Results:")
authors = ["Shakespeare", "Austen"]
for i, text in enumerate(new_texts):
    predicted_author = authors[author_predictions[i]]
    confidence = max(author_probabilities[i]) * 100
    print(f"'{text}' → {predicted_author} (confidence: {confidence:.1f}%)")




TASK 9: Author Style Detection
Author Classification Accuracy: 0.67

Author Prediction Results:
'something is rotten in state of denmark' → Shakespeare (confidence: 50.0%)
'young lady of seventeen cannot be really serious yet' → Shakespeare (confidence: 50.0%)
'brevity is soul of wit' → Shakespeare (confidence: 50.0%)
'nothing ever fatigues me but doing what I do not like' → Austen (confidence: 54.5%)


In [11]:
# ============================================================================
# TASK 10: CREATIVE GENERATION WITH N-GRAM MODEL
# ============================================================================

print("\n\nTASK 10: Creative Text Generation with N-gram Model")
print("="*50)

# Fairy tale text corpus
fairy_tale_text = """
once upon a time there was a beautiful princess who lived in a tall tower.
the princess had long golden hair that sparkled in the sunlight.
every day the princess would look out of her window hoping for rescue.
one day a brave knight came riding through the forest on his white horse.
the knight saw the tower and heard the princess singing sweetly.
he climbed up the tower using the princess long golden hair.
when they met they fell in love immediately and lived happily ever after.
the end of this magical fairy tale story.
"""

class NGramModel:
    def __init__(self, text, n=2):
        self.n = n
        self.ngrams = defaultdict(list)
        self.build_model(text)

    def build_model(self, text):
        """Build n-gram model from text"""
        # Clean and tokenize
        text = re.sub(r'[^\w\s]', '', text.lower())
        words = text.split()

        # Create n-grams
        for i in range(len(words) - self.n + 1):
            prefix = tuple(words[i:i + self.n - 1])
            next_word = words[i + self.n - 1]
            self.ngrams[prefix].append(next_word)

    def generate_text(self, seed_words, length=20):
        """Generate text using n-gram model"""
        import random

        # Start with seed words
        if isinstance(seed_words, str):
            current = tuple(seed_words.lower().split())
        else:
            current = tuple(seed_words)

        generated = list(current)

        for _ in range(length):
            if current in self.ngrams:
                # Choose next word randomly from possibilities
                next_word = random.choice(self.ngrams[current])
                generated.append(next_word)

                # Update current context
                current = current[1:] + (next_word,)
            else:
                # If no continuation found, restart with random n-gram
                if self.ngrams:
                    current = random.choice(list(self.ngrams.keys()))
                    generated.append(current[0])
                else:
                    break

        return ' '.join(generated)

# Create bigram model (n=2)
bigram_model = NGramModel(fairy_tale_text, n=2)

print("Generated Fairy Tale Stories:")
print("=" * 40)

# Generate multiple stories with different seeds
seeds = [
    "once upon",
    "the princess",
    "a brave",
    "they lived"
]

for seed in seeds:
    print(f"Seed: '{seed}'")
    story = bigram_model.generate_text(seed, length=15)
    print(f"Generated: {story}")
    print("-" * 40)

# Create trigram model (n=3) for comparison
print("\nTrigram Model Results:")
trigram_model = NGramModel(fairy_tale_text, n=3)

for seed in ["once upon a", "the princess had"]:
    print(f"Seed: '{seed}'")
    story = trigram_model.generate_text(seed, length=12)
    print(f"Generated: {story}")
    print("-" * 40)





TASK 10: Creative Text Generation with N-gram Model
Generated Fairy Tale Stories:
Seed: 'once upon'
Generated: once upon sunlight every day a tall tower the princess had long golden hair when they met
----------------------------------------
Seed: 'the princess'
Generated: the princess rescue one day a brave knight saw the forest on his white horse the end
----------------------------------------
Seed: 'a brave'
Generated: a brave knight came riding through the forest on his white horse the forest on his white
----------------------------------------
Seed: 'they lived'
Generated: they lived he climbed up the princess singing sweetly he climbed up the forest on his white
----------------------------------------

Trigram Model Results:
Seed: 'once upon a'
Generated: once upon a tower heard the princess long golden hair when they met they fell
----------------------------------------
Seed: 'the princess had'
Generated: the princess had a princess who lived in a tall tower the princess s

In [12]:
"""Complete NLP Tasks - Summary Comments
Task 1: Custom Stopword Removal
Purpose: Remove common words that don't add semantic value
Key Learning: Custom stopwords vs standard lists, impact on text processing
Output: Shows word count reduction and cleaner text for analysis
Task 2: Vocabulary Builder - Term-Document Matrix
Purpose: Create numerical representation of text documents
Key Learning: How words are distributed across documents, sparse matrix concepts
Output: DataFrame showing word frequencies per document, vocabulary statistics
Task 3: Mini Sentiment Classifier (Naive Bayes)
Purpose: Classify text as positive or negative sentiment
Key Learning: Supervised learning, probabilistic classification
Output: Accuracy scores and predictions on new sentences with confidence levels
Task 4: Word2Vec Model Training
Purpose: Learn word embeddings and semantic relationships
Key Learning: How adding related words improves semantic similarity
Output: Similar words to "dog"/"cat" before/after adding "puppy"/"kitten" data
Task 5: Text Normalization Challenge
Purpose: Clean messy text data for processing
Key Learning: Text preprocessing pipeline - cleaning, tokenizing, stemming
Output: Step-by-step transformation from messy to clean text
Task 6: Fake News Headline Detector
Purpose: Classify headlines as real or fake news
Key Learning: Binary classification, TF-IDF features, logistic regression
Output: Model accuracy and predictions on new headlines with confidence
Task 7: Tiny Search Engine
Purpose: Build document retrieval system using similarity
Key Learning: TF-IDF vectorization, cosine similarity for ranking
Output: Ranked relevant documents for different search queries
Task 8: Emoji Predictor
Purpose: Predict appropriate emoji for given text
Key Learning: Multi-class classification, text-to-symbol mapping
Output: Emoji predictions for emotional expressions in sentences
Task 9: Author Style Detection
Purpose: Identify writing style differences between authors
Key Learning: Stylistic analysis, feature extraction from writing patterns
Output: Classification of text samples as Shakespeare vs Jane Austen style
Task 10: Creative Text Generation
Purpose: Generate new text in similar style using statistical patterns
Key Learning: N-gram models, probability-based text generation
Output: Generated fairy tale sentences using bigram/trigram patterns"""

'Complete NLP Tasks - Summary Comments\nTask 1: Custom Stopword Removal\nPurpose: Remove common words that don\'t add semantic value\nKey Learning: Custom stopwords vs standard lists, impact on text processing\nOutput: Shows word count reduction and cleaner text for analysis\nTask 2: Vocabulary Builder - Term-Document Matrix\nPurpose: Create numerical representation of text documents\nKey Learning: How words are distributed across documents, sparse matrix concepts\nOutput: DataFrame showing word frequencies per document, vocabulary statistics\nTask 3: Mini Sentiment Classifier (Naive Bayes)\nPurpose: Classify text as positive or negative sentiment\nKey Learning: Supervised learning, probabilistic classification\nOutput: Accuracy scores and predictions on new sentences with confidence levels\nTask 4: Word2Vec Model Training\nPurpose: Learn word embeddings and semantic relationships\nKey Learning: How adding related words improves semantic similarity\nOutput: Similar words to "dog"/"cat"