<a href="https://colab.research.google.com/github/SnehaTanwar006/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch both Punkt tokenizer packages before any tokenizing happens.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = "Good morning all. This is a wonderful day"

# Break the sample text into sentences and show the resulting list.
sentences = sent_tokenize(text)
print(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Good morning all.', 'This is a wonderful day']


In [2]:
# Display the first sentence returned by sent_tokenize.
sentences[0]

'Good morning all.'

In [3]:
from nltk.tokenize import word_tokenize
# Tokenize the same sample text into individual tokens and print the list.
words = word_tokenize(text)
print(words)

['Good', 'morning', 'all', '.', 'This', 'is', 'a', 'wonderful', 'day']


In [4]:
# Shell escape: install the en_core_web_sm pipeline that spacy.load("en_core_web_sm") loads later.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy

# Build the English pipeline and run it over the sample text.
nlp = spacy.load("en_core_web_sm")
text = "Good morning all. This is a wonderful day"
doc = nlp(text)

# Sentence strings come from the parsed doc's sentence spans;
# token strings cover both words and punctuation marks.
sentences = [span.text for span in doc.sents]
words = [tok.text for tok in doc]

print(sentences)
print(words)

['Good morning all.', 'This is a wonderful day']
['Good', 'morning', 'all', '.', 'This', 'is', 'a', 'wonderful', 'day']


Bag-of-Words (BoW) Method

In [6]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Load the en_core_web_sm spaCy pipeline; the tokenization, stop-word and
# lemmatization stages below all go through this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Sample corpus: a list of three short sentences that the BoW stages below
# lowercase, tokenize, filter, lemmatize and finally vectorize.
texts = [
    "New Delhi is Capital of India",
    "This is student alpha learning NLP.",
    "Today it may be raining outside"
]

In [9]:
# Echo the untouched corpus, one bullet per sentence.
print("\nOriginal Texts:")
for sentence in texts:
    print(f"- {sentence}")


Original Texts:
- New Delhi is Capital of India
- This is student alpha learning NLP.
- Today it may be raining outside


In [10]:
# ----- Stage 1: Lowercasing -----
# Produce a lowercased copy of every sentence; the originals stay in `texts`.
lowercased_texts = [sentence.lower() for sentence in texts]

print("\nAfter Lowercasing:")
for lowered in lowercased_texts:
    print(f"- {lowered}")


After Lowercasing:
- new delhi is capital of india
- this is student alpha learning nlp.
- today it may be raining outside


In [11]:
# ----- Stage 2: Tokenization -----
# Run each lowercased sentence through the spaCy pipeline and keep only the
# token strings.
tokenized_texts = [
    [tok.text for tok in nlp(sentence)]
    for sentence in lowercased_texts
]

print("\nAfter Tokenization:")
for token_list in tokenized_texts:
    print("-", token_list)


After Tokenization:
- ['new', 'delhi', 'is', 'capital', 'of', 'india']
- ['this', 'is', 'student', 'alpha', 'learning', 'nlp', '.']
- ['today', 'it', 'may', 'be', 'raining', 'outside']


In [12]:
# ----- Stage 3: Stop Word Removal -----
no_stop_texts = []
for token_list in tokenized_texts:
    kept = []
    for word in token_list:
        # Drop anything the vocabulary flags as a stop word, and anything
        # that is not purely alphabetic (e.g. punctuation).
        if nlp.vocab[word].is_stop or not word.isalpha():
            continue
        kept.append(word)
    no_stop_texts.append(kept)

print("\nAfter Stop Word Removal:")
for kept_words in no_stop_texts:
    print("-", kept_words)


After Stop Word Removal:
- ['new', 'delhi', 'capital', 'india']
- ['student', 'alpha', 'learning', 'nlp']
- ['today', 'raining', 'outside']


In [13]:
# ----- Stage 4: Lemmatization -----
# The pipeline takes a string, so each filtered token list is re-joined with
# spaces before being parsed again for its lemmas.
lemmatized_texts = [
    [tok.lemma_ for tok in nlp(' '.join(kept_words))]
    for kept_words in no_stop_texts
]

print("\nAfter Lemmatization:")
for lemma_list in lemmatized_texts:
    print("-", lemma_list)


After Lemmatization:
- ['new', 'delhi', 'capital', 'india']
- ['student', 'alpha', 'learn', 'nlp']
- ['today', 'rain', 'outside']


In [14]:
# ----- Stage 5: Join for BoW -----
# Collapse every lemma list back into one space-separated string.
cleaned_texts = list(map(' '.join, lemmatized_texts))

print("\nFinal Cleaned Texts (for BoW):")
for cleaned in cleaned_texts:
    print(f"- {cleaned}")


Final Cleaned Texts (for BoW):
- new delhi capital india
- student alpha learn nlp
- today rain outside


In [15]:
# ----- Stage 6: Bag-of-Words -----
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(cleaned_texts)

# Pull out the learned feature names and a dense view of the counts for display.
feature_names = vectorizer.get_feature_names_out()
dense_counts = bow_matrix.toarray()

print("\nBoW Vocabulary:", feature_names)
print("\nBoW Matrix:\n", dense_counts)


BoW Vocabulary: ['alpha' 'capital' 'delhi' 'india' 'learn' 'new' 'nlp' 'outside' 'rain'
 'student' 'today']

BoW Matrix:
 [[0 1 1 1 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 1 0 0 1 0]
 [0 0 0 0 0 0 0 1 1 0 1]]


# Sentiment Analysis using BoW

In [16]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [17]:
# Training sentences: the ten positive examples come first, followed by the
# ten negative ones.
positive_texts = [
    "I love this product",
    "This is a great movie",
    "I enjoyed the food here",
    "The service was excellent",
    "I am happy with my purchase",
    "This place is wonderful",
    "Such a pleasant experience",
    "Staff were friendly and helpful",
    "The quality is amazing",
    "Highly recommend this place",
]
negative_texts = [
    "I dislike this place",
    "This was a bad experience",
    "The food was terrible",
    "Service was slow",
    "I am unhappy with my purchase",
    "I regret coming here",
    "The movie was boring",
    "Staff were rude and unhelpful",
    "Poor quality",
    "Will not recommend this place",
]
texts = positive_texts + negative_texts

In [18]:
# Ten 1s followed by ten 0s (1 = positive, 0 = negative).
labels = [1 for _ in range(10)] + [0 for _ in range(10)]

In [19]:
# 2. Load spaCy model (en_core_web_sm), used below to preprocess both the
#    training sentences and the test sentences.
nlp = spacy.load("en_core_web_sm")

In [20]:
# Preprocessing: Lowercasing, Tokenization, Stopword Removal, Lemmatization
cleaned_texts = []
for sentence in texts:
    parsed = nlp(sentence.lower())
    # Keep the lemma of every alphabetic token that is not a stop word.
    lemmas = (tok.lemma_ for tok in parsed if not tok.is_stop and tok.is_alpha)
    cleaned_texts.append(' '.join(lemmas))

In [21]:
# Bag-of-Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(cleaned_texts)

# Dense copy of the count matrix, used for both printouts below.
bow_rows = X.toarray()

# After fitting vectorizer:
print("Vocabulary (feature names):")
print(vectorizer.get_feature_names_out(), end="\n\n")

print("BoW Matrix (rows = sentences, columns = features):")
print(bow_rows, end="\n\n")

print("Sentence-wise BoW vectors:")
for cleaned, counts in zip(cleaned_texts, bow_rows):
    print(f"'{cleaned}' --> {counts}")

Vocabulary (feature names):
['amazing' 'bad' 'bore' 'come' 'dislike' 'enjoy' 'excellent' 'experience'
 'food' 'friendly' 'great' 'happy' 'helpful' 'highly' 'love' 'movie'
 'place' 'pleasant' 'poor' 'product' 'purchase' 'quality' 'recommend'
 'regret' 'rude' 'service' 'slow' 'staff' 'terrible' 'unhappy' 'unhelpful'
 'wonderful']

BoW Matrix (rows = sentences, columns = features):
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0

In [22]:
# Build & Train Classifier
# Fit a LogisticRegression (default settings) on the BoW matrix X with the
# 0/1 sentiment labels.
clf = LogisticRegression()
clf.fit(X, labels)

In [23]:
# Test sentences, each paired with the sentiment it is expected to receive.
expected_sentiments = [
    ("I love this movie", "positive"),
    ("I had a bad product", "negative"),
    ("This place is great", "positive"),
    ("I dislike this experience", "negative"),
    ("Staff were helpful", "positive"),
    ("The movie was boring", "negative"),
    ("Amazing quality", "positive"),
    ("Food was terrible", "negative"),
    ("The service was pleasant", "positive"),
    ("I regret this purchase", "negative"),
]
test_sentences = [sentence for sentence, _ in expected_sentiments]

In [24]:
# Preprocess test data
cleaned_test = []
for sentence in test_sentences:
    parsed = nlp(sentence.lower())
    # Same filtering as for the training text: lemmas of alphabetic,
    # non-stop-word tokens.
    lemmas = (tok.lemma_ for tok in parsed if not tok.is_stop and tok.is_alpha)
    cleaned_test.append(' '.join(lemmas))

In [25]:
# BoW for test sentences: reuse the already-fitted vectorizer (transform only,
# no refit).
X_test = vectorizer.transform(cleaned_test)

In [26]:
# Predict sentiment for the vectorized test sentences with the trained classifier.
predictions = clf.predict(X_test)

In [27]:
# Report the predicted sentiment next to each original test sentence.
print("\nTest Results:")
for sentence, predicted in zip(test_sentences, predictions):
    if predicted == 1:
        label = "Positive"
    else:
        label = "Negative"
    print(f"'{sentence}' --> {label}")


Test Results:
'I love this movie' --> Positive
'I had a bad product' --> Negative
'This place is great' --> Positive
'I dislike this experience' --> Negative
'Staff were helpful' --> Positive
'The movie was boring' --> Negative
'Amazing quality' --> Positive
'Food was terrible' --> Negative
'The service was pleasant' --> Positive
'I regret this purchase' --> Negative


# TF-IDF

In [28]:
import numpy as np

# Toy corpus of three short sentences used by the manual TF-IDF cells below.
docs = [
    "Earth revolves around sun",
    "Planets revolve around stars",
    "Sun is a star"
]

In [29]:
# Tokenize docs: lowercase each document and split it on whitespace,
# giving one list of tokens per document.
tokens = [doc.lower().split() for doc in docs]
print(tokens)

[['earth', 'revolves', 'around', 'sun'], ['planets', 'revolve', 'around', 'stars'], ['sun', 'is', 'a', 'star']]


In [30]:
# Vocabulary: every distinct token across all documents, in sorted order.
vocab = sorted({term for token_list in tokens for term in token_list})
print(vocab)

['a', 'around', 'earth', 'is', 'planets', 'revolve', 'revolves', 'star', 'stars', 'sun']


In [31]:
# N is the number of documents in the corpus; idf() divides it by the
# document frequency of a word.
N = len(docs)
print(N)

3


In [32]:
def tf(word, doc):
    """Return the term frequency of ``word`` within ``doc``.

    Args:
        word: The term to count.
        doc: A sequence of tokens (anything supporting ``count`` and ``len``).

    Returns:
        The number of occurrences of ``word`` divided by the number of
        tokens in ``doc``; ``0.0`` when ``doc`` is empty.
    """
    # An empty document contains no terms; return 0.0 instead of dividing by zero.
    if len(doc) == 0:
        return 0.0
    return doc.count(word) / len(doc)

In [33]:
def idf(word, corpus=None):
    """Return the base-10 inverse document frequency of ``word``.

    Args:
        word: The term to look up.
        corpus: Optional iterable-with-length of tokenized documents. When
            omitted, the notebook-level ``tokens`` list and document count
            ``N`` are used, exactly as before.

    Returns:
        ``log10(number_of_documents / document_frequency)``, or ``0`` when
        the word appears in no document.
    """
    if corpus is None:
        # Default path: keep relying on the notebook globals.
        corpus, n_docs = tokens, N
    else:
        n_docs = len(corpus)
    # Document frequency: how many documents contain the word at least once.
    df = sum(1 for doc in corpus if word in doc)
    return np.log10(n_docs / df) if df != 0 else 0

In [34]:
# Build TF-IDF table
# IDF depends only on the word, not on the document, so compute it once per
# vocabulary entry instead of once per (document, word) pair.
idf_by_word = {word: idf(word) for word in vocab}

tfidf_table = []
for doc in tokens:
    # One row per document, one column per vocabulary word.
    # Convert to regular float and round for pretty printing
    row = [round(float(tf(word, doc) * idf_by_word[word]), 3) for word in vocab]
    tfidf_table.append(row)

# Print the results
print("Vocabulary:", vocab)
print("\nTF-IDF Matrix ():")
for i, row in enumerate(tfidf_table):
    print(f"Sentence {i+1}:", row)

Vocabulary: ['a', 'around', 'earth', 'is', 'planets', 'revolve', 'revolves', 'star', 'stars', 'sun']

TF-IDF Matrix ():
Sentence 1: [0.0, 0.044, 0.119, 0.0, 0.0, 0.0, 0.119, 0.0, 0.0, 0.044]
Sentence 2: [0.0, 0.044, 0.0, 0.0, 0.119, 0.119, 0.0, 0.0, 0.119, 0.0]
Sentence 3: [0.119, 0.0, 0.0, 0.119, 0.0, 0.0, 0.0, 0.119, 0.0, 0.044]


SMS Classification using NLP and ML

In [35]:
import pandas as pd
import zipfile
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# 1. Download and extract dataset
import os

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
archive_path = "smsspamcollection.zip"

# Reuse an archive saved by an earlier run; only hit the network when it is missing.
if not os.path.exists(archive_path):
    # Download under a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)

# Read the tab-separated, header-less "SMSSpamCollection" member straight
# from the zip into a two-column DataFrame.
with zipfile.ZipFile(archive_path) as archive:
    with archive.open("SMSSpamCollection") as file:
        df = pd.read_csv(file, sep='\t', header=None, names=['label', 'message'])

In [37]:
# Display the loaded DataFrame (columns: label, message) as the cell output.
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [38]:
# 2. Map labels
label_to_target = {'ham': 0, 'spam': 1}
df['target'] = df['label'].map(label_to_target)

# 3. Train-test split
messages, targets = df['message'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

In [39]:
# Display the training messages returned by train_test_split.
X_train

Unnamed: 0,message
184,"He will, you guys close?"
2171,CAN I PLEASE COME UP NOW IMIN TOWN.DONTMATTER ...
5422,Ok k..sry i knw 2 siva..tats y i askd..
4113,"I'll see, but prolly yeah"
4588,"I'll see if I can swing by in a bit, got some ..."
...,...
1932,What pa tell me.. I went to bath:-)
5316,Jus finish watching tv... U?
2309,Moby Pub Quiz.Win a £100 High Street prize if ...
1904,Free entry in 2 a weekly comp for a chance to ...


In [40]:
# Display the training targets (0 = ham, 1 = spam) returned by train_test_split.
y_train

Unnamed: 0,target
184,0
2171,0
5422,0
4113,0
4588,0
...,...
1932,0
5316,0
2309,1
1904,1


In [41]:
# 4. Vectorization
# Fit the TF-IDF vectorizer on the training messages only, then apply the
# fitted vectorizer to the test messages.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [42]:
# 5. Model training
# Fit a MultinomialNB classifier (default settings) on the TF-IDF training
# matrix and the 0/1 targets.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [43]:
# 6. Prediction & evaluation
# Predict on the held-out test matrix, then print the overall accuracy and
# the per-class classification report.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9605381165919282
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.70      0.83       149

    accuracy                           0.96      1115
   macro avg       0.98      0.85      0.90      1115
weighted avg       0.96      0.96      0.96      1115



In [44]:
# Example: List of new SMS messages to test
new_messages = [
    "Congratulations! You have won a free iPhone. Call now to claim.",
    "Hey, are we still meeting for coffee tomorrow?",
    "Urgent: Your account has been suspended. Click the link to reactivate.",
    "Reminder: Your bill payment is due tomorrow.",
]

# Vectorize with the already-fitted TF-IDF vectorizer, then classify with the
# trained model.
new_messages_tfidf = tfidf.transform(new_messages)
predictions = model.predict(new_messages_tfidf)

# Translate the numeric predictions into their label names.
label_map = {0: "ham", 1: "spam"}
predicted_labels = [label_map[code] for code in predictions]

# Print results
for message, name in zip(new_messages, predicted_labels):
    print(f"{name.upper()}: {message}")

SPAM: Congratulations! You have won a free iPhone. Call now to claim.
HAM: Hey, are we still meeting for coffee tomorrow?
HAM: Urgent: Your account has been suspended. Click the link to reactivate.
HAM: Reminder: Your bill payment is due tomorrow.


N-grams

In [45]:
# Step 1: Import Libraries
import pandas as pd
import zipfile
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [46]:
# Step 2: Download and Load Dataset
import os

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
archive_path = "smsspamcollection.zip"

# Reuse an archive that is already on disk; only hit the network when it is missing.
if not os.path.exists(archive_path):
    # Download under a temporary name first so an interrupted transfer never
    # leaves a truncated file under the final name.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)

# Read the tab-separated, header-less "SMSSpamCollection" member straight
# from the zip into a two-column DataFrame.
with zipfile.ZipFile(archive_path) as archive:
    with archive.open("SMSSpamCollection") as file:
        df = pd.read_csv(file, sep='\t', header=None, names=['label', 'message'])

In [47]:
# Step 3: Prepare Data
label_to_target = {'ham': 0, 'spam': 1}
df['target'] = df['label'].map(label_to_target)

# Step 4: Train-Test Split
messages, targets = df['message'], df['target']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

In [48]:
# Step 5: TF-IDF Vectorization with n-grams
tfidf = TfidfVectorizer(ngram_range=(1,2))
# The parameter ngram_range controls which word "chunks" are extracted from
# the text; per the scikit-learn docs, (1, 2) means unigrams and bigrams.
# Fit on the training messages only, then apply the fitted vectorizer to the
# test messages.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [49]:
# Step 6: Train Logistic Regression Model
# Configured with class_weight='balanced' and max_iter=200, then fitted on the
# n-gram TF-IDF training matrix.
# NOTE(review): 'balanced' is presumably chosen because spam is the minority
# class (149 of 1115 test rows) — confirm intent.
model = LogisticRegression(class_weight='balanced', max_iter=200)
model.fit(X_train_tfidf, y_train)

In [50]:
# Step 7: Evaluate on Test Data
# Predict on the held-out test matrix, then print the overall accuracy and
# the per-class classification report.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9820627802690582
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.94      0.93      0.93       149

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [51]:
# Step 8: Predict on New Messages (with threshold tuning)
# NOTE(review): this heading mentions threshold tuning, but the cells visible
# after this one only call model.predict (default decision) — confirm whether
# a tuned threshold is applied elsewhere or drop the mention.
new_messages = [
    "Congratulations! You have won a free iPhone. Call now to claim.",
    "Hey, are we still meeting for coffee tomorrow?",
    "Urgent: Your account has been suspended. Click the link to reactivate.",
    "Reminder: Your bill payment is due tomorrow."
]
# Vectorize with the already-fitted n-gram TF-IDF vectorizer (transform only).
new_messages_tfidf = tfidf.transform(new_messages)

In [52]:
# Default prediction (threshold = 0.5): uses model.predict directly, with no
# custom probability cutoff applied.
predictions = model.predict(new_messages_tfidf)

In [53]:
# Translate the numeric predictions into label names and print one line per message.
label_map = {0: "ham", 1: "spam"}
predicted_labels = [label_map[code] for code in predictions]

print("\nPredictions on New Messages (threshold 0.5):")
for message, name in zip(new_messages, predicted_labels):
    print(f"{name.upper()}: {message}")


Predictions on New Messages (threshold 0.5):
SPAM: Congratulations! You have won a free iPhone. Call now to claim.
HAM: Hey, are we still meeting for coffee tomorrow?
SPAM: Urgent: Your account has been suspended. Click the link to reactivate.
HAM: Reminder: Your bill payment is due tomorrow.
