<a href="https://colab.research.google.com/github/Sahilarora24/NLP/blob/main/Module%201/Word2Vec%26AvgWord2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
!pip install chardet



# *Code with Bag of Words*

In [2]:
# Load the SMS spam dataset from the mounted Drive folder.
# The file is read as latin1 rather than the default UTF-8.
messages = pd.read_csv(
    "/content/drive/MyDrive/data/spam.csv",
    encoding='latin1',
)

In [3]:
# Discard the three unnamed columns that come with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
messages = messages.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

In [4]:
# Give the two remaining columns descriptive names: v1 -> label, v2 -> message.
column_names = {'v1': 'label', 'v2': 'message'}
messages = messages.rename(columns=column_names)

# Show five random rows as a quick sanity check.
messages.sample(5)

Unnamed: 0,label,message
1658,spam,RGENT! This is the 2nd attempt to contact U!U ...
2800,ham,Depends on where u going lor.
3076,ham,There is no sense in my foot and penis.
2349,ham,Yar else i'll thk of all sorts of funny things.
3692,ham,I was about to do it when i texted. I finished...


In [5]:
# Peek at one raw message (row label 1) before any cleaning is applied
messages['message'][1]

'Ok lar... Joking wif u oni...'

In [6]:
# Data cleaning and preprocessing

In [7]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused for every word in the preprocessing loop
ps=PorterStemmer()

In [9]:
corpus = []

# Build the English stopword set once, up front.
# stopwords.words('english') returns a list; calling it inside the loop rebuilt
# that list for every single token and then scanned it linearly. A set built
# once gives constant-time membership checks and yields the same result.
english_stopwords = set(stopwords.words('english'))

# Loop through each message in the dataset.
# Iterating the column directly avoids depending on the index being 0..n-1.
for text in messages['message']:

    # Step 1: Remove all characters except alphabets (replace them with space)
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Steps 2-3: Lowercase the message and split it into individual words (tokens)
    tokens = letters_only.lower().split()

    # Step 4: Remove stopwords (like 'the', 'is', 'and') and apply stemming
    # Stemming reduces words to their root form (e.g., "running" → "run")
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]

    # Steps 5-6: Join the stems back into one string and add it to the corpus
    corpus.append(' '.join(stems))


In [10]:
# Spot-check the cleaned, stemmed form of the message at index 1
corpus[1]

'ok lar joke wif u oni'

In [11]:
# CountVectorizer is a tool from sklearn that converts text into numbers using the Bag of Words (BoW) model.
#
# It turns your text data (the corpus) into a big table (called a document-term matrix) where:
# - Rows = each message
# - Columns = each unique word (from the entire corpus)
# - Values = how often that word appears in that message
#
# Parameters explained:
# max_features=2500
# - This tells the vectorizer to keep only the top 2,500 most frequent words from the entire corpus.
# - If your data has 10,000 unique words, this will cut it down to the most important 2,500 based on frequency.
# - Helps reduce memory usage and noise.
#
# binary=True
# - Instead of counting how many times a word appears, it just marks:
#     1 → if the word exists in the message
#     0 → if it doesn't
# - This is useful when you only care whether a word appears, not how many times.
#
# Example:
# corpus = ["i love machine learning", "machine learning is fun"]
# Vocabulary (words): ['fun', 'is', 'learning', 'love', 'machine']
#
# Then the matrix X will look like:
#
# Message                          fun  is  learning  love  machine
# "i love machine learning"        0    0     1        1      1
# "machine learning is fun"        1    1     1        0      1

from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words limited to the 2,500 most frequent terms.
cv = CountVectorizer(binary=True, max_features=2500)

# Learn the vocabulary and encode every message; .toarray() turns the sparse
# result into a dense NumPy array.
X = cv.fit_transform(corpus).toarray()

In [12]:
# Feature row for the message at index 1 (one entry per vocabulary term)
X[1]

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# (number of messages, number of vocabulary terms kept)
X.shape

(5572, 2500)

In [14]:
# Convert the text labels ('ham' / 'spam') into a single numeric target column.
# This is a binary label encoding (not one-hot): 'ham' → 0, 'spam' → 1
y = messages['label'].map({'ham': 0, 'spam': 1})

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create and train the classifier in one go (fit() returns the estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Print each evaluation metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.979372197309417
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.92      0.92       150

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[954  11]
 [ 12 138]]


# *Code with TF-IDF*

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features restricted to the 2,500 most frequent terms.
tfidf = TfidfVectorizer(max_features=2500)

# Learn the vocabulary and weights, encode every message, and convert the
# sparse result into a dense NumPy array.
X = tfidf.fit_transform(corpus).toarray()

In [19]:
# Convert the text labels ('ham' / 'spam') into a single numeric target column.
# This is a binary label encoding (not one-hot): 'ham' → 0, 'spam' → 1
y = messages['label'].map({'ham': 0, 'spam': 1})

In [20]:
# Hold out 20% of the TF-IDF rows for testing and train on the remaining 80%.
# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create and train the classifier in one go (fit() returns the estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Print each evaluation metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.97847533632287
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[965   0]
 [ 24 126]]


# *Changing Model and Try*

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Create a 100-tree random forest with a fixed seed and train it in one go
# (fit() returns the estimator itself).
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Print each evaluation metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.9802690582959641
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[964   1]
 [ 21 129]]


# *Word2Vec Implementation*

Word2Vec is a neural embedding model that converts words into dense vector representations that capture meaning. Unlike BoW or TF-IDF (which only count word occurrences), Word2Vec captures semantic relationships; for example, the vectors for "king" and "queen" will be close together.



In [23]:
!pip install gensim
from gensim.models import Word2Vec



In [24]:


# Train Word2Vec model on the corpus
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1)




In [25]:
# vector_size=100
# Each word will be represented as a 100-dimensional vector.
# Example: "king" → [0.25, -0.13, 0.08, ..., 0.91]  # total 100 values
# Similar words like "king" and "queen" will have similar vectors.
# Unrelated words like "king" and "banana" will have very different vectors.
# ✅ Higher vector_size can capture more meaning, but needs more data.

# window=5
# This defines the context window size — how many words before and after the target word are considered.
# Example with window=2: Sentence = "I love natural language processing"
# For the center word "natural", the context would be: ["love", "language"]
# ✅ Larger window → captures broader meaning (semantics)
# ✅ Smaller window → captures nearby structure (syntax)

# min_count=1
# Only include words that appear at least this many times in the corpus.
# Example corpus: ["apple banana apple", "banana orange", "grape"]
# Word counts: apple=2, banana=2, orange=1, grape=1
# ✔️ If min_count=1 → includes all: ["apple", "banana", "orange", "grape"]
# ❌ If min_count=2 → excludes rare words: ["apple", "banana"]
# ✅ Set to 1 for small datasets to keep all words.
# ✅ Set higher (e.g. 5) for large datasets to remove noise.


In [29]:
import numpy as np

# ✅ Function: Convert a sentence (list of words) into a single vector
def get_sentence_vector(words, model):
    # Get vector for each word if it's present in the model's vocabulary
    vectors = [model.wv[word] for word in words if word in model.wv]

    # 🔁 Example:
    # Let's say words = ["king", "rules", "dragon"]
    # If all are in the model:
    # model.wv["king"] = [0.2, 0.5, ..., 0.8]  # 100-dim
    # model.wv["rules"] = [0.1, 0.4, ..., 0.6]
    # model.wv["dragon"] = [0.3, 0.2, ..., 0.9]
    # ➡ vectors = [vec_king, vec_rules, vec_dragon]

    # Return the average of all word vectors → sentence vector
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# ✅ Convert the entire corpus (list of tokenized sentences) to sentence vectors
# Each sentence becomes a single 100-dim vector (if vector_size=100)

# 💬 Example corpus:
# corpus = [["i", "love", "nlp"], ["king", "rules", "kingdom"], ["dragon", "queen"]]

# For each sentence, call get_sentence_vector → average word vectors → 1 vector per sentence
X = np.array([get_sentence_vector(msg, model) for msg in corpus])

# 🔁 Example output:
# X.shape = (3, 100)  # 3 sentences, each represented by a 100-dim vector


In [30]:
# Encode the target as integers: 'ham' → 0, 'spam' → 1.
# pd.get_dummies(messages['label'])['spam'] returns a boolean column on
# pandas >= 2.0, so the classes would appear as False/True rather than 0/1.
# Mapping explicitly keeps the labels numeric on every pandas version.
y = messages['label'].map({'ham': 0, 'spam': 1})

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create and train a logistic-regression classifier with default settings
# (fit() returns the estimator itself).
clf = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test)

# Print each evaluation metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.8681614349775785
Report:
               precision    recall  f1-score   support

       False       0.87      1.00      0.93       965
        True       0.64      0.05      0.09       150

    accuracy                           0.87      1115
   macro avg       0.75      0.52      0.51      1115
weighted avg       0.84      0.87      0.82      1115

