<a href="https://colab.research.google.com/github/RoushanKhalid/NLP_101/blob/main/2_Vectorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
%pip install matplotlib seaborn scikit-learn pandas numpy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder



# One-Hot Encoding

In [24]:
# Toy corpus for the hand-rolled encoding demo
sentences = [
    "I love NLP",
    "NLP is fun",
    "I love Python"
]

# Step 1: Lowercase every sentence and gather all whitespace-separated tokens, in order
words = [token for text in sentences for token in text.lower().split()]

# Step 2: The vocabulary is the alphabetically sorted set of distinct tokens
vocab = sorted(set(words))
print("Vocabulary:", vocab)

# Step 3: For each sentence, emit one 0/1 flag per vocabulary entry
# (1 when the entry occurs anywhere in the sentence)
for sentence in sentences:
    tokens = sentence.lower().split()
    vector = [int(term in tokens) for term in vocab]
    print(f"Sentence: {sentence}")
    print(f"One-hot: {vector}")
    print()


Vocabulary: ['fun', 'i', 'is', 'love', 'nlp', 'python']
Sentence: I love NLP
One-hot: [0, 1, 0, 1, 1, 0]

Sentence: NLP is fun
One-hot: [1, 0, 1, 0, 1, 0]

Sentence: I love Python
One-hot: [0, 1, 0, 1, 0, 1]



One-Hot Encoding with sklearn.preprocessing.OneHotEncoder

In [25]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Toy corpus for the scikit-learn demo
sentences = [
    "I love NLP",
    "NLP is fun",
    "I love Python"
]

# Step 1: Lowercase and tokenize every sentence on whitespace
corpus = [text.lower().split() for text in sentences]

# Step 2: Wrap each token in its own single-element row so the encoder
# receives a 2D list (one row per token, one column)
flat_words = []
for tokens in corpus:
    flat_words.extend([token] for token in tokens)

# Step 3: Fit and encode in one call; sparse_output=False is the keyword
# used by newer scikit-learn releases to get a dense array back
encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = encoder.fit_transform(flat_words)

# Step 4: Show the learned categories, then one encoded row per token
print("Vocabulary:", encoder.categories_)
print("One-Hot Encoded Words:")
for (token,), vec in zip(flat_words, onehot_encoded):
    print(f"{token:<10} => {vec}")

Vocabulary: [array(['fun', 'i', 'is', 'love', 'nlp', 'python'], dtype=object)]
One-Hot Encoded Words:
i          => [0. 1. 0. 0. 0. 0.]
love       => [0. 0. 0. 1. 0. 0.]
nlp        => [0. 0. 0. 0. 1. 0.]
nlp        => [0. 0. 0. 0. 1. 0.]
is         => [0. 0. 1. 0. 0. 0.]
fun        => [1. 0. 0. 0. 0. 0.]
i          => [0. 1. 0. 0. 0. 0.]
love       => [0. 0. 0. 1. 0. 0.]
python     => [0. 0. 0. 0. 0. 1.]


One-Hot Encoding with pandas.get_dummies

In [26]:
import pandas as pd

# Step 1: Join the sentences, lowercase once, and split on whitespace
# to get every token in corpus order
words = " ".join(sentences).lower().split()

# Step 2: Put the tokens in a single-column DataFrame
df = pd.DataFrame({'word': words})

# Step 3: One indicator column per distinct token, one row per token occurrence
onehot_df = pd.get_dummies(df.word)

print(onehot_df)


     fun      i     is   love    nlp  python
0  False   True  False  False  False   False
1  False  False  False   True  False   False
2  False  False  False  False   True   False
3  False  False  False  False   True   False
4  False  False   True  False  False   False
5   True  False  False  False  False   False
6  False   True  False  False  False   False
7  False  False  False   True  False   False
8  False  False  False  False  False    True


# Bag of Words (BoW)

In [27]:
# Load the spam dataset from the working directory.
# latin-1 is requested explicitly — presumably the file is not valid UTF-8; TODO confirm
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
data

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [28]:
# Remove the three trailing 'Unnamed' columns (they show no values in the preview).
# errors='ignore' makes this cell safe to re-run: without it, a second run raises
# KeyError because the columns have already been dropped from `data`.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)

# Give the v1/v2 headers descriptive names. rename() silently skips keys that are
# not present, so this is a no-op when the columns are already label/message.
data = data.rename(columns={'v1': 'label', 'v2': 'message'})

In [29]:
# Display the frame as it stands after the drop/rename cell
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Data Cleaning and Preprocessing

In [30]:
import re

import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK data packages (same order as before: stopwords, then punkt).
# The stop-word list is read by the cleaning loop further down; punkt is
# presumably for the tokenizers imported above — verify it is still needed.
nltk.download('stopwords')
download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
# Stemmer instances shared by the preprocessing cells
porter_stemmer = PorterStemmer()
# Snowball needs the target language spelled out
snowball_stemmer = SnowballStemmer(language='english')

In [32]:
# Sanity check: list the column labels currently present in `data`
print("Columns in data:", data.columns)

Columns in data: Index(['label', 'message'], dtype='object')


In [33]:
# Build `corpus`: one cleaned, stemmed, space-joined string per message.

# Load the English stop-word list ONCE. The previous version rebuilt
# set(stopwords.words('english')) for every single word of every message,
# which is loop-invariant work repeated many thousands of times.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter is replaced by a space
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly instead of data['message'][i]:
# positional range(len(data)) lookups are label-based and only work while
# the index is the default 0..n-1.
for message in data['message']:
    # Strip non-letters, lowercase, and split on whitespace
    tokens = non_letters.sub(' ', message).lower().split()

    # Stemming (stop words are filtered out before stemming)
    stems = [
        porter_stemmer.stem(token)
        for token in tokens
        if token not in english_stopwords
    ]

    # Join words back to a single string
    corpus.append(' '.join(stems))