In [3]:
# Step 1: Import libraries
# (If the packages are missing, install them first: pip install nltk gensim)
import re

import gensim
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')      # sentence/word tokenizer models
nltk.download('stopwords')  # stopword lists read via stopwords.words(...)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pradnya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pradnya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# gensim is used because it provides an efficient Word2Vec implementation.

In [8]:
# Download the 'punkt_tab' tokenizer data in addition to 'punkt'.
# NOTE(review): presumably required because the installed NLTK version looks up
# 'punkt_tab' when sent_tokenize / word_tokenize are called — confirm against
# the NLTK release in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Pradnya\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [36]:
# Step 2: Input paragraph
# Four short sentences used as a toy corpus for the Word2Vec demo.
# (The literal previously opened with four quote characters, which put a stray
# '"' at the very start of the text; it now opens with exactly three.)
text = """I love playing football with my friends. 
We often meet at the ground in the evening. 
After playing, we talk about our favorite teams and players. 
Sports help us stay healthy and energetic."""

In [37]:
# Step 3: Preprocessing
# Lowercase the paragraph, split it into sentences, then strip every character
# that is not a lowercase letter, a digit, or a space from each sentence.
text = text.lower()
sentences = nltk.sent_tokenize(text)
cleaned = [re.sub('[^a-z0-9 ]+', '', sentence) for sentence in sentences]

In [38]:
# Step 4: Tokenization and Stopword Removal
# Build the English stopword set once, then tokenize each cleaned sentence and
# keep only the words that are not stopwords (one token list per sentence).
sw = set(stopwords.words('english'))
tokens = [
    [word for word in nltk.word_tokenize(sentence) if word not in sw]
    for sentence in cleaned
]

print("Cleaned Tokens:", tokens)

Cleaned Tokens: [['love', 'playing', 'football', 'friends'], ['often', 'meet', 'ground', 'evening'], ['playing', 'talk', 'favorite', 'teams', 'players'], ['sports', 'help', 'us', 'stay', 'healthy', 'energetic']]


In [39]:
# Step 5: Train Word2Vec Model (CBOW)
# sg=0 selects CBOW: the model predicts a target word from its surrounding
# context words (the context vectors are averaged to predict the target).
#
# min_count=1 keeps every token, including words that occur only once. That is
# needed for this tiny corpus but can add noise; on real data use roughly 2–5
# to drop typos and rare tokens.
#
# seed + workers=1 make the training repeatable across "Restart & Run All":
# Word2Vec initialises and samples randomly, and multi-threaded training adds
# ordering jitter. (Per the gensim docs, reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.)
model = gensim.models.Word2Vec(
    tokens,
    vector_size=50,
    window=5,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

In [42]:
# Step 6: Generate Context–Target Pairs (for reference)
# Illustrates what CBOW training examples look like: for each target word, the
# context is the `window_size` words before it plus the `window_size` words
# after it. This is for display only; it is not fed to the gensim model above.
data = []  # List to store (context, target) pairs
window_size = 2  # Number of words taken on each side of the target

for sentence in tokens:
    # Only positions with a full window on both sides are used. For sentences
    # shorter than 2 * window_size + 1 this range is empty, so they are skipped.
    for i in range(window_size, len(sentence) - window_size):
        # Slices follow window_size, so changing it above changes the context
        # width consistently (the offsets used to be hard-coded to 1 and 2).
        context = sentence[i - window_size:i] + sentence[i + 1:i + window_size + 1]
        # Target: the current word
        target = sentence[i]

        data.append((context, target))

# Display up to three context-target examples; slicing avoids an IndexError
# when fewer than three pairs were generated.
print("\nExample context-target pairs:")
for context, target in data[:3]:
    print(f"Context: {context} → Target: {target}")


Example context-target pairs:
Context: ['playing', 'talk', 'teams', 'players'] → Target: favorite
Context: ['sports', 'help', 'stay', 'healthy'] → Target: us
Context: ['help', 'us', 'healthy', 'energetic'] → Target: stay
