In [3]:
import nltk

# Download the punkt tokenizer (needed for sentence and word tokenization)
nltk.download('punkt')

# Optional: Download stopwords if you plan to remove them
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:

pip install nltk





In [16]:
import pandas as pd

# Example tweets dataset
data = {
    "Tweets": [
        "I love NLP! It's amazing.",
        "Tokenization and TF-IDF are very useful.",
        "Text preprocessing is essential for NLP tasks.",
        "Stopwords removal helps improve accuracy!",
        "Word embeddings capture semantic meaning."
    ]
}

df = pd.DataFrame(data)
print(df)


                                           Tweets
0                       I love NLP! It's amazing.
1        Tokenization and TF-IDF are very useful.
2  Text preprocessing is essential for NLP tasks.
3       Stopwords removal helps improve accuracy!
4       Word embeddings capture semantic meaning.


In [18]:
import re

# English stopwords list (can be extended)
stop_words = set([
    "i","me","my","myself","we","our","ours","ourselves","you","your","yours",
    "yourself","yourselves","he","him","his","himself","she","her","hers","herself",
    "it","its","itself","they","them","their","theirs","themselves","what","which",
    "who","whom","this","that","these","those","am","is","are","was","were","be","been",
    "being","have","has","had","having","do","does","did","doing","a","an","the","and",
    "but","if","or","because","as","until","while","of","at","by","for","with","about",
    "against","between","into","through","during","before","after","above","below","to",
    "from","up","down","in","out","on","off","over","under","again","further","then",
    "once","here","there","when","where","why","how","all","any","both","each","few",
    "more","most","other","some","such","no","nor","not","only","own","same","so",
    "than","too","very","s","t","can","will","just","don","should","now"
])

def preprocess(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"(https?://\S+)|(@\w+)|(#\w+)", "", text)  # Remove URLs/mentions/hashtags
    text = re.sub(r"[^a-z\s]", "", text)  # Keep letters only
    tokens = text.split()  # Tokenize
    filtered_tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return " ".join(filtered_tokens)

df['cleaned'] = df['Tweets'].apply(preprocess)
print("Preprocessed Tweets:")
print(df[['Tweets', 'cleaned']])


Preprocessed Tweets:
                                           Tweets  \
0                       I love NLP! It's amazing.   
1        Tokenization and TF-IDF are very useful.   
2  Text preprocessing is essential for NLP tasks.   
3       Stopwords removal helps improve accuracy!   
4       Word embeddings capture semantic meaning.   

                                    cleaned  
0                          love nlp amazing  
1                 tokenization tfidf useful  
2    text preprocessing essential nlp tasks  
3  stopwords removal helps improve accuracy  
4  word embeddings capture semantic meaning  


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['cleaned'])

print("\nTF-IDF Feature Names:")
print(vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())



TF-IDF Feature Names:
['accuracy' 'amazing' 'capture' 'embeddings' 'essential' 'helps' 'improve'
 'love' 'meaning' 'nlp' 'preprocessing' 'removal' 'semantic' 'stopwords'
 'tasks' 'text' 'tfidf' 'tokenization' 'useful' 'word']

TF-IDF Matrix:
[[0.         0.61418897 0.         0.         0.         0.
  0.         0.61418897 0.         0.49552379 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.57735027 0.57735027
  0.57735027 0.        ]
 [0.         0.         0.         0.         0.46369322 0.
  0.         0.         0.         0.37410477 0.46369322 0.
  0.         0.         0.46369322 0.46369322 0.         0.
  0.         0.        ]
 [0.4472136  0.         0.         0.         0.         0.4472136
  0.4472136  0.         0.         0.         0.         0.4472136


In [21]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp313-cp313-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.3/24.4 MB ? eta -:--:--
    --------------------------------------- 0.5/24.4 MB 287.8 kB/s eta 0:01:23
    -----------------------------

In [23]:
from gensim.models import Word2Vec

# Prepare tokenized sentences for Word2Vec
sentences = [tweet.split() for tweet in df['cleaned']]

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, workers=4)

# Example: Get vector for a word
vector = model.wv['nlp']
print("\nWord Vector for 'nlp':")
print(vector)



Word Vector for 'nlp':
[-1.0724545e-03  4.7286271e-04  1.0206699e-02  1.8018546e-02
 -1.8605899e-02 -1.4233618e-02  1.2917745e-02  1.7945977e-02
 -1.0030856e-02 -7.5267432e-03  1.4761009e-02 -3.0669428e-03
 -9.0732267e-03  1.3108104e-02 -9.7203208e-03 -3.6320353e-03
  5.7531595e-03  1.9837476e-03 -1.6570430e-02 -1.8897636e-02
  1.4623532e-02  1.0140524e-02  1.3515387e-02  1.5257311e-03
  1.2701781e-02 -6.8107317e-03 -1.8928028e-03  1.1537147e-02
 -1.5043275e-02 -7.8722071e-03 -1.5023164e-02 -1.8600845e-03
  1.9076237e-02 -1.4638334e-02 -4.6675373e-03 -3.8754821e-03
  1.6154874e-02 -1.1861792e-02  9.0324880e-05 -9.5074680e-03
 -1.9207101e-02  1.0014586e-02 -1.7519170e-02 -8.7836506e-03
 -7.0199967e-05 -5.9236289e-04 -1.5322480e-02  1.9229487e-02
  9.9641159e-03  1.8466286e-02]
