In [9]:
import pandas as pd
import re
import nltk

In [15]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
data = {
    "text" : [
        "I Love NLP! It is amazing.",
        "NLP is used in chatbots and search engines.",
        "I do not like boring lectures.",
        "This NLP session is very interesting!"
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,text
0,I Love NLP! It is amazing.
1,NLP is used in chatbots and search engines.
2,I do not like boring lectures.
3,This NLP session is very interesting!


In [12]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    return "".join(words)

In [14]:
df["clean_text"] = df["text"].apply(clean_text)
df

Unnamed: 0,text,clean_text
0,I Love NLP! It is amazing.,lovenlpamazing
1,NLP is used in chatbots and search engines.,nlpusedchatbotssearchengine
2,I do not like boring lectures.,likeboringlecture
3,This NLP session is very interesting!,nlpsessioninteresting


In [16]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["clean_text"])

print("Vocabulary :\n", tfidf.get_feature_names_out())
print("\nTF-IDF Matrix :\n", X.toarray())



Vocabulary :
 ['likeboringlecture' 'lovenlpamazing' 'nlpsessioninteresting'
 'nlpusedchatbotssearchengine']

TF-IDF Matrix :
 [[0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]]


In [17]:
tfidf_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df

Unnamed: 0,likeboringlecture,lovenlpamazing,nlpsessioninteresting,nlpusedchatbotssearchengine
0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
