In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short documents that share most of their vocabulary.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and IDF weights from the corpus and, in the same step,
# turn the documents into a sparse document-term matrix of TF-IDF weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Matrix shape first, then the vocabulary terms the vectorizer learned.
print(X.shape, vectorizer.get_feature_names_out(), sep="\n")

(4, 9)
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


In [3]:
# Dense table of the TF-IDF weights: one row per document, one column per term.
# The feature-name array is passed directly as `columns`. Wrapping it in a list
# (`columns=[names]`) makes pandas treat it as a list of index levels and build
# a one-level MultiIndex of 1-tuples, so plain lookups such as mydf['document']
# no longer address a simple column.
mydf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [4]:
# By default, TfidfVectorizer applies L2 normalization (norm='l2'):
# each document's row vector is scaled to unit Euclidean length,
# i.e. the squares of its TF-IDF values sum to 1.

In [5]:
# Display the TF-IDF table built from X (rows: documents, columns: vocabulary terms).
mydf

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pd

# The four-document toy corpus, defined here so this cell is self-contained.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Fit a TF-IDF model only to read off the learned per-term IDF weights.
tfidf_vectorizer = TfidfVectorizer().fit(corpus)

# Map every vocabulary term to its IDF weight.
idf_values = {
    term: weight
    for term, weight in zip(
        tfidf_vectorizer.get_feature_names_out(),
        tfidf_vectorizer.idf_,
    )
}
print("IDF values:", idf_values)

# Raw term frequencies (TF): how often each term occurs in each document.
count_vectorizer = CountVectorizer()
word_count = count_vectorizer.fit_transform(corpus)

# Densify the sparse count matrix and label its columns with the vocabulary
# so the counts can be inspected as a table.
tf_values = word_count.toarray()
tf_df = pd.DataFrame(
    data=tf_values,
    columns=count_vectorizer.get_feature_names_out(),
)
print("\nTF values (raw counts):", tf_df, sep="\n")



IDF values: {'and': np.float64(1.916290731874155), 'document': np.float64(1.2231435513142097), 'first': np.float64(1.5108256237659907), 'is': np.float64(1.0), 'one': np.float64(1.916290731874155), 'second': np.float64(1.916290731874155), 'the': np.float64(1.0), 'third': np.float64(1.916290731874155), 'this': np.float64(1.0)}

TF values (raw counts):
   and  document  first  is  one  second  the  third  this
0    0         1      1   1    0       0    1      0     1
1    0         2      0   1    0       1    1      0     1
2    1         0      0   1    1       0    1      1     1
3    0         1      1   1    0       0    1      0     1


In [7]:

# Normalize TF values to get relative term frequencies: divide each document's
# raw counts by that document's total count. `axis=0` aligns the per-row totals
# with the row index, so every row is scaled by its own total.
# The result keeps tf_df's column labels, so it does not need to be re-wrapped
# in a new DataFrame.
# Note: a document with no counted terms has a row total of 0 and yields NaN.
row_totals = tf_df.sum(axis=1)
tf_normalized = tf_df.div(row_totals, axis=0)

print("\nNormalized TF values (relative frequencies):")
print(tf_normalized)


Normalized TF values (relative frequencies):
        and  document  first        is       one    second       the  \
0  0.000000  0.200000    0.2  0.200000  0.000000  0.000000  0.200000   
1  0.000000  0.333333    0.0  0.166667  0.000000  0.166667  0.166667   
2  0.166667  0.000000    0.0  0.166667  0.166667  0.000000  0.166667   
3  0.000000  0.200000    0.2  0.200000  0.000000  0.000000  0.200000   

      third      this  
0  0.000000  0.200000  
1  0.000000  0.166667  
2  0.166667  0.166667  
3  0.000000  0.200000  
