# TF-IDF

In [1]:
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the training split of the 20 Newsgroups corpus and hold the raw posts
# in a one-column frame. Posts are loaded with their headers intact (note the
# "From:" / "Subject:" lines in the preview below).
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame({'text': data.data})
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [3]:
# remove punctuation and numbers
# The patterns are raw string literals so that `\w`, `\s` and `\d` reach the
# regex engine untouched. In a plain literal they are invalid Python string
# escapes, which raise a DeprecationWarning (a SyntaxWarning from Python 3.12).
# Note that `\w` also matches "_", so underscores survive the first pass.
df['text'] = (
    df['text']
    .str.replace(r'[^\w\s]', '', regex=True)  # drop everything but word chars / whitespace
    .str.replace(r'\d+', '', regex=True)      # then drop runs of digits
)

In [4]:
# Configure the TF-IDF vectorizer: lowercase the text, drop English stop
# words, keep single words only, and discard any term that appears in fewer
# than 5% of the documents.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    min_df=0.05,
)

In [5]:
# Learn the vocabulary to retain and the per-term IDF weights from the
# cleaned posts (fit only; the TF-IDF matrix itself is not produced here).

vectorizer.fit(df['text'])

In [6]:
# Encode every post as a TF-IDF row vector over the fitted vocabulary.
# NOTE(review): this is the same corpus the vectorizer was fitted on, so
# fit_transform would presumably give the same matrix in one pass - confirm.
X = vectorizer.transform(df['text'])

In [7]:
# Expand the TF-IDF matrix into a dense array and wrap it in a DataFrame
# whose columns are labelled with the vectorizer's retained terms.
tfidf = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

tfidf.head()

Unnamed: 0,able,access,actually,ago,apr,article,articleid,ask,available,away,...,works,world,writes,wrong,wrote,xnewsreader,year,years,yes,youre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27302,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.356469,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.135765,0.123914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.110035,0.0,0.0,0.0,0.0,...,0.0,0.169635,0.100554,0.0,0.218197,0.233578,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.262692,0.0,0.0,0.0,0.0,...,0.0,0.0,0.120029,0.0,0.0,0.0,0.0,0.0,0.264836,0.0


In [8]:
# (number of posts, number of retained terms)
tfidf.shape

(11314, 191)

In [9]:
# Same set-up, but now with n-grams: single words and two-word sequences are
# both candidate terms, and a term must appear in at least 10% of the
# documents to be kept.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.1,
)

In [10]:
# Re-learn the vocabulary and IDF weights with the unigram + bigram settings.
vectorizer.fit(df['text'])

In [11]:
# Re-encode the posts with the newly fitted vectorizer (overwrites the
# previous X).
X = vectorizer.transform(df['text'])

In [12]:
# Rebuild the dense TF-IDF frame for the n-gram vocabulary; this replaces the
# unigram-only frame held in `tfidf`.
tfidf = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

tfidf.head()

Unnamed: 0,article,believe,better,case,computer,did,distribution,does,doesnt,dont,...,use,used,using,want,way,work,world,writes,writes article,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374196
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.186832,0.0,0.34948,0.0,0.142464,0.284095,0.0,0.259031,...,0.153118,0.0,0.0,0.0,0.158175,0.0,0.0,0.0,0.0,0.0
3,0.168872,0.0,0.0,0.0,0.5551,0.0,0.226284,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.260342,0.154322,0.301699,0.0
4,0.329602,0.0,0.0,0.0,0.0,0.0,0.220829,0.0,0.0,0.200758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150602,0.0,0.0


In [13]:
# List the retained terms; bigrams appear as space-separated word pairs
# (e.g. 'writes article').
vectorizer.get_feature_names_out()

array(['article', 'believe', 'better', 'case', 'computer', 'did',
       'distribution', 'does', 'doesnt', 'dont', 'email', 'going', 'good',
       'got', 'help', 'im', 'ive', 'just', 'know', 'like', 'lines',
       'lines article', 'lines nntppostinghost', 'make', 'need', 'new',
       'nntppostinghost', 'organization', 'organization university',
       'people', 'point', 'problem', 'question', 'read', 'really',
       'replyto', 'right', 'said', 'say', 'state', 'subject', 'sure',
       'thanks', 'thing', 'things', 'think', 'time', 'university', 'usa',
       'use', 'used', 'using', 'want', 'way', 'work', 'world', 'writes',
       'writes article', 'years'], dtype=object)