# Bag of words

In [1]:
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# load data: training split of the 20 newsgroups corpus, one document per row
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame({'text': data.data})
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [3]:
# remove punctuation and numbers
#
# The patterns are raw strings so that the regex escapes (\w, \s, \d) reach the
# regex engine untouched; in a plain string literal they are invalid escape
# sequences and Python warns about them.
#
# Note: `_` is a word character, so underscores survive this step, and because
# punctuation is deleted (not replaced by a space) tokens fuse together
# (e.g. "don't" -> "dont").

df['text'] = (
    df['text']
    .str.replace(r'[^\w\s]', '', regex=True)  # drop everything except word chars / whitespace
    .str.replace(r'\d+', '', regex=True)      # then drop runs of digits
)

In [4]:
# set up a bag of words transformer (single words only)

vectorizer = CountVectorizer(
    lowercase=True,        # fold case before tokenizing
    stop_words='english',  # drop the built-in English stop words
    ngram_range=(1, 1),    # unigrams only
    min_df=0.05,           # float => keep terms found in at least 5% of documents
)

In [5]:
# learn the vocabulary from the corpus: the transformer finds the words to be
# retained, i.e. those that pass its stop-word and min_df filters

vectorizer.fit(df['text'])

In [6]:
# encode every document as a row of term counts over the learned vocabulary
# (sparse result; it is densified with .toarray() when the dataframe is built)
X = vectorizer.transform(df['text'])

In [7]:
# create bag of words dataframe: one row per document, one column per retained term

bagofwords = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

bagofwords.head()

Unnamed: 0,able,access,actually,ago,apr,article,articleid,ask,available,away,...,works,world,writes,wrong,wrote,xnewsreader,year,years,yes,youre
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,1,1,0,1,1,0,0,0,0
4,0,0,0,0,0,2,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [8]:
# (number of documents, number of retained terms)
bagofwords.shape

(11314, 191)

In [9]:
# with n grams: same transformer, but word pairs are counted as well
vectorizer = CountVectorizer(
    lowercase=True,        # fold case before tokenizing
    stop_words='english',  # drop the built-in English stop words
    ngram_range=(1, 2),    # unigrams and bigrams
    min_df=0.1,            # float => keep n-grams found in at least 10% of documents
)

In [10]:
# learn the unigram + bigram vocabulary from the corpus
vectorizer.fit(df['text'])

In [11]:
# re-encode the documents as counts over the n-gram vocabulary (overwrites the earlier X)
X = vectorizer.transform(df['text'])

In [12]:
# (number of documents, number of retained n-grams)
X.shape

(11314, 59)

In [13]:
# n-gram bag of words dataframe: one row per document, one column per retained n-gram
bagofwords = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

bagofwords.head()

Unnamed: 0,article,believe,better,case,computer,did,distribution,does,doesnt,dont,...,use,used,using,want,way,work,world,writes,writes article,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,2,0,1,2,0,2,...,1,0,0,0,1,0,0,0,0,0
3,1,0,0,0,2,0,1,0,0,0,...,0,0,0,0,0,0,1,1,1,0
4,2,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# full list of retained n-grams (the column labels used for the dataframe above)
vectorizer.get_feature_names_out()

array(['article', 'believe', 'better', 'case', 'computer', 'did',
       'distribution', 'does', 'doesnt', 'dont', 'email', 'going', 'good',
       'got', 'help', 'im', 'ive', 'just', 'know', 'like', 'lines',
       'lines article', 'lines nntppostinghost', 'make', 'need', 'new',
       'nntppostinghost', 'organization', 'organization university',
       'people', 'point', 'problem', 'question', 'read', 'really',
       'replyto', 'right', 'said', 'say', 'state', 'subject', 'sure',
       'thanks', 'thing', 'things', 'think', 'time', 'university', 'usa',
       'use', 'used', 'using', 'want', 'way', 'work', 'world', 'writes',
       'writes article', 'years'], dtype=object)