## Data

In [33]:
import numpy as np
import pandas as pd

In [34]:
# Toy corpus: four short documents ('text') and a binary label for each ('output').
df = pd.DataFrame(
    {
        'text': [
            'people watch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        'output': [1, 1, 0, 0],
    }
)

In [35]:
# Display the toy corpus together with its labels as the cell's output.
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


## Bag of words

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
# Bag-of-words vectorizer, constructed with all default arguments.
cv = CountVectorizer()

In [38]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
bow = cv.fit_transform(df['text'])

In [39]:
# Learned vocabulary: maps each token to its column index in `bow`.
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [40]:
# Count vector of the first document; column positions follow cv.vocabulary_.
print(f"The vector representation of '{df.text[0]}' is: \n",bow[0].toarray()[0])

The vector representation of 'people watch campusx' is: 
 [1 0 1 1 0]


In [41]:
# Second document: 'campusx' occurs twice in it, so its column holds 2 rather than 1.
print(f"The vector representation of '{df.text[1]}' is: \n", bow[1].toarray()[0])

The vector representation of 'campusx watch campusx' is: 
 [2 0 0 1 0]


In [42]:
# Vectorize a new sentence with the already-fitted vocabulary. 'and' and
# 'campus' are not keys of cv.vocabulary_, so they are counted in no column.
cv.transform(["campusx watch and campus write comment"]).toarray()[0]

array([1, 1, 0, 1, 1])

## N-grams

In [43]:
# Bigram-only vectorizer: both n-gram bounds are set to 2.
bi_gram_cv = CountVectorizer(ngram_range=(2, 2))

In [44]:
# Fit on the corpus and show the resulting document-by-bigram count matrix.
bi_gram_cv.fit_transform(df['text'])

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [45]:
# Bigram vocabulary: each pair of adjacent tokens mapped to its column index.
bi_gram_cv.vocabulary_

{'people watch': 2,
 'watch campusx': 4,
 'campusx watch': 0,
 'people write': 3,
 'write comment': 5,
 'campusx write': 1}

In [46]:
# n-gram sizes 1 through 2: unigrams and bigrams share a single vocabulary.
uni_bi_cv = CountVectorizer(ngram_range=(1, 2))

In [47]:
# Fit the combined unigram + bigram vectorizer and keep its count matrix.
bo_bi = uni_bi_cv.fit_transform(df['text'])

In [48]:
# Combined vocabulary: single tokens and adjacent-token pairs, each mapped to a column index.
uni_bi_cv.vocabulary_

{'people': 4,
 'watch': 7,
 'campusx': 0,
 'people watch': 5,
 'watch campusx': 8,
 'campusx watch': 1,
 'write': 9,
 'comment': 3,
 'people write': 6,
 'write comment': 10,
 'campusx write': 2}

In [49]:
# Unigram + bigram count vector for the first document, as a dense 1-D array.
bo_bi[0].toarray()[0]

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [50]:
# Same vector as the previous cell, printed next to its source sentence.
# NOTE: bo_bi comes from the ngram_range=(1, 2) vectorizer, so despite the
# "bi-grams" wording in the message the vector also contains unigram counts.
print(f"The vector representation of '{df.text[0]}' in bag of bi-grams is: \n", bo_bi[0].toarray()[0])

The vector representation of 'people watch campusx' in bag of bi-grams is: 
 [1 0 0 0 1 1 0 1 1 0 0]


## Tf-Idf

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
# TF-IDF vectorizer, constructed with all default arguments.
tfidf = TfidfVectorizer()

In [55]:
# Fit TF-IDF on the corpus and show the document-term weights as a dense array.
tfidf.fit_transform(df['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [56]:
# Inverse-document-frequency weight learned for each term during fitting.
# 'campusx' appears in three of the four documents, hence its lower weight.
tfidf.idf_

array([1.22314355, 1.51082562, 1.51082562, 1.51082562, 1.51082562])

In [57]:
# Token names in column order, i.e. the labels for the columns of the TF-IDF matrix.
tfidf.get_feature_names_out()

array(['campusx', 'comment', 'people', 'watch', 'write'], dtype=object)

In [59]:
# Transform only the first document instead of re-vectorizing the whole corpus
# just to read row 0. Each row is computed independently from the fitted
# vocabulary and IDF weights, so the printed vector is unchanged.
print(f"The vector representation of '{df.text[0]}' in tf-idf is: \n", tfidf.transform([df.text[0]]).toarray()[0])

The vector representation of 'people watch campusx' in tf-idf is: 
 [0.49681612 0.         0.61366674 0.61366674 0.        ]


In [60]:
# Print the built-in help text for TfidfVectorizer (signature and docstring; long output).
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |  
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |  
 |  For an example of usage, see
 |  :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`.
 |  
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.
 |  
 |  For an example of documen