In [80]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [81]:
# Two-sentence toy corpus reused by every vectorization example in this notebook.
sent1, sent2 = (
    'It was a good practice for us.',
    'It was also good to know about it.',
)

In [82]:
# Lower-case each sentence, then tokenize it with NLTK; punctuation such as the
# final '.' comes out as a token of its own.
s1_t, s2_t = [word_tokenize(sentence.lower()) for sentence in (sent1, sent2)]

In [83]:
# Join both token lists into a single list; repeated tokens are kept.
final_sent = [*s1_t, *s2_t]
final_sent

['it',
 'was',
 'a',
 'good',
 'practice',
 'for',
 'us',
 '.',
 'it',
 'was',
 'also',
 'good',
 'to',
 'know',
 'about',
 'it',
 '.']

In [84]:
# Reduce the token list to its distinct entries, i.e. the vocabulary.
# Note that `final_sent` is rebound from a list to a set here.
final_sent = {*final_sent}
final_sent

{'.',
 'a',
 'about',
 'also',
 'for',
 'good',
 'it',
 'know',
 'practice',
 'to',
 'us',
 'was'}

In [85]:
# Placeholder table: two rows (index labels 1 and 2) and one column per
# vocabulary entry; every cell starts out empty.
# NOTE(review): the column order comes from iterating a set, so it is arbitrary
# (compare the header below with the sorted set display above) and may differ
# between kernel sessions; consider `sorted(final_sent)` for a stable layout.
df = pd.DataFrame(data={}, index=[1, 2], columns=[*final_sent])
df

Unnamed: 0,us,for,practice,about,.,also,was,to,it,know,a,good
1,,,,,,,,,,,,
2,,,,,,,,,,,,


In [86]:
# Per-sentence term frequencies: for every column label of `df`, count how often
# it occurs in the sentence's token list. Both lists follow df.columns order.
counts1 = list(map(s1_t.count, df.columns))
counts2 = list(map(s2_t.count, df.columns))

In [87]:
# Term counts for the first sentence, in df.columns order.
counts1

[1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]

In [88]:
# Term counts for the second sentence, in df.columns order.
counts2

[0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 0, 1]

In [91]:
# Build the hand-made bag-of-words table in a single step, reusing the index and
# column labels of the placeholder frame. Writing the rows into the all-NaN
# placeholder with .iloc would mutate a frame that an earlier cell displayed and
# leave the counts stored as generic Python objects; constructing the frame from
# the two count lists gives integer columns and makes this cell safe to re-run.
df = pd.DataFrame([counts1, counts2], index=df.index, columns=df.columns)

df

Unnamed: 0,us,for,practice,about,.,also,was,to,it,know,a,good
1,1,1,1,0,1,0,1,0,1,0,1,1
2,0,0,0,1,1,1,1,1,2,1,0,1


In [115]:
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
# Bag-of-words vectorizer from scikit-learn, created with all default settings.
cvt = CountVectorizer()

In [117]:
# Learn the vocabulary from both sentences and count every term per sentence;
# the result is a sparse document-term matrix with one row per sentence.
new_data = cvt.fit_transform(raw_documents=[sent1, sent2])

In [118]:
# Show the sparse matrix summary (shape, dtype, storage format).
new_data

<2x10 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [119]:
# Expand the sparse count matrix into a dense NumPy array for inspection.
# Despite its name, `sparse_elems` holds the dense array.
sparse_elems = new_data.toarray()
sparse_elems

array([[0, 0, 1, 1, 1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [120]:
# Terms learned by the vectorizer; they are used as column labels for the count
# matrix in the next step. The list printed below has 10 entries: the
# single-character tokens 'a' and '.' that NLTK produced earlier are not in it.
col_names = cvt.get_feature_names_out()
col_names

array(['about', 'also', 'for', 'good', 'it', 'know', 'practice', 'to',
       'us', 'was'], dtype=object)

In [122]:
# Label the dense counts with their terms: one row per sentence, one column per
# vocabulary entry.
df_cvt = pd.DataFrame(data=sparse_elems, columns=col_names)
df_cvt

Unnamed: 0,about,also,for,good,it,know,practice,to,us,was
0,0,0,1,1,1,0,1,0,1,1
1,1,1,0,1,2,1,0,1,0,1


In [138]:
# Vectorizer that extracts single words as well as two-word sequences
# (ngram_range=(1, 2)).
# NOTE(review): this rebinds `cvt`, replacing the default vectorizer created
# earlier; any earlier cell that uses `cvt` would pick up this instance if it
# were re-run after this one.
cvt = CountVectorizer(ngram_range=(1,2))

In [139]:
# Fit the n-gram vectorizer on both sentences and count every unigram/bigram per
# sentence; the result is again a sparse matrix with one row per sentence.
new_ngrams = cvt.fit_transform(raw_documents=[sent1, sent2])
new_ngrams

<2x21 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [144]:
# Expand the sparse n-gram counts into a dense NumPy array (the name
# notwithstanding, `sparse_ngrams` is dense).
# NOTE(review): this cell's execution count (144) is higher than those of the
# two cells that follow it (141, 142), so it was last run after them; use
# Restart & Run All to confirm the notebook works top to bottom.
sparse_ngrams = new_ngrams.toarray()
sparse_ngrams

array([[0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 1, 1, 0, 0, 1, 0, 1, 2, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0]],
      dtype=int64)

In [141]:
# Unigram and bigram terms learned by the n-gram vectorizer; used as the column
# labels of the n-gram count table.
col_ngrams = cvt.get_feature_names_out()
col_ngrams

array(['about', 'about it', 'also', 'also good', 'for', 'for us', 'good',
       'good practice', 'good to', 'it', 'it was', 'know', 'know about',
       'practice', 'practice for', 'to', 'to know', 'us', 'was',
       'was also', 'was good'], dtype=object)

In [142]:
# Label the dense n-gram counts with their terms: one row per sentence, one
# column per unigram/bigram. The rendered table below is truncated ("...") and
# does not show every column.
df_ngrams = pd.DataFrame(data=sparse_ngrams, columns=col_ngrams)
df_ngrams

Unnamed: 0,about,about it,also,also good,for,for us,good,good practice,good to,it,...,know,know about,practice,practice for,to,to know,us,was,was also,was good
0,0,0,0,0,1,1,1,1,0,1,...,0,0,1,1,0,0,1,1,0,1
1,1,1,1,1,0,0,1,0,1,2,...,1,1,0,0,1,1,0,1,1,0


In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

# Fit on both sentences, keep the sparse result under its own name, then expand
# it to a dense array so it can be placed in a DataFrame.
tfidf_sparse = tfidf.fit_transform([sent1, sent2])
tfidf_matrix = tfidf_sparse.toarray()

# Terms that label the columns of the TF-IDF matrix.
col_tfidf = tfidf.get_feature_names_out()

In [157]:
# Tabulate the TF-IDF weights: one row per sentence, one column per term.
# In the output below, a single occurrence of a term found in both sentences
# (e.g. 'good') scores lower than a term found in only one (e.g. 'for').
df_tfidf = pd.DataFrame(data=tfidf_matrix, columns=col_tfidf)
df_tfidf

Unnamed: 0,about,also,for,good,it,know,practice,to,us,was
0,0.0,0.0,0.470426,0.334712,0.334712,0.0,0.470426,0.0,0.470426,0.334712
1,0.376957,0.376957,0.0,0.268208,0.536416,0.376957,0.0,0.376957,0.0,0.268208
