# Bag of Words & TF-IDF

# Import Dependencies

In [None]:
import pandas as pd
import nltk
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Read Data

In [None]:
# The first CSV column has no header and just repeats the row number (it would
# otherwise show up as a redundant "Unnamed: 0" column), so use it as the index.
df = pd.read_csv("tweets.csv", index_col=0)
df.head()

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2023-01-24 07:12:27+00:00,1617782364222140420,@slashdot Population collapse is a major risk ...,elonmusk
1,2023-01-24 03:39:33+00:00,1617728786136727556,@thomasmidleton @VivekGRamaswamy Far too much ...,elonmusk
2,2023-01-23 16:37:47+00:00,1617562247567663104,@alx That turned out to be um … inaccurate,elonmusk
3,2023-01-23 16:33:27+00:00,1617561156264263680,@JonErlichman Good thread,elonmusk
4,2023-01-23 06:13:28+00:00,1617405134098812934,@billysteeler85 @NicoleBehnam That too,elonmusk


In [None]:
# Work on the tweet text column only (a pandas Series of strings).
tweets = df["Text"]

In [None]:
# Preview the first few raw tweets before any cleaning.
tweets.head()

0    @slashdot Population collapse is a major risk ...
1    @thomasmidleton @VivekGRamaswamy Far too much ...
2           @alx That turned out to be um … inaccurate
3                            @JonErlichman Good thread
4               @billysteeler85 @NicoleBehnam That too
Name: Text, dtype: object

# Preprocess

In [None]:
def preprocess(text):
    """Normalize a tweet into lowercase words separated by single spaces.

    Every run of non-word characters (punctuation, ``@``, ``#``, whitespace,
    and so on; underscores count as word characters) is collapsed into one
    space, and leading/trailing whitespace is removed so that a tweet starting
    with ``@mention`` does not come back with a leading space.

    Args:
        text: The raw tweet text. ``None`` and NaN (the float pandas uses for
            an empty CSV cell) are treated as missing and yield ``""``.

    Returns:
        str: The cleaned, lowercased text.

    Raises:
        AttributeError: If ``text`` is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # One pass: replacing each run of non-word characters with a single space
    # is equivalent to replacing them one by one and then squeezing whitespace.
    text = re.sub(r"\W+", " ", text.lower())
    return text.strip()

In [None]:
# Clean every tweet element-wise, then preview the result.
tweets = tweets.map(preprocess)
tweets.head()

0     slashdot population collapse is a major risk ...
1     thomasmidleton vivekgramaswamy far too much p...
2              alx that turned out to be um inaccurate
3                             jonerlichman good thread
4                 billysteeler85 nicolebehnam that too
Name: Text, dtype: object

# Bag of Words

In [None]:
# Start again from the raw text column; the next cells repeat the
# preprocessing from the "Preprocess" section before vectorizing.
tweets = df["Text"]

In [None]:
# NOTE(review): `preprocess` is already defined in the "Preprocess" section
# above; this cell re-defines it and the later definition is the one in effect.
def preprocess(text):
    """Normalize a tweet into lowercase words separated by single spaces.

    Every run of non-word characters (punctuation, ``@``, ``#``, whitespace,
    and so on; underscores count as word characters) is collapsed into one
    space, and leading/trailing whitespace is removed so that a tweet starting
    with ``@mention`` does not come back with a leading space.

    Args:
        text: The raw tweet text. ``None`` and NaN (the float pandas uses for
            an empty CSV cell) are treated as missing and yield ``""``.

    Returns:
        str: The cleaned, lowercased text.

    Raises:
        AttributeError: If ``text`` is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # One pass: replacing each run of non-word characters with a single space
    # is equivalent to replacing them one by one and then squeezing whitespace.
    text = re.sub(r"\W+", " ", text.lower())
    return text.strip()

In [None]:
# Clean every tweet element-wise, then preview the result.
tweets = tweets.map(preprocess)
tweets.head()

0     slashdot population collapse is a major risk ...
1     thomasmidleton vivekgramaswamy far too much p...
2              alx that turned out to be um inaccurate
3                             jonerlichman good thread
4                 billysteeler85 nicolebehnam that too
Name: Text, dtype: object

In [None]:
# Bag-of-words encoder, created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary (one feature per distinct token) from the cleaned tweets.
vectorizer.fit(tweets)

In [None]:
# Vocabulary terms, listed in feature (column) order.
vectorizer.get_feature_names_out()

array(['10', '100', '10x', ..., 'zerohedge', 'zi8qrt2ev4', 'zoa3qmcxqc'],
      dtype=object)

In [None]:
# Mapping from each term to its column index in the encoded matrix.
vectorizer.vocabulary_

{'slashdot': 822,
 'population': 699,
 'collapse': 193,
 'is': 476,
 'major': 560,
 'risk': 773,
 'to': 926,
 'the': 895,
 'future': 378,
 'of': 638,
 'civilization': 182,
 'https': 431,
 'co': 187,
 'zi8qrt2ev4': 1051,
 'thomasmidleton': 913,
 'vivekgramaswamy': 989,
 'far': 329,
 'too': 928,
 'much': 610,
 'power': 705,
 'concentrated': 203,
 'in': 453,
 'hands': 409,
 'shareholder': 809,
 'services': 806,
 'companies': 201,
 'like': 536,
 'iss': 478,
 'and': 67,
 'glass': 391,
 'lewis': 529,
 'because': 108,
 'so': 825,
 'market': 567,
 'passive': 672,
 'index': 466,
 'funds': 376,
 'which': 1013,
 'outsource': 663,
 'voting': 990,
 'decisions': 243,
 'them': 898,
 'effectively': 287,
 'control': 211,
 'stock': 852,
 'alx': 58,
 'that': 894,
 'turned': 952,
 'out': 662,
 'be': 106,
 'um': 960,
 'inaccurate': 454,
 'jonerlichman': 495,
 'good': 396,
 'thread': 918,
 'billysteeler85': 129,
 'nicolebehnam': 623,
 'don': 270,
 'taste': 883,
 'or': 656,
 'effects': 288,
 'most': 602,
 'a

In [None]:
# Encode every tweet as a row of term counts (one column per vocabulary term).
vector = vectorizer.transform(tweets)

In [None]:
# Show the dense form of the count matrix under a short caption.
print("Encoded Document is:", vector.toarray(), sep="\n")

Encoded Document is:
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Densify only the first tweet's row, once, instead of converting the whole
# matrix to a dense array twice. Slicing with [:1] keeps the result 2-D, so
# the trailing [0] yields the same 1-D count vector as before.
first_count_row = vector[:1].toarray()[0]
print([first_count_row])
# Map the non-zero columns of that row back to their vocabulary terms.
print(vectorizer.inverse_transform([first_count_row]))

[array([0, 0, 0, ..., 0, 1, 0], dtype=int64)]
[array(['civilization', 'co', 'collapse', 'future', 'https', 'is', 'major',
       'of', 'population', 'risk', 'slashdot', 'the', 'to', 'zi8qrt2ev4'],
      dtype='<U17')]


# TF-IDF

In [None]:
# TF-IDF encoder, created with scikit-learn's default settings.
tf_idf_vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary and the inverse-document-frequency weights from the cleaned tweets.
tf_idf_vectorizer.fit(tweets)

In [None]:
# Vocabulary terms, listed in feature (column) order.
tf_idf_vectorizer.get_feature_names_out()

array(['10', '100', '10x', ..., 'zerohedge', 'zi8qrt2ev4', 'zoa3qmcxqc'],
      dtype=object)

In [None]:
# Mapping from each term to its column index in the encoded matrix.
tf_idf_vectorizer.vocabulary_

{'slashdot': 822,
 'population': 699,
 'collapse': 193,
 'is': 476,
 'major': 560,
 'risk': 773,
 'to': 926,
 'the': 895,
 'future': 378,
 'of': 638,
 'civilization': 182,
 'https': 431,
 'co': 187,
 'zi8qrt2ev4': 1051,
 'thomasmidleton': 913,
 'vivekgramaswamy': 989,
 'far': 329,
 'too': 928,
 'much': 610,
 'power': 705,
 'concentrated': 203,
 'in': 453,
 'hands': 409,
 'shareholder': 809,
 'services': 806,
 'companies': 201,
 'like': 536,
 'iss': 478,
 'and': 67,
 'glass': 391,
 'lewis': 529,
 'because': 108,
 'so': 825,
 'market': 567,
 'passive': 672,
 'index': 466,
 'funds': 376,
 'which': 1013,
 'outsource': 663,
 'voting': 990,
 'decisions': 243,
 'them': 898,
 'effectively': 287,
 'control': 211,
 'stock': 852,
 'alx': 58,
 'that': 894,
 'turned': 952,
 'out': 662,
 'be': 106,
 'um': 960,
 'inaccurate': 454,
 'jonerlichman': 495,
 'good': 396,
 'thread': 918,
 'billysteeler85': 129,
 'nicolebehnam': 623,
 'don': 270,
 'taste': 883,
 'or': 656,
 'effects': 288,
 'most': 602,
 'a

In [None]:
# Encode every tweet as a row of TF-IDF weights (one column per vocabulary term).
tf_idf_vector = tf_idf_vectorizer.transform(tweets)

In [None]:
# Show the dense form of the TF-IDF matrix under a short caption.
print("Encoded Document is:", tf_idf_vector.toarray(), sep="\n")

Encoded Document is:
[[0.         0.         0.         ... 0.         0.34312765 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Densify only the first tweet's row, once, instead of converting the whole
# matrix to a dense array twice. Slicing with [:1] keeps the result 2-D, so
# the trailing [0] yields the same 1-D weight vector as before.
first_tf_idf_row = tf_idf_vector[:1].toarray()[0]
print([first_tf_idf_row])
# Map the non-zero columns of that row back to their vocabulary terms.
print(tf_idf_vectorizer.inverse_transform([first_tf_idf_row]))

[array([0.        , 0.        , 0.        , ..., 0.        , 0.34312765,
       0.        ])]
[array(['civilization', 'co', 'collapse', 'future', 'https', 'is', 'major',
       'of', 'population', 'risk', 'slashdot', 'the', 'to', 'zi8qrt2ev4'],
      dtype='<U17')]
