<a href="https://colab.research.google.com/github/RyuichiSaito1/covid19-twitter-usa-restoring/blob/main/tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# New York City

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re

def is_english_word(word):
    """Return True when ``word`` consists solely of ASCII letters (a-z, A-Z).

    The check is purely lexical: digits, punctuation, whitespace, accented
    letters and the empty string are all rejected.

    Args:
        word: The token to test.

    Returns:
        bool: True if every character is an ASCII letter and the string is
        non-empty, otherwise False.
    """
    # fullmatch is used instead of match('^...$') because '$' also matches just
    # before a trailing newline, which would let 'word\n' through.
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

# Words to be excluded from TF-IDF calculation (month names, their abbreviations and holiday terms)
exclude_words = ['january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','june', 'jun', 'july', 'jul', 'august', 'aug', 'september', 'sep', 'october', 'oct', 'november', 'nov','december', 'dec', 'holiday', 'holidays']

# Hash-based copy of exclude_words used for membership tests, so each token costs
# one lookup instead of a scan of the list. It is a snapshot taken when this cell
# runs: re-run the cell after editing exclude_words.
_EXCLUDE_WORDS_LOOKUP = frozenset(exclude_words)


def exclude_specific_words(text):
    """Split ``text`` on whitespace and keep only the tokens wanted as features.

    A token is kept when is_english_word() accepts it and its lower-cased form
    is not listed in exclude_words.

    Args:
        text: The document text to tokenize.

    Returns:
        list: The surviving tokens, in their original order and original case.
    """
    return [
        word
        for word in text.split()
        if is_english_word(word) and word.lower() not in _EXCLUDE_WORDS_LOOKUP
    ]

# Vectorizer settings actually in effect:
#   stop_words='english' : drop scikit-learn's built-in English stop words
#   input='filename'     : every item given to fit_transform is a path; the file is read from disk
#   ngram_range=(1, 1)   : unigrams only (single words, no multi-word phrases)
#   max_df=0.99          : drop terms whose document frequency exceeds 99% of the files;
#                          for a corpus of at most 100 files that means terms present in every file
#   min_df               : not set, so the library default applies
#   tokenizer            : keep only ASCII-alphabetic tokens that are not in exclude_words
#   token_pattern=None   : the default regex tokenizer is unused once a tokenizer is supplied;
#                          None states that explicitly and avoids the "will not be used" warning
# The tokenizer is passed as the function itself (no lambda wrapper needed).
count_vectorizer = CountVectorizer(
    stop_words='english',
    input='filename',
    ngram_range=(1, 1),
    max_df=0.99,
    tokenizer=exclude_specific_words,
    token_pattern=None,
)

# Directory holding one text file per document
base_path = '/content/drive/My Drive/covid-twitter-usa-normal/result/gpt-3.5-turbo/new_york_city/tf-idf/total_period_text/'
# sorted() fixes the row order of the matrices below. Non-file entries (for example a
# stray .ipynb_checkpoints folder) are skipped because input='filename' opens each path as a file.
files = [
    os.path.join(base_path, entry)
    for entry in sorted(os.listdir(base_path))
    if os.path.isfile(os.path.join(base_path, entry))
]

# Document-term count matrix: one row per file, one column per retained term
feature_vectors = count_vectorizer.fit_transform(files)
print("Feature Vectors Shape:", feature_vectors.shape)

terms = count_vectorizer.get_feature_names_out()
print("Number of Terms:", len(terms))

# TF-IDF weighting:
#   sublinear_tf=False : raw term counts are used (True would apply 1 + log(tf) scaling)
#   smooth_idf=False   : no add-one smoothing of document frequencies
#   norm='l2'          : each document row is scaled to unit Euclidean length
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)

# Transforming feature vectors into TF-IDF representation
tfidf = tfidf_transformer.fit_transform(feature_vectors)

# Printing the TF-IDF matrix (sparse representation: (row, column) and weight)
print(tfidf)

# Converting the TF-IDF matrix to a dense NumPy array so rows can be ranked with argsort
tfidfs = tfidf.toarray()

def extract_feature_words(terms, tfidfs, i, n):
    """Return the ``n`` highest-scoring terms of document ``i``, best first.

    Args:
        terms: Sequence of terms, indexed by column number.
        tfidfs: Two-dimensional score array; ``tfidfs[i]`` must provide
            ``argsort()`` (e.g. a NumPy array).
        i: Row index of the document to inspect.
        n: Maximum number of terms to return. Values of zero or less yield an
            empty list; values larger than the row length return every term.

    Returns:
        list: Terms ordered from highest to lowest score.
    """
    if n <= 0:
        # Without this guard the slice [-0:] would select the whole row
        # instead of nothing, and a negative n would slice from the front.
        return []
    scores = tfidfs[i]
    # argsort is ascending, so reverse it and take the first n positions
    best_columns = scores.argsort()[::-1][:n]
    return [terms[column] for column in best_columns]

# Report, for every document, its path followed by its 20 highest-weighted terms
for i, path in enumerate(files):
    print('------------------------------------------')
    feature_words = extract_feature_words(terms, tfidfs, i, 20)
    print(path, feature_words, sep='\n')


Feature Vectors Shape: (7, 30798)
Number of Terms: 30798
  (0, 30789)	0.010833078981565639
  (0, 30781)	0.014999750304743077
  (0, 30777)	0.04249884843227921
  (0, 30767)	0.008883279734226925
  (0, 30765)	0.012853646387535644
  (0, 30736)	0.010833078981565639
  (0, 30733)	0.014999750304743077
  (0, 30731)	0.014166282810759738
  (0, 30719)	0.014166282810759738
  (0, 30683)	0.014166282810759738
  (0, 30671)	0.010833078981565639
  (0, 30667)	0.007499875152371539
  (0, 30637)	0.014166282810759738
  (0, 30636)	0.007499875152371539
  (0, 30630)	0.006426823193767822
  (0, 30628)	0.014166282810759738
  (0, 30627)	0.014166282810759738
  (0, 30607)	0.005550075905032825
  (0, 30583)	0.014166282810759738
  (0, 30574)	0.008883279734226925
  (0, 30566)	0.01776655946845385
  (0, 30565)	0.010833078981565639
  (0, 30560)	0.010833078981565639
  (0, 30551)	0.01776655946845385
  (0, 30540)	0.007499875152371539
  :	:
  (6, 68)	0.011274720146208622
  (6, 67)	0.002557503401098938
  (6, 66)	0.0056373600731043

# Los Angeles

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re

def is_english_word(word):
    """Return True when ``word`` consists solely of ASCII letters (a-z, A-Z).

    The check is purely lexical: digits, punctuation, whitespace, accented
    letters and the empty string are all rejected.

    Args:
        word: The token to test.

    Returns:
        bool: True if every character is an ASCII letter and the string is
        non-empty, otherwise False.
    """
    # fullmatch is used instead of match('^...$') because '$' also matches just
    # before a trailing newline, which would let 'word\n' through.
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

# Words to be excluded from TF-IDF calculation (month names, their abbreviations and holiday terms)
exclude_words = ['january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','june', 'jun', 'july', 'jul', 'august', 'aug', 'september', 'sep', 'october', 'oct', 'november', 'nov','december', 'dec', 'holiday', 'holidays']

# Hash-based copy of exclude_words used for membership tests, so each token costs
# one lookup instead of a scan of the list. It is a snapshot taken when this cell
# runs: re-run the cell after editing exclude_words.
_EXCLUDE_WORDS_LOOKUP = frozenset(exclude_words)


def exclude_specific_words(text):
    """Split ``text`` on whitespace and keep only the tokens wanted as features.

    A token is kept when is_english_word() accepts it and its lower-cased form
    is not listed in exclude_words.

    Args:
        text: The document text to tokenize.

    Returns:
        list: The surviving tokens, in their original order and original case.
    """
    return [
        word
        for word in text.split()
        if is_english_word(word) and word.lower() not in _EXCLUDE_WORDS_LOOKUP
    ]

# Vectorizer settings actually in effect:
#   stop_words='english' : drop scikit-learn's built-in English stop words
#   input='filename'     : every item given to fit_transform is a path; the file is read from disk
#   ngram_range=(1, 1)   : unigrams only (single words, no multi-word phrases)
#   max_df=0.99          : drop terms whose document frequency exceeds 99% of the files;
#                          for a corpus of at most 100 files that means terms present in every file
#   min_df               : not set, so the library default applies
#   tokenizer            : keep only ASCII-alphabetic tokens that are not in exclude_words
#   token_pattern=None   : the default regex tokenizer is unused once a tokenizer is supplied;
#                          None states that explicitly and avoids the "will not be used" warning
# The tokenizer is passed as the function itself (no lambda wrapper needed).
count_vectorizer = CountVectorizer(
    stop_words='english',
    input='filename',
    ngram_range=(1, 1),
    max_df=0.99,
    tokenizer=exclude_specific_words,
    token_pattern=None,
)

# Directory holding one text file per document
base_path = '/content/drive/My Drive/covid-twitter-usa-normal/result/gpt-3.5-turbo/los_angeles/tf-idf/total_period_text/'
# sorted() fixes the row order of the matrices below. Non-file entries (for example a
# stray .ipynb_checkpoints folder) are skipped because input='filename' opens each path as a file.
files = [
    os.path.join(base_path, entry)
    for entry in sorted(os.listdir(base_path))
    if os.path.isfile(os.path.join(base_path, entry))
]

# Document-term count matrix: one row per file, one column per retained term
feature_vectors = count_vectorizer.fit_transform(files)
print("Feature Vectors Shape:", feature_vectors.shape)

terms = count_vectorizer.get_feature_names_out()
print("Number of Terms:", len(terms))

# TF-IDF weighting:
#   sublinear_tf=False : raw term counts are used (True would apply 1 + log(tf) scaling)
#   smooth_idf=False   : no add-one smoothing of document frequencies
#   norm='l2'          : each document row is scaled to unit Euclidean length
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)

# Transforming feature vectors into TF-IDF representation
tfidf = tfidf_transformer.fit_transform(feature_vectors)

# Printing the TF-IDF matrix (sparse representation: (row, column) and weight)
print(tfidf)

# Converting the TF-IDF matrix to a dense NumPy array so rows can be ranked with argsort
tfidfs = tfidf.toarray()

def extract_feature_words(terms, tfidfs, i, n):
    """Return the ``n`` highest-scoring terms of document ``i``, best first.

    Args:
        terms: Sequence of terms, indexed by column number.
        tfidfs: Two-dimensional score array; ``tfidfs[i]`` must provide
            ``argsort()`` (e.g. a NumPy array).
        i: Row index of the document to inspect.
        n: Maximum number of terms to return. Values of zero or less yield an
            empty list; values larger than the row length return every term.

    Returns:
        list: Terms ordered from highest to lowest score.
    """
    if n <= 0:
        # Without this guard the slice [-0:] would select the whole row
        # instead of nothing, and a negative n would slice from the front.
        return []
    scores = tfidfs[i]
    # argsort is ascending, so reverse it and take the first n positions
    best_columns = scores.argsort()[::-1][:n]
    return [terms[column] for column in best_columns]

# Report, for every document, its path followed by its 20 highest-weighted terms
for i, path in enumerate(files):
    print('------------------------------------------')
    feature_words = extract_feature_words(terms, tfidfs, i, 20)
    print(path, feature_words, sep='\n')


Feature Vectors Shape: (8, 27485)
Number of Terms: 27485
  (0, 27475)	0.015202540127270242
  (0, 27470)	0.015202540127270242
  (0, 27465)	0.015202540127270242
  (0, 27463)	0.011192001048667065
  (0, 27460)	0.005596000524333533
  (0, 27450)	0.015202540127270242
  (0, 27436)	0.007257091540257154
  (0, 27421)	0.015202540127270242
  (0, 27404)	0.014514183080514309
  (0, 27393)	0.011780621677466342
  (0, 27386)	0.015202540127270242
  (0, 27374)	0.006357009254001286
  (0, 27372)	0.007257091540257154
  (0, 27368)	0.015202540127270242
  (0, 27358)	0.00835870322766244
  (0, 27355)	0.015202540127270242
  (0, 27343)	0.04476800419466826
  (0, 27341)	0.015202540127270242
  (0, 27333)	0.007257091540257154
  (0, 27322)	0.00977892770380519
  (0, 27310)	0.030405080254540483
  (0, 27291)	0.015202540127270242
  (0, 27276)	0.015202540127270242
  (0, 27261)	0.015202540127270242
  (0, 27259)	0.011780621677466342
  :	:
  (7, 109)	0.005200864248250507
  (7, 108)	0.020285624246143767
  (7, 102)	0.0062654532235

# Chicago

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re

def is_english_word(word):
    """Return True when ``word`` consists solely of ASCII letters (a-z, A-Z).

    The check is purely lexical: digits, punctuation, whitespace, accented
    letters and the empty string are all rejected.

    Args:
        word: The token to test.

    Returns:
        bool: True if every character is an ASCII letter and the string is
        non-empty, otherwise False.
    """
    # fullmatch is used instead of match('^...$') because '$' also matches just
    # before a trailing newline, which would let 'word\n' through.
    return re.fullmatch(r'[a-zA-Z]+', word) is not None

# Words to be excluded from TF-IDF calculation (month names, their abbreviations and holiday terms)
exclude_words = ['january', 'jan', 'february', 'feb', 'march', 'mar', 'april', 'apr', 'may','june', 'jun', 'july', 'jul', 'august', 'aug', 'september', 'sep', 'october', 'oct', 'november', 'nov','december', 'dec', 'holiday', 'holidays']

# Hash-based copy of exclude_words used for membership tests, so each token costs
# one lookup instead of a scan of the list. It is a snapshot taken when this cell
# runs: re-run the cell after editing exclude_words.
_EXCLUDE_WORDS_LOOKUP = frozenset(exclude_words)


def exclude_specific_words(text):
    """Split ``text`` on whitespace and keep only the tokens wanted as features.

    A token is kept when is_english_word() accepts it and its lower-cased form
    is not listed in exclude_words.

    Args:
        text: The document text to tokenize.

    Returns:
        list: The surviving tokens, in their original order and original case.
    """
    return [
        word
        for word in text.split()
        if is_english_word(word) and word.lower() not in _EXCLUDE_WORDS_LOOKUP
    ]

# Vectorizer settings actually in effect:
#   stop_words='english' : drop scikit-learn's built-in English stop words
#   input='filename'     : every item given to fit_transform is a path; the file is read from disk
#   ngram_range=(1, 1)   : unigrams only (single words, no multi-word phrases)
#   max_df=0.99          : drop terms whose document frequency exceeds 99% of the files;
#                          for a corpus of at most 100 files that means terms present in every file
#   min_df               : not set, so the library default applies
#   tokenizer            : keep only ASCII-alphabetic tokens that are not in exclude_words
#   token_pattern=None   : the default regex tokenizer is unused once a tokenizer is supplied;
#                          None states that explicitly and avoids the "will not be used" warning
# The tokenizer is passed as the function itself (no lambda wrapper needed).
count_vectorizer = CountVectorizer(
    stop_words='english',
    input='filename',
    ngram_range=(1, 1),
    max_df=0.99,
    tokenizer=exclude_specific_words,
    token_pattern=None,
)

# Directory holding one text file per document
base_path = '/content/drive/My Drive/covid-twitter-usa-normal/result/gpt-3.5-turbo/chicago/tf-idf/total_period_text/'
# sorted() fixes the row order of the matrices below. Non-file entries (for example a
# stray .ipynb_checkpoints folder) are skipped because input='filename' opens each path as a file.
files = [
    os.path.join(base_path, entry)
    for entry in sorted(os.listdir(base_path))
    if os.path.isfile(os.path.join(base_path, entry))
]

# Document-term count matrix: one row per file, one column per retained term
feature_vectors = count_vectorizer.fit_transform(files)
print("Feature Vectors Shape:", feature_vectors.shape)

terms = count_vectorizer.get_feature_names_out()
print("Number of Terms:", len(terms))

# TF-IDF weighting:
#   sublinear_tf=False : raw term counts are used (True would apply 1 + log(tf) scaling)
#   smooth_idf=False   : no add-one smoothing of document frequencies
#   norm='l2'          : each document row is scaled to unit Euclidean length
tfidf_transformer = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)

# Transforming feature vectors into TF-IDF representation
tfidf = tfidf_transformer.fit_transform(feature_vectors)

# Printing the TF-IDF matrix (sparse representation: (row, column) and weight)
print(tfidf)

# Converting the TF-IDF matrix to a dense NumPy array so rows can be ranked with argsort
tfidfs = tfidf.toarray()

def extract_feature_words(terms, tfidfs, i, n):
    """Return the ``n`` highest-scoring terms of document ``i``, best first.

    Args:
        terms: Sequence of terms, indexed by column number.
        tfidfs: Two-dimensional score array; ``tfidfs[i]`` must provide
            ``argsort()`` (e.g. a NumPy array).
        i: Row index of the document to inspect.
        n: Maximum number of terms to return. Values of zero or less yield an
            empty list; values larger than the row length return every term.

    Returns:
        list: Terms ordered from highest to lowest score.
    """
    if n <= 0:
        # Without this guard the slice [-0:] would select the whole row
        # instead of nothing, and a negative n would slice from the front.
        return []
    scores = tfidfs[i]
    # argsort is ascending, so reverse it and take the first n positions
    best_columns = scores.argsort()[::-1][:n]
    return [terms[column] for column in best_columns]

# Report, for every document, its path followed by its 20 highest-weighted terms
for i, path in enumerate(files):
    print('------------------------------------------')
    feature_words = extract_feature_words(terms, tfidfs, i, 20)
    print(path, feature_words, sep='\n')


Feature Vectors Shape: (11, 19519)
Number of Terms: 19519
  (0, 19512)	0.015476798503663507
  (0, 19510)	0.025679850084100737
  (0, 19496)	0.012254099186825892
  (0, 19485)	0.018109187075582317
  (0, 19480)	0.01751246759485683
  (0, 19467)	0.018109187075582317
  (0, 19465)	0.018109187075582317
  (0, 19449)	0.0063990112980694715
  (0, 19443)	0.018109187075582317
  (0, 19441)	0.007026739599205371
  (0, 19433)	0.007738399251831753
  (0, 19420)	0.007738399251831753
  (0, 19413)	0.018109187075582317
  (0, 19404)	0.018109187075582317
  (0, 19397)	0.007026739599205371
  (0, 19391)	0.014415037916790002
  (0, 19388)	0.009531638357077925
  (0, 19384)	0.0063990112980694715
  (0, 19383)	0.010720888757997687
  (0, 19379)	0.014415037916790002
  (0, 19377)	0.014415037916790002
  (0, 19371)	0.018109187075582317
  (0, 19365)	0.03199505649034736
  (0, 19351)	0.018109187075582317
  (0, 19343)	0.014415037916790002
  :	:
  (10, 84)	0.011339966802819498
  (10, 82)	0.0072894644536043735
  (10, 80)	0.00857491