# TF–IDF Grapper

In [1]:
from pathlib import Path

# Recursively collect the path of every .txt file under the corpus folder.
# rglob() already yields complete Path objects, so they are stored as-is.
all_txt_files = list(Path('...txt').rglob("*.txt"))

# Report how many files were found.
n_files = len(all_txt_files)
print(n_files)

366


In [2]:
# Put the paths in ascending order. Path objects compare by their path
# components, not by numeric value, so this equals numerical order only if the
# file names are zero-padded -- TODO confirm against the corpus.
all_txt_files = sorted(all_txt_files)

In [None]:
# Show the first path as a quick check that collecting and sorting worked
all_txt_files[0]

In [3]:
# Load the corpus: one string per file, in the same order as all_txt_files.
# NOTE(review): no encoding is given, so the platform default is used (same as
# a bare open()); pass encoding= explicitly if the corpus encoding is known.
all_docs = [txt_path.read_text() for txt_path in all_txt_files]

`all_docs` is now a list of strings, one per .txt file, each holding the full text of that file.

In [4]:
#import the TfidfVectorizer from Scikit-Learn
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer (a class) tokenizes the text and strips punctuation itself
# while turning the strings into tf-idf scores.
#   max_df=0.65   -> ignore terms found in more than 65% of the documents
#   min_df=1      -> keep terms found in at least one document
#   stop_words=[] -> no predefined stop-word list
#   norm=None     -> leave the scores unnormalized
vectorizer = TfidfVectorizer(
    max_df=0.65,
    min_df=1,
    stop_words=[],
    use_idf=True,
    norm=None,
)

# fit_transform() learns the vocabulary from the list of strings and returns
# the document-term matrix of tf-idf scores.
transformed_documents = vectorizer.fit_transform(all_docs)

In [5]:
# Terms the fitted vectorizer dropped from the vocabulary because of its
# document-frequency cut-offs (here max_df=.65)
print(vectorizer.stop_words_)

{'its', 'began', 'into', 'only', 'went', 'three', 'it', 'early', 'him', 'on', 'died', 'other', 'during', 'two', 'through', 'this', 'more', 'had', 'so', 'work', 'before', 'have', 'their', 'because', 'long', 'no', 'or', 'said', 'many', 'which', 'his', 'in', 'with', 'own', 'that', 'all', 'took', 'made', 'then', 'they', 'than', 'war', 'until', 'for', 'an', 'who', 'there', 'has', 'about', 'came', 'are', 'where', 'was', 'some', 'of', 'what', 'is', 'man', 'from', 'were', 'as', 'at', 'up', 'first', 'by', 'home', 'to', 'new', 'and', 'never', 'over', 'great', 'united', 'years', 'became', 'later', 'also', 'father', 'old', 'the', 'american', 'be', 'year', 'them', 'mr', 'been', 'most', 'when', 'time', 'while', 'out', 'last', 'could', 'world', 'but', 'called', 'would', 'after', 'life', 'one', 'death', 'york', 'much', 'born', 'he', 'did', 'will', 'not'}


⬆️⬆️ The `fit_transform()` method converts the list of strings to a tf-idf **sparse matrix** (a matrix that is mostly zeros, stored by keeping only the non-zero entries); the `toarray()` method then converts the sparse matrix to a NumPy array.

In [6]:
# Expand the sparse matrix into a dense NumPy array.
transformed_documents_as_array = transformed_documents.toarray()

# Sanity check: the number of rows should equal the number of documents.
transformed_documents_as_array.shape[0]

366

In [7]:
import pandas as pd
# Display the dense tf-idf matrix as a DataFrame (rows are documents, columns are terms)
pd.DataFrame(transformed_documents_as_array)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36259,36260,36261,36262,36263,36264,36265,36266,36267,36268
0,0.0,6.233017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.558254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,4.674763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.558254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
362,0.0,1.558254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
363,0.0,1.558254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
364,0.0,14.024289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The NumPy array in `transformed_documents_as_array` is a dense format in which the **tf-idf** score for every term in every document is represented explicitly, zeros included. Sparse matrices, in contrast, exclude zero-value term scores.

Every term must be represented so that each document has the same number of values, one for each term in the corpus.

In [8]:
# Make the output folder (and any missing parent folders) if it doesn't already exist;
# exist_ok=True keeps this cell safe to re-run.
# NOTE(review): the output file paths are built by substituting "tf_idf_output/"
# for '.../data' in the input paths -- confirm they resolve inside this folder.
Path(".../tf_idf_output").mkdir(parents=True, exist_ok=True)

In [None]:
# Build one output path per input file, in the same order as all_txt_files.
# with_suffix() swaps only the file extension, whereas a plain
# str.replace(".txt", ".csv") would also rewrite any ".txt" occurring in a
# directory name. The path is then redirected into the tf_idf_output folder.
# NOTE(review): confirm the '.../data' prefix matches where the inputs live.
output_filenames = [
    str(txt_file.with_suffix(".csv")).replace('.../data', "tf_idf_output/")
    for txt_file in all_txt_files
]

# The vocabulary is the same for every document, so fetch it once instead of
# on every iteration. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2; get_feature_names_out() is its replacement.
feature_names = vectorizer.get_feature_names_out()

# Each row of transformed_documents_as_array holds the scores of one document;
# enumerate keeps track of the row position so it can be matched to its
# output path.
for counter, doc in enumerate(transformed_documents_as_array):
    # Pair every term with its score in this document, highest score first.
    tf_idf_tuples = list(zip(feature_names, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # Write this document's ranked terms to the CSV at the same position.
    one_doc_as_df.to_csv(output_filenames[counter])