# Doing things with text 5

## TF-IDF on multiple texts _for preprocessed texts_

### Step 1: Importing required packages

- `pathlib.Path`: Provides an object-oriented interface for filesystem paths
- `sklearn.feature_extraction.text.TfidfVectorizer`: Transforms text data into TF-IDF features for text analysis and machine learning.
- `sklearn.feature_extraction.text.CountVectorizer`: Converts text data into a matrix of token counts for analysis.
- `sklearn.metrics.pairwise.cosine_similarity`: Computes the cosine similarity between vectors for similarity analysis.
- `sklearn.metrics.pairwise.linear_kernel`: Calculates the linear kernel (dot product) for vector similarity or SVMs.
- `pandas`: Provides data structures and tools for data manipulation and analysis.
- `matplotlib.pyplot`: Creates static, animated, and interactive visualizations in Python.
- `seaborn`: Builds on matplotlib for enhanced statistical data visualization.

In [None]:
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Step 2: Define input and output paths

In [None]:
# Name of the dataset; used as a prefix for the generated output files
dataset = 'dataset'

# Folder that holds the input .txt files, and folder that receives the results
indir = Path('/Path/to/indir/')
outdir = Path('/Path/to/outdir/')

# Make sure the output folder (including any missing parents) exists
outdir.mkdir(parents=True, exist_ok=True)

# Collect every .txt file located directly in indir, in sorted path order
allfiles = list(indir.glob("*.txt"))
allfiles.sort()

In [None]:
def save_corpus(corpus):
    """Return *corpus* in lower case with every space replaced by an underscore.

    The result is used to build names for output folders and files.
    """
    # Splitting on a literal space and re-joining with "_" swaps each space
    # one-for-one, so runs of spaces become runs of underscores.
    return "_".join(corpus.split(" ")).lower()

### Step 3: Load the data

Stores all text files from `indir` as strings in the list `all_docs`, and their file names (without extension) in the list `file_names`.

In [None]:
# Parallel lists: file_names[i] is the name (without extension) of the
# document whose full text is stored in all_docs[i]
file_names = []
all_docs = []

for infile in allfiles:
    # Read the whole file as one string (platform default encoding)
    try:
        text = infile.read_text()
    except UnicodeDecodeError as err:
        # Skip files that cannot be decoded: storing the name without a
        # matching text would leave the two lists out of step
        print(f"Skipping {infile.name}: {err}")
        continue
    # Only register the file once its text was read successfully
    file_names.append(infile.stem)
    all_docs.append(text)

In [None]:
# Show the name of every loaded document, one per line
if file_names:
    print("\n".join(file_names))

### Step 4: Tf-idf analysis

From: https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
See also: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [None]:
# Configure the TF-IDF weighting: ignore terms that occur in more than 65% of
# the documents, keep terms that occur in at least one document, apply no
# stop-word list, use inverse-document-frequency weighting and leave the
# resulting document vectors unnormalised.
vectorizer = TfidfVectorizer(
    norm=None,
    use_idf=True,
    stop_words=None,
    min_df=1,
    max_df=0.65,
)

# Learn the vocabulary from all documents and build the document-term matrix
transformed_documents = vectorizer.fit_transform(all_docs)

In [None]:
# Turn the TF-IDF matrix into a dense array with one row per document
transformed_documents_as_array = transformed_documents.toarray()

# Sanity check: the array must hold exactly one row for every name in the file list
transformed_documents_as_array.shape[0] == len(file_names)

In [None]:
# Print the dense TF-IDF array for a quick visual inspection
print(transformed_documents_as_array)

In [None]:
# Output folders for the per-document CSV tables and bar charts; the dataset
# name is normalised once and reused for both folder names
corpus_slug = save_corpus(dataset)
outdir_tfidf_csv = outdir / f'{corpus_slug}_tfidf_csv/'
outdir_tfidf_csv.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist
outdir_tfidf_png = outdir / f'{corpus_slug}_tfidf_png/'
outdir_tfidf_png.mkdir(parents=True, exist_ok=True)  # Create the output directory if it doesn't exist

# Set number of n top terms
n = 20

# The vocabulary is identical for every document, so fetch it once instead of
# on every loop iteration
feature_names = vectorizer.get_feature_names_out()

# Walk through the documents, pairing each row of scores with its file name
for original_filename, doc in zip(file_names, transformed_documents_as_array):
    # Construct a dataframe of (term, score) pairs, highest score first
    tf_idf_tuples = list(zip(feature_names, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # Output the full ranking to CSV (the index column holds the rank, from 0)
    one_doc_as_df.to_csv(outdir_tfidf_csv / f'{original_filename}_tfidf.csv')

    # Select top terms
    top_terms = one_doc_as_df.head(n)

    # Create a separate figure for each document
    plt.figure(figsize=(10, 6))
    plt.bar(top_terms['term'], top_terms['score'])
    plt.xlabel('Top terms')
    plt.ylabel('tf-idf score')
    plt.title(f'Top {n} terms with highest tf-idf scores in {original_filename}')
    plt.xticks(rotation=45)

    # Save individual chart with the original filename
    plt.savefig(outdir_tfidf_png / f"{original_filename}_tfidf.png", dpi=300)
    plt.show()
    plt.close()  # Close the figure to free memory

### Step 5: Cosine Similarity

In [None]:
# Build a matrix of raw token counts with one row per document
vectorizer2 = CountVectorizer()
count_matrix = vectorizer2.fit_transform(all_docs)
# Pairwise cosine similarity between the count vectors of all documents
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

Turn `cosine_sim` into a pandas DataFrame so that it can be visualized as a heatmap. Name the columns and the index after the corresponding file names (e.g. the year of each document).

In [None]:
# Label both axes of the similarity matrix with the document names, so that
# each cell holds the similarity between its row document and column document
df_all_docs = pd.DataFrame(cosine_sim, index=file_names, columns=file_names)

In [None]:
# Preview the first rows of the similarity table
print(df_all_docs.head())

In [None]:
# Plot the cosine similarity scores themselves (not the correlation between
# the similarity columns), so the figure matches its title.
# Rows are labelled with the same document names as the columns.
similarity_scores = df_all_docs.set_axis(df_all_docs.columns, axis=0)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(similarity_scores, square=True, cmap='RdYlGn', ax=ax)
plt.title('Heatmap of Cosine Similarity scores')
# Normalise the dataset name the same way as for the TF-IDF output folders
plt.savefig(outdir / f'{save_corpus(dataset)}_heatmap.png', dpi=300)
plt.show()