# Doing things with text 5

## TF-IDF on multiple texts _for preprocessed texts_

### Import packages

In [None]:
import os
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Define in- and out-directories

In [None]:
# Input and output folders.
# os.path.join(path, '') guarantees a trailing separator, so the
# `indir + filename` / `outdir + filename` concatenations used further down
# work whether or not the path is typed with a final slash.
indir = os.path.join(r'/path/to/indir/', '')
outdir = os.path.join(r'/path/to/outdir/', '')
# Create outdir itself (plus any missing parents); no error if it already exists.
# (os.path.dirname(outdir) would only create the parent when the slash is missing.)
os.makedirs(outdir, exist_ok=True)

dataset = 'dataset' # give a name to your dataset for outfiles

#### User defined stopwords (for Counter)

In [None]:
# Custom stopwords; empty by default.
# NOTE(review): no cell visible in this notebook reads stopword_list — confirm where it is used.
stopword_list = [] # add custom words as 'word', 'word', 'word', etc.

### Open text files

Stores all text files from indir as strings in a list.

In [None]:
file_names = []     # file names without the '.txt' extension, one per loaded document
input_as_list = []  # space-separated tokens of all loaded documents in one flat list
all_docs = []       # full text of every loaded document, aligned with file_names

# Encoding used to read the input files; change this if your texts are not UTF-8
text_encoding = 'utf-8'

# Candidate input files: visible (no leading '.', e.g. .DS_Store) .txt files only.
# Sorted because os.listdir returns entries in arbitrary order.
candidates = sorted(
    f for f in os.listdir(indir)
    if f.endswith('.txt') and not f.startswith('.')
)

# `files` only holds the files that were actually loaded, so it stays aligned
# index-by-index with file_names and all_docs
files = []

for infile in candidates:
    # read the whole file; on failure report the real error and skip the file
    # instead of appending an undefined or stale `text`
    try:
        with open(os.path.join(indir, infile), "r", encoding=text_encoding) as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        print(f"Skipping {infile}: {err}")
        continue

    files.append(infile)
    # add filename (without '.txt') to list file_names
    file_names.append(infile[:-4])
    # add text to list of text strings all_docs
    all_docs.append(text)
    # add the space-separated words to input_as_list
    input_as_list.extend(text.split(' '))

In [None]:
# Show the names of the loaded documents, one per line
if file_names:
    print('\n'.join(file_names))

## Analysis

### Tf-idf

From: https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
See also: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [None]:
# Tf-idf settings:
#   max_df=.65      -> ignore terms that occur in more than 65% of the documents
#   min_df=1        -> keep terms that occur in at least one document
#   stop_words=None -> no stopword filtering by the vectorizer
#   use_idf=True    -> weight term frequencies by inverse document frequency
#   norm=None       -> scores are not length-normalised
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
# Learn the vocabulary from all_docs and return the document-term matrix
transformed_documents = vectorizer.fit_transform(all_docs)

In [None]:
# Convert the tf-idf matrix to a dense numpy array (one row per document)
transformed_documents_as_array = transformed_documents.toarray()
# The row count should equal the number of documents in the file list
number_of_docs = transformed_documents_as_array.shape[0]

In [None]:
# Print the tf-idf array for a quick visual check
print(transformed_documents_as_array)

In [None]:
# Set number of n top terms
n = 20

# One csv file name per document, built from file_names so that position i
# always belongs to row i of transformed_documents_as_array
output_filenames = [name + '.csv' for name in file_names]

# The vocabulary is the same for every document, so look it up once
terms = vectorizer.get_feature_names_out()

# Instantiate n subplots, based on number of documents.
# squeeze=False always returns a 2-D array of Axes (even for a single document,
# where plt.subplots would otherwise return one bare Axes that cannot be indexed);
# ravel() flattens it back to the 1-D array used below.
fig, axs = plt.subplots(number_of_docs, 1, figsize=[20, 20], sharey=False, squeeze=False)
axs = axs.ravel()

# loop each item in transformed_documents_as_array, using enumerate to keep track of the current position
for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe of (term, score) pairs, highest score first
    tf_idf_tuples = list(zip(terms, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # output to csv in outdir using the enumerated value for the filename
    one_doc_as_df.to_csv(os.path.join(outdir, output_filenames[counter]))

    # bar chart of the n highest-scoring terms of this document
    top_terms = one_doc_as_df[:n]
    ax = axs[counter]
    ax.bar(top_terms['term'], top_terms['score'])
    ax.set_xlabel('Top terms')
    ax.tick_params(labelrotation=45)
    ax.set_ylabel('tf-idf score')
    ax.set_title('Top ' + str(n) + ' terms with highest tf-idf scores in ' + str(output_filenames[counter]))

plt.savefig(os.path.join(outdir, dataset + '_tf_idf.png'), dpi=300)

plt.show()

### Cosine Similarity

In [None]:
# Raw term counts per document, using the default CountVectorizer settings
vectorizer2 = CountVectorizer()
count_matrix = vectorizer2.fit_transform(all_docs)
# Pairwise cosine similarity between all rows (documents) of count_matrix
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

Turn cosine_sim into a pandas DataFrame so it can be visualized as a heatmap. Name the columns and the index after the corresponding file names (e.g. the year of each text).

In [None]:
# Square document-by-document similarity table; rows and columns are both
# labelled with the file names (the original index assignment was a no-op
# that left the rows numbered 0..n-1)
df_all_docs = pd.DataFrame(cosine_sim, index=file_names, columns=file_names)

In [None]:
# Preview the first rows of the similarity table
print(df_all_docs.head())

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
# Plot the cosine similarity scores themselves. Calling .corr() here would plot
# the Pearson correlation between the similarity columns instead, which is not
# what the title announces.
# yticklabels is passed explicitly so the rows carry the document names
# regardless of how the DataFrame index is labelled.
sns.heatmap(df_all_docs, square=True, cmap='RdYlGn',
            yticklabels=list(df_all_docs.columns), ax=ax)
plt.title('Heatmap of Cosine Similarity scores')
plt.savefig(os.path.join(outdir, dataset + '_heatmap.png'), dpi=300)
plt.show()