# Doing things with text 5

## TF-IDF on multiple texts

### Import packages

In [None]:
import os
from bs4 import BeautifulSoup
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Define in- and out-directories

`indir` is a folder on your computer that contains multiple text files. `outdir` is a folder (created if it does not exist yet) to store the output files (here: one CSV file with tf-idf scores per document).

In [None]:
# Folder with the input text files, and folder that receives the output files.
indir = r'/path/to/indir/'
outdir = r'/path/to/outdir/'
# Create outdir itself if it doesn't exist already. os.path.dirname(outdir) only
# names outdir when the path ends in a separator; without one it would create
# the parent folder instead.
os.makedirs(outdir, exist_ok=True)

#### User defined stopwords (for wordcloud and Counter)

In [None]:
# Extra words to ignore on top of the language stop word lists.
# Every word is listed once, in the order of its first appearance.
stopword_list = [
    'chorus', 'verse', 'outro', 'that', 'with', 'said', 'this', 'when',
    'them', 'were', 'from', 'will', 'there', 'they', 'then', 'their',
    'your', 'would', 'only', 'even', 'know', 'could', 'have', 'where',
    'come', 'been', 'made', 'well',
]

## Preprocessing

In [None]:
def remove_html(text):
    """Return the plain text of `text`, parsed by BeautifulSoup with the lxml parser."""
    return BeautifulSoup(text, "lxml").get_text()

def remove_short_words(words, n=5):
    """Keep only the words that are at least `n` characters long."""
    return [token for token in words if len(token) >= n]

def remove_non_ascii(words):
    """Fold every tokenized word to ASCII.

    Each word is NFKD-normalized first, so an accented letter keeps its base
    letter; whatever still is not ASCII afterwards is dropped. A word made up
    of such characters only becomes an empty string.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

def remove_punctuation(words):
    """Strip all characters that are neither word characters nor whitespace.

    Words that end up empty are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Drop every word that consists of digits only (as judged by str.isdigit)."""
    return [token for token in words if not token.isdigit()]

def to_lowercase(words):
    """Return the tokenized words with every character lowercased."""
    return [token.lower() for token in words]

def remove_stop_words_languages(words, languages=('dutch', 'french')):
    """Remove the NLTK stop words of the given languages from `words`.

    Args:
        words: list of tokenized words.
        languages: names of the NLTK stop word lists to combine. The default
            is a tuple rather than a list, so the default value cannot be
            mutated between calls.

    Returns:
        A new list with the words that are in none of the stop word lists.
    """
    # Collect the stop words of all languages in one set: duplicates across
    # languages disappear and membership tests are fast.
    stop_words = set()
    for language in languages:
        stop_words.update(stopwords.words(language))
    return remove_stop_words(words, stop_words)

def remove_stop_words(words, stop_words):
    """Return the words that are not stop words.

    Args:
        words: list of tokenized words.
        stop_words: iterable of words to remove (list, tuple, set, ...).

    Returns:
        A new list with the remaining words, in their original order.
    """
    # Testing membership on a list rescans it for every word, and a one-shot
    # iterator would be consumed by the first test; a set built once avoids both.
    blocked = frozenset(stop_words)
    return [word for word in words if word not in blocked]

In [None]:
def all_operations(words):
    """Run the complete cleaning pipeline on a list of tokenized words.

    Stop words are removed twice. The first pass runs on the lowercased but
    otherwise untouched tokens: stop word lists contain accented entries
    (the French list has e.g. "étaient"), which no longer match once
    remove_non_ascii has folded them to plain ASCII. The second pass runs
    after folding and punctuation stripping, and removes tokens that only
    then turn out to be stop words.

    Args:
        words: list of tokenized words.

    Returns:
        A new list with the cleaned words.
    """
    # Pass 1: match stop words while their accents are still in place
    words = remove_stop_words_languages(to_lowercase(words))
    words = remove_non_ascii(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
    # Lowercase (again) after folding: compatibility characters such as the
    # trademark sign fold to upper-case ASCII letters
    words = to_lowercase(words)
    # Pass 2: match stop words on the folded, punctuation-free tokens
    words = remove_stop_words_languages(words)
    words = remove_short_words(words)
    return words

#### Call functions

Stores all text files from indir as cleaned strings in a list.

In [None]:
# Cleaned text of every document, and the matching file names without extension.
# Both lists are filled in the same order, so position i refers to the same file.
all_docs = []
file_names = []

# list all files in a given directory
files = os.listdir(indir)

for infile in sorted(files):
    # avoid opening files such as .DS_Store
    if infile.startswith('.'):
        continue
    # open the file and read it (with the default encoding of the platform);
    # the file is closed again as soon as the with-block ends
    try:
        with open(os.path.join(indir, infile), "r") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as error:
        # Sub-folders, unreadable files and files that cannot be decoded are
        # reported and skipped, so no stale text of a previous file is reused
        print(f"Skipping {infile}: {error}")
        continue
    # remove html
    clean_text = remove_html(text)
    # tokenize to words (needed for subsequent operations)
    words = word_tokenize(clean_text)
    words = all_operations(words)
    words_as_string = " ".join(words)
    # Register the name only after the file was processed successfully;
    # splitext also copes with extensions that are not four characters long
    file_names.append(os.path.splitext(infile)[0])
    all_docs.append(words_as_string)

In [None]:
# Show the names collected while reading indir, to check which files were loaded
print(file_names)

In [None]:
# Inspect the cleaned text of the third document (index 2);
# this raises an IndexError when fewer than three documents were loaded
print(all_docs[2])

## Analysis

### Tf-idf

From: https://programminghistorian.org/en/lessons/analyzing-documents-with-tfidf
See also: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

In [None]:
# Configure the tf-idf model: max_df=.65 ignores terms that occur in more than 65% of the
# documents, min_df=1 keeps terms that occur in at least one document, and norm=None leaves
# the scores unnormalized. No stop word list is passed; all_docs was cleaned beforehand.
vectorizer = TfidfVectorizer(max_df=.65, min_df=1, stop_words=None, use_idf=True, norm=None)
# Learn the vocabulary from all_docs and compute the document-term matrix of tf-idf scores
transformed_documents = vectorizer.fit_transform(all_docs)

In [None]:
# Dense version of the tf-idf matrix
transformed_documents_as_array = transformed_documents.toarray()
# The number of rows should equal the number of documents in the file list
number_of_docs = transformed_documents_as_array.shape[0]

In [None]:
# Show the dense tf-idf array
print(transformed_documents_as_array)

In [None]:
# Set number of n top terms
n = 20

# One output file per document, named after the text file it was computed from
output_filenames = [name + ".csv" for name in file_names]

# Instantiate one subplot per document. squeeze=False always returns a 2-D array
# of axes; taking its only column keeps axs indexable by document number, also
# when there is just a single document.
fig, axs = plt.subplots(number_of_docs, 1, figsize=[20, 120], sharey=False, squeeze=False)
axs = axs[:, 0]

# The vocabulary is the same for every document, so look it up once
terms = vectorizer.get_feature_names_out()

# loop each item in transformed_documents_as_array, using enumerate to keep track of the current position
for counter, doc in enumerate(transformed_documents_as_array):
    # construct a dataframe with one row per term, highest score first
    tf_idf_tuples = list(zip(terms, doc))
    one_doc_as_df = (
        pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score'])
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # output to a csv using the enumerated value for the filename
    one_doc_as_df.to_csv(os.path.join(outdir, output_filenames[counter]))

    # bar chart of the n highest scoring terms of this document
    top_terms = one_doc_as_df[:n]
    ax = axs[counter]
    ax.bar(top_terms['term'], top_terms['score'])
    ax.set_xlabel('Top terms')
    ax.tick_params(labelrotation=45)
    ax.set_ylabel('tf-idf score')
    ax.set_title(f'Top {n} terms with highest tf-idf scores in {output_filenames[counter]}')

# Uncomment to save the figure next to the csv files:
# plt.savefig(os.path.join(outdir, 'tfidf_top_terms.png'), dpi=300)

plt.show()

### Cosine Similarity

In [None]:
# Count how often each term occurs in each document
vectorizer2 = CountVectorizer()
count_matrix = vectorizer2.fit_transform(all_docs)
# Pairwise cosine similarity between the count vectors of all documents:
# entry [i, j] compares document i with document j
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

Turn cosine_sim into a pandas DataFrame so that it can be visualized as a heatmap. Name the columns and the index after the corresponding file (e.g. its year).

In [None]:
# Label both the rows and the columns of the similarity matrix with the document
# names, so that the cell at (row, column) compares those two documents
df_all_docs = pd.DataFrame(cosine_sim, index=file_names, columns=file_names)

In [None]:
# Show the first rows of the similarity table as a quick check
print(df_all_docs.head())

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
# Plot the cosine similarity scores themselves. Calling .corr() on the table
# first would show the Pearson correlation between its columns instead, which
# is not what the title promises. Both axes are labelled with the column
# names, so the labels are right whatever the index of the table is.
sns.heatmap(
    df_all_docs,
    xticklabels=df_all_docs.columns,
    yticklabels=df_all_docs.columns,
    square=True,
    cmap='RdYlGn',
    ax=ax,
)
plt.title('Heatmap of Cosine Similarity scores')
plt.show()