# Doing things with text 4

## Importing (and analyzing) multiple texts as one corpus

### Import packages

In [None]:
import os
import os
import csv
import glob
import re
from tqdm.notebook import tqdm
import pandas as pd
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from collections import Counter

#### Define in- and out-directories

`indir` is a folder on your computer that contains multiple CSV files. `outdir` is a folder (created if it does not exist yet) for storing cleaned versions of the files.

In [None]:
# Folder that holds the input CSV files.
indir = r'/Users/huijn001/surfdrive/data_lokaal/test/'
# Folder for cleaned versions of the files.
outdir = r'/Users/huijn001/surfdrive/data_lokaal/test1/'
# Create outdir itself: os.path.dirname(outdir) would only create the parent
# folder whenever outdir is written without a trailing slash.
os.makedirs(outdir, exist_ok=True)
# glob returns files in arbitrary order; sort them so the merged corpus has
# the same row order on every run.
allfiles = sorted(glob.glob(os.path.join(indir, "*.csv")))

#### User-defined stopwords (for the word cloud and Counter). Change if needed!

In [None]:
# Extra words to leave out of the word cloud and the word counts.
# Add them as quoted, comma-separated strings: 'word', 'word', 'word', etc.
stopword_list = list()

In [None]:
def remove_user_defined_stopword_list(words, stopwords=None):
    """Return the tokens in `words` that are not stopwords.

    Args:
        words: Iterable of tokens (strings) to filter.
        stopwords: Optional iterable of words to drop. When omitted, the
            notebook-level `stopword_list` is used.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stopwords is None:
        stopwords = stopword_list
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per token.
    excluded = set(stopwords)
    return [word for word in words if word not in excluded]

Import each CSV file as a dataframe (only the columns 'date', 'Content' and 'year') and merge them into one large dataframe, `data`.

In [None]:
# Read every CSV, tokenize its text, and merge everything into `data`.
frames = []

for filename in tqdm(allfiles):
    df = pd.read_csv(filename, sep=";", usecols=['date', 'Content', 'year'])
    # Empty 'Content' cells are read as NaN (a float), which word_tokenize
    # cannot process; treat them as empty text instead of crashing.
    df['text_clean'] = df['Content'].fillna('').astype(str).str.lower()
    # Keep only purely alphabetic tokens longer than three characters.
    df['text_clean'] = [
        [w for w in word_tokenize(text) if w.isalpha() and len(w) > 3]
        for text in df['text_clean']
    ]
    # Number of tokens that survived the filtering.
    df['word_count'] = df['text_clean'].str.len()
    frames.append(df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# growing dataframe for every file. With no input files, keep an empty frame.
data = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

Turn the token lists in `text_clean` into single strings:

In [None]:
# Join each row's token list back into one space-separated string.
data['text_clean_str'] = [" ".join(tokens) for tokens in data['text_clean']]

In [None]:
# Show the first five rows of the merged dataframe as a quick sanity check.
print(data.head(n=5))

## Analysis

### Word clouds

In [None]:
# Collect the token list of every text into one plain Python list.
# Series.tolist() replaces the manual loop, whose loop variable was named
# `list` and therefore shadowed the builtin for the rest of the session.
all_texts_list = data['text_clean'].tolist()

In [None]:
# Show the tokens of one sample text; guard the index so a corpus with fewer
# texts produces a message instead of an IndexError.
sample_index = 1000
if sample_index < len(all_texts_list):
    print(all_texts_list[sample_index])
else:
    print(f'No text at index {sample_index}: the corpus contains only {len(all_texts_list)} texts.')

In [None]:
# Merge the cleaned text of every row into one long space-separated string.
cleaned_texts = data['text_clean_str'].tolist()
all_texts_string = " ".join(cleaned_texts)

In [None]:
# Configure the word cloud (white background, user-defined stopwords) and
# build it from the combined corpus text.
cloud_builder = WordCloud(background_color='white', stopwords=stopword_list)
text_cloud = cloud_builder.generate(all_texts_string)

In [None]:
# Draw the generated word cloud as an image, using bilinear interpolation.
plt.imshow(text_cloud, interpolation='bilinear')
# Turn the axes off so only the cloud itself is shown.
plt.axis('off')
#plt.savefig('/Users/huijn001/Desktop/got.png', dpi=300, bbox_inches='tight') # To save word cloud to your computer
plt.show()

### Word counts

In [None]:
# Print the 100 most frequent words of every text, after removing the
# user-defined stopwords.
for date, row in zip(data['date'], data['text_clean']):
    row = remove_user_defined_stopword_list(row)
    word_counts = Counter(row)
    most_common_words = word_counts.most_common(100)
    # str() keeps this working when 'date' was parsed as a number or is
    # missing, matching how the bar-chart title is built.
    print('Most common words in ' + str(date))
    for word, count in most_common_words:
        # Right-align the count in a 7-character column.
        print(f'{word}: {count:7d}')

#### Visualize word counts in all texts in a bar chart

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# The figure size is the same for every chart, so set it once instead of on
# every iteration.
plt.rcParams["figure.figsize"] = (20, 10)

# Draw one bar chart per text with its 100 most frequent words.
for date, row in zip(data['date'], data['text_clean']):
    row = remove_user_defined_stopword_list(row)
    word_counts = Counter(row)
    most_common_words = word_counts.most_common(100)

    x = [word for word, count in most_common_words]
    y = [count for word, count in most_common_words]

    plt.bar(x, y, color='crimson')
    plt.title("Top term frequencies in " + str(date))
    plt.ylabel("Counts")
    #plt.yscale('log') # optionally set a log scale for the y-axis
    plt.xticks(rotation=45)
    # Label every bar with its count: the first ten labels are white and
    # top-aligned, the rest are black and bottom-aligned.
    for i, (word, count) in enumerate(most_common_words):
        first_ten = i < 10
        plt.text(i, count, f' {count} ', rotation=45,
                 ha='center', va='top' if first_ten else 'bottom',
                 color='white' if first_ten else 'black')
    plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
    plt.tight_layout() # change the whitespace such that all labels fit nicely
    plt.show()