# Doing things with text 3

## Importing and exploring multiple texts (from .csv files) as one corpus

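Works with csv files that are structured as tables with one text per row: the text is expected in a column named `text`, and the per-row analyses further down also expect a `date` column.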

### Import packages

In [None]:
import os
import csv
import glob
import re
from tqdm.notebook import tqdm
import pandas as pd
import unicodedata
import re
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud # needs to be installed first via pip install wordcloud
import matplotlib.pyplot as plt 
from collections import Counter

#### Define in- and out-directories

In [None]:
# Directory holding the input .csv files, and directory receiving all output
# (.csv and .png). Keep the trailing slash: other cells build file paths by
# plain string concatenation (e.g. outdir + 'name.png').
indir = r'/path/to/indir/'
outdir = r'/path/to/outdir/'
# Create outdir itself. os.path.dirname(outdir) would only create the parent
# directory whenever outdir is given without a trailing slash.
os.makedirs(outdir, exist_ok=True)
# glob returns files in arbitrary order; sort so the corpus is always
# assembled in the same order.
allfiles = sorted(glob.glob(os.path.join(indir, "*.csv")))

check what's in 'allfiles':

In [None]:
for file in allfiles:
    print(file)

**Check what the data structure of the csv's looks like (change 'file.csv' to one of the actual files in indir)**

In [None]:
df_test = pd.read_csv(indir + 'file.csv', sep='\t') # most common separators are ';' or ',' or '\t'
print(df_test.head())

#### User defined stopwords (for wordcloud and Counter). Change if needed!

In [None]:
custom_words = ['het', 'van', 'een', 'dat', 'zijn'] ### add words as list: 'word', 'word', 'word', etc.

In [None]:
def remove_user_defined_stopword_list(words, stopwords=None):
    """Return the items of `words` that are not stop words, in original order.

    Args:
        words: iterable of tokens to filter.
        stopwords: optional iterable of tokens to drop. When omitted, the
            notebook-level `custom_words` list is used (read at call time).

    Returns:
        A new list; `words` itself is not modified.
    """
    if stopwords is None:
        stopwords = custom_words
    # Build the lookup set once: testing membership against a list costs
    # O(len(list)) per token, which adds up over a whole corpus.
    blocked = set(stopwords)
    return [word for word in words if word not in blocked]

### Import csv's as df (with df['text'] as the text column), merge into one large dataframe called 'data'

#### option 1: importing csv's with raw text, preprocessing including tokenization (takes time)

In [None]:
frames = []  # one preprocessed dataframe per input file

for file in tqdm(allfiles):
    # Characters -8..-4 of the path, i.e. the four characters before the
    # extension; assumes file names end in a four-digit year
    # (e.g. 'corpus_2015.csv' -> '2015'). Used to name the output file.
    filename = file[-8:-4]
    df = pd.read_csv(file, sep="\t") # usecols = ['column_name', 'column_name'] if needed
    # Lowercase the raw text; empty cells become '' so the tokenizer always
    # receives a string. df['text'] itself is left untouched.
    lowered = df['text'].fillna('').str.lower() # where df['text'] is text; change if needed
    # Keep only purely alphabetic tokens of at least three characters.
    df['text_clean'] = [[w for w in word_tokenize(text) if w.isalpha() and len(w) >= 3]
                        for text in lowered]
    df['word_count'] = df['text_clean'].str.len()
    # The original `".csv" % (filename)` raised TypeError: the string had no
    # placeholder for the argument.
    df.to_csv(os.path.join(outdir, f"{filename}.csv"), sep=",") # save df to .csv in outdir
    frames.append(df)

# Concatenate once at the end: growing `data` inside the loop re-copies all
# previously loaded rows on every iteration.
data = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

#### option 2: importing csv's with (relatively) clean text, preprocessing without tokenization (= quicker!)

In [None]:
frames = []  # one preprocessed dataframe per input file

for file in tqdm(allfiles):
    # Characters -8..-4 of the path, i.e. the four characters before the
    # extension; assumes file names end in a four-digit year
    # (e.g. 'corpus_2015.csv' -> '2015'). Used to name the output file.
    filename = file[-8:-4]
    df = pd.read_csv(file, sep="\t") # usecols = ['column_name', 'column_name'] if needed
    # Lowercase and split on whitespace into a temporary series. df['text']
    # must stay a column of strings: the cell that builds `input_as_string`
    # joins it with " ".join(...), which fails on lists.
    tokens = df['text'].fillna('').str.lower().str.split() # where df['text'] is text; change if needed
    # Same filter as option 1 (alphabetic, at least three characters), so
    # three-letter words are available for the word-length analysis.
    df['text_clean'] = [[w for w in text if w.isalpha() and len(w) >= 3]
                        for text in tokens]
    df['word_count'] = df['text_clean'].str.len()
    # The original `".csv" % (filename)` raised TypeError: the string had no
    # placeholder for the argument.
    df.to_csv(os.path.join(outdir, f"{filename}.csv"), sep=",") # save df to .csv in outdir
    frames.append(df)

# Concatenate once at the end: growing `data` inside the loop re-copies all
# previously loaded rows on every iteration.
data = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

**Turn text_clean content as list into string**

In [None]:
data['text_clean_str'] = data['text_clean'].apply(' '.join)

In [None]:
print(data.head())

## Make single lists and strings

**Turn text_clean data into big list 'input_as_list'**

In [None]:
input_as_list = [item for sublist in data['text_clean'] for item in sublist]

**Turn text data into big string 'input_as_string'**

In [None]:
input_as_string = " ".join(data['text'])

## Count total number of words

Function to count words in a string by splitting on whitespace

In [None]:
def word_count(string):
    """Return how many whitespace-separated chunks `string` contains."""
    return len(string.split())

**Count total number of tokens (words) in raw text from 'file' before preprocessing**

In [None]:
print("The total number of words in \'%s\' before preprocessing is: %s" 
      %(str(indir), word_count(input_as_string))) # Call function for 'text'

**Count total number of tokens (words) in text from 'file' after preprocessing**

In [None]:
print("The total number of words in \'%s\' after preprocessing is: %s" 
      %(str(indir), len(input_as_list))) # Calculate length of list 'input_as_list'

**Calculate number of tokens removed by preprocessing**

In [None]:
print("The total number of tokens removed by preprocessing is: %s" 
      %(word_count(input_as_string) - len(input_as_list)))

## Identify and count most common words

### **Step 1: most common words globally**

In [None]:
number_top_words = 20 # set number of most common words to print/plot

`word_counts_total` below is a Counter object that holds the frequency of each word in `input_as_list`. It feeds the bar chart below. Words that need to be removed from the bar chart can be added to the custom stopword list `custom_words` above.

In [None]:
word_counts_total = Counter(remove_user_defined_stopword_list(input_as_list))
most_common_total = word_counts_total.most_common(number_top_words)

**Count the total number of types (unique words) in text from 'file' after preprocessing**

In [None]:
print("The total number of types in \'%s\' is: %s" %(str(indir), len(word_counts_total)))

**Calculate lexical diversity by dividing number of types by number of tokens (= type token ratio, or TTR)**

In [None]:
# Type-token ratio as a percentage. An empty corpus has no tokens, so report
# 0.0 instead of raising ZeroDivisionError.
# NOTE(review): the types come from word_counts_total (custom stop words
# removed) while the tokens come from input_as_list (stop words included).
n_tokens = len(input_as_list)
ttr_percent = round(len(word_counts_total)/n_tokens*100, 1) if n_tokens else 0.0
print(f"The type token ratio of \'{str(indir)}\' is: {ttr_percent}%")

### Visualize most common words in a bar chart

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# Split the (word, count) pairs into bar heights (y) and bar labels (x).
y = [count for word, count in most_common_total]
x = [word for word, count in most_common_total]

# The figure size has to be set before the first plotting call creates the figure.
plt.rcParams["figure.figsize"] = (20,10)
plt.bar(x, y, color='crimson')
plt.title("Most common terms")
plt.ylabel("Counts")
plt.xlabel("Terms")
plt.rc('xtick',labelsize=12)
plt.rc('ytick',labelsize=12)
#plt.yscale('log') # optionally set a log scale for the y-axis
plt.xticks(rotation=45)
# Write each count at the top of its bar: the first ten labels are white and
# top-aligned at the bar height, the remaining ones black and bottom-aligned.
for i, (word, count) in enumerate(most_common_total):
    plt.text(i, count, f' {count} ', rotation=45, size=16,
             ha='center', va='top' if i < 10 else 'bottom', color='white' if i < 10 else 'black')
plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
plt.tight_layout() # change the whitespace such that all labels fit nicely
# Save before plt.show(), while the figure is still the current one.
plt.savefig(outdir + '2015_most_common.png', dpi=200, bbox_inches='tight') # change filename as wished
plt.show()

### Visualize most common words in a word cloud

#### Turn the cleaned text into one string

In [None]:
clean_input_as_string = " ".join(data['text_clean_str'])

**generate wordcloud**

In [None]:
text_cloud = WordCloud(background_color='white', stopwords=custom_words).generate(clean_input_as_string)

In [None]:
plt.imshow(text_cloud, interpolation='bilinear')
plt.axis('off')
plt.savefig(outdir + '2015_wordcloud.png', dpi=200, bbox_inches='tight') # change filename as wished
plt.show()

### Print and visualize most common words per dataframe row

In [None]:
# Print the most frequent words of every dataframe row, labelled with the
# row's 'date' value.
for date, row in zip(data['date'], data['text_clean']):
    row_counts = Counter(remove_user_defined_stopword_list(row))
    # Deliberately not called `most_common_words`: a function of that name is
    # defined further down, and re-running this cell would overwrite it.
    row_top_words = row_counts.most_common(number_top_words)
    print('Most common words in %s:' %(date))
    for word, count in row_top_words:
        print('%s: %7d' % (word, count))
    print('\n')

#### Most common words per dataframe row in a bar chart

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# Same size for every figure, so set it once before the loop.
plt.rcParams["figure.figsize"] = (20,10)

# One bar chart per dataframe row, titled with the row's 'date' value.
for date, row in zip(data['date'], data['text_clean']):
    row_counts = Counter(remove_user_defined_stopword_list(row))
    # Deliberately not called `most_common_words`: a function of that name is
    # defined further down, and re-running this cell would overwrite it.
    row_top_words = row_counts.most_common(100)

    y = [count for word, count in row_top_words]
    x = [word for word, count in row_top_words]

    plt.bar(x, y, color='crimson')
    plt.title("Top term frequencies in " + str(date))
    plt.ylabel("Counts")
    #plt.yscale('log') # optionally set a log scale for the y-axis
    plt.xticks(rotation=45)
    # Write each count at the top of its bar: the first ten labels are white
    # and top-aligned at the bar height, the remaining ones black and
    # bottom-aligned.
    for i, (word, count) in enumerate(row_top_words):
        plt.text(i, count, f' {count} ', rotation=45,
                 ha='center', va='top' if i < 10 else 'bottom', color='white' if i < 10 else 'black')
    plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
    plt.tight_layout() # change the whitespace such that all labels fit nicely
    plt.show()

### **Step 2: Most common words by word length**

In [None]:
# One list per word length; each collects every occurrence (token) of that length.
three_words = []
four_words = []
five_words = []
six_words = []
seven_words = []
eight_words = []
nine_words = []
ten_plus_words = []

# Exact-length buckets, looked up by len(word).
buckets_by_length = {
    3: three_words,
    4: four_words,
    5: five_words,
    6: six_words,
    7: seven_words,
    8: eight_words,
    9: nine_words,
}

for word in input_as_list:
    bucket = buckets_by_length.get(len(word))
    if bucket is not None:
        bucket.append(word)
    elif len(word) >= 10:
        ten_plus_words.append(word)
    # Words shorter than three characters are skipped. A bare `else` would
    # file them under "ten plus" if the minimum word length used during
    # preprocessing is ever lowered.

In [None]:
all_word_lists = {'three': three_words, 
                  'four': four_words, 
                  'five': five_words, 
                  'six': six_words, 
                  'seven': seven_words, 
                  'eight': eight_words, 
                  'nine': nine_words, 
                  'ten plus': ten_plus_words}

In [None]:
frequency_dict = {} # Create dictionary for bar chart below

# The loop variable is `words`, not `list`: a top-level variable called `list`
# would shadow the builtin for every cell run afterwards.
for number, words in all_word_lists.items():
    frequency_dict[number + " letter words"] = len(words)
    print("There are", len(words), number, "letter words in \'%s\'" %(str(indir)))

In [None]:
# One bar per word-length category, in the insertion order of frequency_dict.
plt.bar(range(len(frequency_dict)), frequency_dict.values(), align='center')
plt.title("Frequency of different word lengths")  # spelling fixed (was "lenghts")
plt.ylabel("Counts")
plt.xlabel("Word lengths")  # spelling fixed (was "lenghts")
# Label each bar position with its category name.
plt.xticks(range(len(frequency_dict)), frequency_dict.keys(), rotation=90)
plt.savefig(outdir + '2015_word_lengths.png', dpi=200, bbox_inches='tight') # change filename as wished
plt.show()

**Print the n most common words of different lengths**

In [None]:
def most_common_words(list, n=None): # n defaults to number_top_words, defined above
    """Print the number of distinct words in `list` and its `n` most frequent words.

    Args:
        list: iterable of words (tokens) to count. The parameter keeps its
            original name for backward compatibility, even though it shadows
            the builtin inside this function.
        n: how many of the most frequent words to print. When omitted, the
            notebook-level `number_top_words` is read at call time, so
            changing that setting later takes effect without redefining
            this function.

    Returns:
        None; the results are printed.
    """
    if n is None:
        n = number_top_words
    word_counts = Counter(list)
    # len(word_counts) is the number of unique words (types), not of tokens.
    print('Total number of types is: %s' % len(word_counts))
    print('Most frequent words:')
    for word, count in word_counts.most_common(n):
        print('%s: %10d' %(word, count))

In [None]:
most_common_words(ten_plus_words) # change list between brackets to print most frequent words with different length