# Doing things with text 2

## Counting words from a \*preprocessed\* text

### Import packages

In [None]:
from bs4 import BeautifulSoup
import unicodedata
import re
import os
from nltk.tokenize import word_tokenize  # needs to be installed first via nltk.download()
from nltk.corpus import stopwords  # needs to be installed first via nltk.download()
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
from collections import Counter

### Import and read text file

In [None]:
# Input and output directories. Output file names are appended to outdir with
# `+` further down, so keep the trailing slash on these paths.
indir = r'/path/to/indir/'
outdir = r'/path/to/outdir/'
# Create outdir itself (plus any missing parents). Going through
# os.path.dirname(outdir) would create only the parent directory whenever the
# trailing slash is left off.
os.makedirs(outdir, exist_ok=True)

In [None]:
file = 'infile.txt'  # change 'infile' for actual file name
# os.path.join adds a separator only when indir does not already end with one,
# so the path is valid with or without a trailing slash on indir.
file_path = os.path.join(indir, file)

In [None]:
# Load the whole UTF-8 encoded input file into memory as a single string.
with open(file_path, mode='r', encoding='utf8') as infile:
    text = infile.read()

In [None]:
# Show the first 100 characters to check what was read.
preview = text[:100]
print(preview)

In [None]:
# Split on any run of whitespace. Unlike split(' '), this produces no empty
# strings for repeated spaces and leaves no newline or tab characters attached
# to tokens (e.g. at line ends or at the end of the file).
input_as_list = text.split()

In [None]:
# Show the first 100 list items as a quick check of the split.
first_tokens = input_as_list[:100]
print(first_tokens)

In [None]:
# Total number of items (tokens) in the list.
token_total = len(input_as_list)
print(token_total)

**If needed, remove custom stop words**

In [None]:
def custom_stop_words(words, stop_words=None):
    """Return the words that are not custom stop words, in their original order.

    Args:
        words: Iterable of word strings to filter.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``custom_words`` list is used.

    Returns:
        A new list; ``words`` itself is not modified.
    """
    if stop_words is None:
        # Looked up at call time, so custom_words may be defined after this function.
        stop_words = custom_words
    # A set gives constant-time membership tests instead of scanning a list per word.
    excluded = frozenset(stop_words)
    return [word for word in words if word not in excluded]

In [None]:
# Custom stop words, written as one whitespace-separated string and split
# into a list of individual words.
custom_words = (
    'the and that with said this when '
    'them were from will there they then '
    'their your would only even know could '
    'have where come been made well'
).split()

In [None]:
# Replace the token list with the filtered list returned by custom_stop_words().
input_as_list = custom_stop_words(words=input_as_list)

## Identify and count most common words

### **Step 1: most common words globally**

In [None]:
# How many of the most frequent words to keep for printing and plotting;
# read by the most_common() call below and by most_common_words().
number_top_words = 20 # set number of most common words to print/plot

In [None]:
# Tally every token, then keep the `number_top_words` most frequent
# (word, count) pairs.
word_counts_total = Counter()
word_counts_total.update(input_as_list)
most_common_total = word_counts_total.most_common(number_top_words)

**Count the total number of types (unique words) in text from 'file' after preprocessing**

In [None]:
# Each distinct key in the counter is one type.
type_total = len(word_counts_total)
print(f"The total number of types in '{file}' is: {type_total}")

**Calculate lexical diversity by dividing number of types by number of tokens (= type token ratio, or TTR)**

In [None]:
# Type-token ratio as a percentage, rounded to one decimal place. The division
# is guarded so that an empty token list produces a readable message instead
# of a ZeroDivisionError.
if input_as_list:
    type_token_ratio = round(len(word_counts_total) / len(input_as_list) * 100, 1)
    print(f"The type token ratio of '{file}' is: {type_token_ratio}%")
else:
    print(f"The type token ratio of '{file}' is undefined: the text contains no tokens")

### Visualize most common words in a bar chart

In [None]:
#### From https://stackoverflow.com/questions/63018726/counter-and-plot-the-most-common-word-in-a-text ####

# Split the (word, count) pairs into bar labels (x) and bar heights (y).
x = [pair[0] for pair in most_common_total]
y = [pair[1] for pair in most_common_total]

plt.rcParams["figure.figsize"] = (20,10)
plt.bar(x, y, color='crimson')
plt.title("Most common terms")
plt.ylabel("Counts")
plt.xlabel("Terms")
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=12)
#plt.yscale('log') # optionally set a log scale for the y-axis
plt.xticks(rotation=45)

# Write each count at the top of its bar. The first ten labels are white and
# anchored by their top edge; the remaining ones are black and anchored by
# their bottom edge.
for position, (term, count) in enumerate(most_common_total):
    if position < 10:
        label_anchor, label_color = 'top', 'white'
    else:
        label_anchor, label_color = 'bottom', 'black'
    plt.text(position, count, f' {count} ', rotation=45, size=16,
             ha='center', va=label_anchor, color=label_color)

plt.xlim(-0.6, len(x)-0.4) # optionally set tighter x lims
plt.tight_layout() # change the whitespace such that all labels fit nicely
bar_chart_png = outdir + '2015_most_common.png' # change filename as wished
plt.savefig(bar_chart_png, dpi=200, bbox_inches='tight')
plt.show()

### Visualize most common words in a word cloud

#### Turn input_as_list into string

In [None]:
# Glue the tokens back together into one space-separated string.
input_as_string = str.join(" ", input_as_list)

**Generate the word cloud**

In [None]:
# Collect the WordCloud settings first, then build the cloud from the joined text.
cloud_settings = {'background_color': 'white', 'stopwords': custom_words}
text_cloud = WordCloud(**cloud_settings).generate(input_as_string)

In [None]:
# Draw the word cloud without axes, write it to outdir, then display it.
wordcloud_png = outdir + '2015_wordcloud.png' # change filename as wished
plt.imshow(text_cloud, interpolation='bilinear')
plt.axis('off')
plt.savefig(wordcloud_png, dpi=200, bbox_inches='tight')
plt.show()

### **Step 2: Most common words by word length**

In [None]:
# One list per word length; these names are used by the cells further down.
three_words = []
four_words = []
five_words = []
six_words = []
seven_words = []
eight_words = []
nine_words = []
ten_plus_words = []

# Exact word length -> the list that collects words of that length.
exact_length_buckets = {
    3: three_words,
    4: four_words,
    5: five_words,
    6: six_words,
    7: seven_words,
    8: eight_words,
    9: nine_words,
}

for word in input_as_list:
    word_length = len(word)
    if word_length >= 10:
        ten_plus_words.append(word)
    elif word_length in exact_length_buckets:
        exact_length_buckets[word_length].append(word)
    # Words shorter than three characters (including empty strings) belong to
    # no bucket. A bare `else` here would count them as ten-plus-letter words.

In [None]:
# Pair each length label with its word list, in ascending length order.
length_labels = ('three', 'four', 'five', 'six',
                 'seven', 'eight', 'nine', 'ten plus')
length_buckets = (three_words, four_words, five_words, six_words,
                  seven_words, eight_words, nine_words, ten_plus_words)
all_word_lists = dict(zip(length_labels, length_buckets))

In [None]:
frequency_dict = {} # Create dictionary for bar chart below

# The loop variable is named `words`, not `list`: at notebook scope `list`
# would rebind the builtin for every cell that runs afterwards.
for number, words in all_word_lists.items():
    frequency_dict[number + " letter words"] = len(words)
    print(f"There are {len(words)} {number} letter words in '{file}'")

In [None]:
# One bar per word-length bucket, labelled with the bucket names from
# frequency_dict. The title and x-label spell "lengths" correctly.
bar_positions = range(len(frequency_dict))
plt.bar(bar_positions, frequency_dict.values(), align='center')
plt.title("Frequency of different word lengths")
plt.ylabel("Counts")
plt.xlabel("Word lengths")
plt.xticks(bar_positions, frequency_dict.keys(), rotation=90)
plt.savefig(outdir + '2015_word_lengths.png', dpi=200, bbox_inches='tight') # change filename as wished
plt.show()

**Print the n most common words of different lengths**

In [None]:
def most_common_words(list, n=None):
    """Print the number of distinct words in a word list and its most frequent words.

    Args:
        list: Iterable of words to count. The name shadows the builtin inside
            this function; it is kept so existing keyword calls keep working.
        n: How many of the most frequent words to print. When omitted, the
            current value of the global ``number_top_words`` is used.

    Returns:
        None. The results are printed.
    """
    if n is None:
        # Read at call time, so changing number_top_words later takes effect
        # without re-running this definition.
        n = number_top_words
    word_counts = Counter(list)
    # len() of a Counter is its number of distinct keys, i.e. types, not tokens.
    print(f'Total number of types is: {len(word_counts)}')
    print('Most frequent words:')
    for word, count in word_counts.most_common(n):
        print(f'{word}: {count:10d}')

In [None]:
# Print the summary for the ten_plus_words list; pass one of the other
# per-length lists (e.g. three_words) to inspect a different word length.
most_common_words(ten_plus_words) # change list between brackets to print most frequent words with different length