## Introduction to Python for Digital Text Analysis (Part I)

This session will provide an overview of the Python Natural Language Toolkit (NLTK) library (http://www.nltk.org).

### Step I: Import necessary packages

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer

%matplotlib inline
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and the old
# "seaborn" name later stopped working, so use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

### Step II: Read in comment files
Let's choose four popular songs, one from each of the groups, and import their comment text.

In [None]:
# Paths to the comment-only file of one video per group.
bts_filepath = '../data/kpop_videos_comments/bts/GZjt_sA2eso.txt' # Save Me
exo_filepath = '../data/kpop_videos_comments/exo/yWfsla_Uh80.txt' # Call Me Baby
twice_filepath = '../data/kpop_videos_comments/twice/EpMwiqW8k8o.txt' # 'Signal' Dance Video
blackpink_filepath = '../data/kpop_videos_comments/blackpink/bwmSjveL3Lc.txt' # Boombayah


def _load_comments(filepath):
    """Return the full contents of `filepath` as one string, decoded as UTF-8."""
    # UTF-8 decoding is what lets emoji in the comments be read correctly.
    with open(filepath, encoding="utf-8") as handle:
        return handle.read()


# One string of raw comment text per group.
bts = _load_comments(bts_filepath)
exo = _load_comments(exo_filepath)
twice = _load_comments(twice_filepath)
blackpink = _load_comments(blackpink_filepath)

# Preview: only the first 300 characters (use print(bts) to see everything).
print(bts[:300])

### Step III: Tokenise the comments

NLTK word tokenizer (ignores non-alpha characters) vs. tweet tokenizer (represents hashtags, @mentions, and emoji / doesn't strip them away): http://www.nltk.org/api/nltk.tokenize.html

We will use the tweet tokenizer, as it is more applicable to YouTube comments.

In [None]:
# Tokenise just the first 300 characters and show one token per line.
sample_tokens = TweetTokenizer().tokenize(bts[:300])
for token in sample_tokens:
    print(token)

### Step IV: Calculate type / token counts
From this, we can also calculate lexical diversity with a simple function.

In [None]:
#Length of a text (number of words and punctuation symbols).
#Tokenise first: len() of the raw string would count characters, not tokens.
bts_tokens = TweetTokenizer().tokenize(bts)
len(bts_tokens)

#All vocabulary items in a text.
#set(bts_tokens) #Left commented out: originally noted as taking a very long time!

#Sort the vocabulary items in alphabetical order (punctuation comes first, then capitalised words).
#sorted(set(bts_tokens)) #Left commented out: originally noted as freezing the machine.

In [None]:
#Number of unique vocabulary items (includes punctuation symbols).
#Build the set from tokens: set() of the raw string would only give the distinct characters.
len(set(TweetTokenizer().tokenize(bts)))

In [None]:
#Lexical richness (number of distinct words/number of total words).
#How many times on average is each word used? Divide 1 by the result to get this.

def lexical_diversity(comments):
    """Return the ratio of distinct items to total items in `comments`.

    `comments` is any sized collection of hashable items, normally a list of
    tokens. Passing a raw string measures the diversity of its characters
    instead, because iterating a string yields single characters.

    An empty collection returns 0.0 rather than raising ZeroDivisionError.
    """
    total = len(comments)
    if total == 0:
        return 0.0
    return len(set(comments)) / total

#Pass tokens, not the raw string, so the ratio is over words rather than characters.
lexical_diversity(TweetTokenizer().tokenize(bts))

In [None]:
#How often a specific word appears in a text.
#Count whole tokens: str.count() on the raw text would also match "jimin" inside longer strings.
#The match is case-sensitive; lowercase the tokens first to also count "Jimin".
print(TweetTokenizer().tokenize(bts).count("jimin"))

#Percentage of the text taken up by a specific word.
def word_percentage(wordcount, wordtotal):
    """Return `wordcount` as a percentage of `wordtotal`.

    Both arguments should be in the same unit (e.g. token counts).
    A `wordtotal` of 0 returns 0.0 rather than raising ZeroDivisionError.
    """
    if wordtotal == 0:
        return 0.0
    return 100 * wordcount / wordtotal

#Use token counts for both numbers; the raw string would mix a substring count with a character count.
bts_tokens = TweetTokenizer().tokenize(bts)
word_percentage(bts_tokens.count("jimin"), len(bts_tokens))

#Can look at other band member names and do a simple frequency bar chart to compare their 'popularity' in the comments!

In [None]:
#Frequency distribution of vocabulary items.
#Count tokens, not the characters of the raw string; FreqDist is reached through the
#already-imported nltk package because it is not imported by name in this notebook.
bts_tokens = TweetTokenizer().tokenize(bts)
fdist1 = nltk.FreqDist(bts_tokens)
print(fdist1) #Summary line: number of samples (distinct tokens) and outcomes (total tokens).
#An "outcome" is a word.

#50 most frequent words.
fdist1.most_common(50)

#Frequency of a specific word.
fdist1["jimin"]

#Cumulative frequency plot. If cumulative=True is not specified, individual frequencies are plotted.
fdist1.plot(50, cumulative=True)

#Words that only occur once.
#fdist1.hapaxes()

#Examine all of the long words (more than 7 letters) in a text.
Vocab = set(bts_tokens)
long_words = [word for word in Vocab if len(word)>7]
sorted(long_words)

#Examine only the long words that occur more than 7 times.
fdist5 = fdist1 #Same counts as above, so reuse them instead of recounting the tokens.
long_frequent_words = [word for word in Vocab if len(word)>7 and fdist5[word]>7]
sorted(long_frequent_words)

### Step V: POS tagging

Compute and visualise frequencies of most popular (proper) nouns, adjectives, verbs. Also frequencies of most popular words overall...?

In [None]:
#pos_tag expects a list of tokens; given the raw string it would tag single characters.
bts_tokens = TweetTokenizer().tokenize(bts)
nltk.pos_tag(bts_tokens) #Default tagset.
nltk.pos_tag(bts_tokens, tagset="universal") #Coarser universal tagset (see table below).

##Universal Part-of-Speech Tagset
##Tag	Meaning	                English Examples
##ADJ	adjective	        new, good, high, special, big, local
##ADP	adposition	        on, of, at, with, by, into, under
##ADV	adverb	                really, already, still, early, now
##CONJ	conjunction	        and, or, but, if, while, although
##DET	determiner, article	the, a, some, most, every, no, which
##NOUN	noun	                year, home, costs, time, Africa
##NUM	numeral	                twenty-four, fourth, 1991, 14:24
##PRT	particle	        at, on, out, over per, that, up, with
##PRON	pronoun	                he, their, her, its, my, I, us
##VERB	verb	                is, say, told, given, playing, would
##.	punctuation marks	. , ; !
##X	other	                ersatz, esprit, dunno, gr8, univeristy

# Adapt the below code to the Kpop dataset!

#TAGGED CORPORA
from nltk.corpus import brown

#(word, tag) pairs for the Brown corpus "news" category, using the universal tagset.
brown_news_tagged = brown.tagged_words(tagset="universal", categories="news")
#Keep only the tag of each pair and count how often each tag occurs.
tag_fd = nltk.FreqDist(pair[1] for pair in brown_news_tagged)
tag_fd.most_common()
tag_fd.plot(cumulative=True)

#Which parts of speech occur before a noun?
#Each bigram is a pair of neighbouring (word, tag) tuples.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
#Collect the tag of the first item whenever the second item is tagged as a noun.
noun_preceders = [
    first[1]
    for first, second in word_tag_pairs
    if second[1] == "NOUN"
]
fdist_noun_preceders = nltk.FreqDist(noun_preceders)
fdist_noun_preceders.most_common() #Tags together with their frequencies.
[entry[0] for entry in fdist_noun_preceders.most_common()] #Tags only.

#What are the most common verbs in the Wall Street Journal corpus?
treebank_corpus = nltk.corpus.treebank
wsj = treebank_corpus.tagged_words(tagset="universal")
#Counts of complete (word, tag) pairs.
word_tag_fd = nltk.FreqDist(wsj)
word_tag_fd.most_common(50)
#[wordtag[0] for (wordtag, _) in word_tag_fd.most_common() if wordtag[1] == "VERB"] #Sort verbs by frequency.

#Frequency-ordered list of POS tags given a word. Word is treated as a condition and its tag as an event.
cfd1 = nltk.ConditionalFreqDist(wsj)
yield_tags = cfd1["yield"]
yield_tags.most_common()

#Reverse the order of the pairs to see likely words for a given POS tag.
#"VBN" is a Penn Treebank tag, so build the distribution from the default-tagset words (wsj2);
#the universal-tagset list has no "VBN" tag and would give an empty result.
wsj2 = nltk.corpus.treebank.tagged_words()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj2)
list(cfd2["VBN"])

### Step VI: Bigrams & Collocations

In [None]:
#Extract a list of word pairs from a text.
from nltk import bigrams
list(bigrams(["more", "is", "said", "than", "done"]))

#Collocations: bigrams that occur more often than we would expect based on the frequency of the individual words.
#collocations() is a method of nltk.Text, not of str, so wrap the tokenised comments first.
bts_text = nltk.Text(TweetTokenizer().tokenize(bts))
bts_text.collocations()

#Distribution of word lengths in a text.
#Measure tokens: iterating over the raw string yields single characters, so every length would be 1.
bts_tokens = TweetTokenizer().tokenize(bts)
word_lengths = [len(word) for word in bts_tokens]
#FreqDist is reached through the nltk package because it is not imported by name in this notebook.
fdist1wordlength = nltk.FreqDist(word_lengths)
print(fdist1wordlength)

#Most common word lengths.
fdist1wordlength.most_common()

#Most frequent word length.
fdist1wordlength.max()

#How many words of length 3 appear in the text.
fdist1wordlength[3]

#What proportion of all word lengths are words of length 3?
fdist1wordlength.freq(3)

#### Open-Ended Exercises/Questions
1. What are the most common 3-grams, 4-grams..?
2. Compare most frequent words (and types of words) in each of the four video comment datasets.