Tutorial on YouTube channel:
Text Visualization | Lecture 2 | CPE 393 Text Analytics
https://www.youtube.com/watch?v=rAQCQKnkNh0&t=774s

##### <b>Load Dataset</b>

In [None]:
import pandas as pd
import numpy as np
# `plt` is the conventional alias for the pyplot interface, not the top-level package
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from 'un-general-debates.csv' (path is relative to the working directory)
df = pd.read_csv('un-general-debates.csv')
# Preview five randomly chosen rows; no random_state is given, so the rows differ between runs
df.sample(5)

In [None]:
# Number of non-null values in each column
df.count()

In [None]:
# Primary key check: count the distinct (session, country) pairs.
# If this matches the row count shown by df.count(), the pair identifies each row uniquely.

df[['session', 'country']].drop_duplicates().count()

##### <b>DataFrame Summary Statistics</b>

In [None]:
# Add a 'length' column holding the character count of each 'text' entry
df['length'] = df['text'].str.len()

In [None]:
# Column labels of the DataFrame
df.columns

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Compact overview: index range, columns, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes one row
df.describe().T

In [None]:
# Summary of 'country' restricted to object-dtype data (include='O'), transposed into a single row
df[['country']].describe(include='O').T

##### <b>Check Missing Data</b>

In [None]:
# Number of missing (NaN/None) values in each column
df.isna().sum()

##### <b>Plotting distribution</b>

In [None]:
# Horizontal box plot (vert=False) of the text lengths in a wide, flat figure
df['length'].plot(kind='box', vert=False, figsize=(8,1))

In [None]:
# Histogram of the text lengths using 30 bins; edgecolor='k' outlines each bar in black
df['length'].plot(kind='hist', bins=30, figsize=(8,4), edgecolor='k')

##### <b>Distribution across categories</b>

In [None]:
import seaborn as sns

# Boolean mask selecting the rows of five chosen country codes
where = df['country'].isin(['USA', 'FRA', 'GBR', 'CHN', 'RUS'])
# Compare the length distribution per country: first as box plots, then as violin plots
sns.catplot(data=df[where], x='country', y='length', kind='box')
sns.catplot(data=df[where], x='country', y='length', kind='violin')

##### <b>Development over time: Number of countries</b>

In [None]:
# Number of rows per year, drawn as a line plot.
# NOTE(review): this equals the number of countries only if each country has one row per year - confirm
df.groupby('year').size().plot(title='Number of Countries')

In [None]:
# Mean of 'length' per year; the y-axis is fixed to 0..30000 so the scale starts at zero
df.groupby('year').agg({'length':'mean'}).plot(title='Avg. Speech Length', ylim=(0,30000))

##### <b>Simple Text Processing</b>

In [None]:
# Case Folding
# Calling str.lower through the class is equivalent to 'Hello World'.lower();
# the same function object is reused later as a pipeline step.
str.lower('Hello World')

In [None]:
# Tokenization

import regex as re
def tokenize(text):
    """Split ``text`` into a list of word tokens.

    A token is a run of word characters and hyphens that contains at least
    one Unicode letter, so purely numeric runs are not returned. The letter
    class used in the pattern is a ``regex``-module feature; this notebook
    imports ``regex`` under the name ``re``.
    """
    token_pattern = r'[\w-]*\p{L}[\w-]*'
    found = re.findall(token_pattern, text)
    return found

In [None]:
# Sample sentence containing an apostrophe, a hyphenated term and a number
text = "Let's defeat SARS-Coc-2 together in 2021!"
tokens = tokenize(text)
# Join the tokens with '|' so the token boundaries are visible
print("|".join(tokens))

In [None]:
# Stop word removal
import nltk
# Fetch NLTK's stop word corpus
nltk.download('stopwords')

# Keep the English stop words in a set for fast membership tests
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords

In [None]:
def remove_stop(tokens):
    """Return the tokens that are not stop words, in their original order.

    Each token is lowercased before being looked up in the notebook-level
    ``stopwords`` set; the returned tokens keep their original casing.
    """
    kept = []
    for token in tokens:
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
# Filter the stop words out of the tokens produced from the sample sentence
remove_stop(tokens)

In [None]:
# Processing a pipeline
# Ordered list of transformations: lowercase the string, split it into tokens, drop stop words

pipeline = [str.lower, tokenize, remove_stop]

def prepare(text, pipeline):
    """Run ``text`` through every step of ``pipeline`` and return the result.

    Each step is a callable taking the previous step's output. The steps are
    applied in order; an empty pipeline returns ``text`` unchanged.
    """
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Run the full pipeline on the sample sentence
prepare(text, pipeline)

In [None]:
# Apply pipeline

# Series: map
# Series: apply
# DataFrame: applymap (renamed to DataFrame.map in pandas >= 2.1)
# DataFrame: apply

# Series.apply forwards the extra keyword argument `pipeline` to prepare() for every text
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)
df.sample(10)

In [None]:
# Counting number of tokens (words)
# len() of each token list gives the number of tokens kept for that row
df['num_tokens'] = df['tokens'].map(len)
df.sample(10)

##### <b>Word Frequency Analysis</b>

In [None]:
from collections import Counter

# Counter maps each distinct token to the number of times it occurs
tokens = tokenize("She likes my cats and my cats like my sofa")
counter = Counter(tokens)
print(counter)

In [None]:
# update() adds the new tokens' counts to the existing counter instead of replacing it
more_tokens = tokenize("She likes dogs and cats")
counter.update(more_tokens)
print(counter)

In [None]:
%%time
import numpy as np
# NOTE(review): numpy is already imported in the first code cell and is not used in this cell
# Approach 1: flatten all token lists into one array (one element per token), then count once
tokens = df['tokens'].explode().values
counter = Counter(tokens)
# print(counter)
# Show only the five most frequent tokens rather than the full counter
print(counter.most_common(5))

In [None]:
%%time
# Approach 2: start from an empty counter and feed it one token list at a time.
# map() is used only for the side effect of counter.update; its result is discarded.
counter = Counter()
df['tokens'].map(counter.update)

print(counter.most_common(5))

In [None]:
# Word Counting, DataFrame Version

def count_words(df, column='tokens', preprocess=None, min_freq=2):
    """Count token frequencies over one column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str, default 'tokens'
        Column to read. Each entry must be an iterable of tokens, or any
        value that ``preprocess`` turns into one.
    preprocess : callable, optional
        Applied to every entry before counting. When ``None`` the entry
        itself is counted.
    min_freq : int, default 2
        Minimum frequency a token needs to be kept. Tokens occurring exactly
        ``min_freq`` times are included.

    Returns
    -------
    pandas.DataFrame
        One row per kept token, indexed by ``token``, with a single ``freq``
        column, sorted by descending frequency.
    """
    counter = Counter()

    # process tokens and update counter
    def update(doc):
        tokens = doc if preprocess is None else preprocess(doc)
        counter.update(tokens)

    # run through all data; a plain loop is used because update() is called
    # only for its side effect and produces no useful return value
    for doc in df[column]:
        update(doc)

    # transform counter into a DataFrame
    freq_df = pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    # inclusive comparison: min_freq is a minimum, so freq == min_freq is kept
    freq_df = freq_df[freq_df['freq'] >= min_freq]
    freq_df.index.name = 'token'

    return freq_df.sort_values('freq', ascending=False)

In [None]:
# Token frequencies over the 'tokens' column with the default settings; show the top five
freq_df = count_words(df)
freq_df.head(5)

In [None]:
# Counting words with preprocessing

# Count words with 10 or more characters
# The raw 'text' column is used here, so the regex passed as `preprocess` does the tokenization
count_words(df, column='text',
            preprocess=lambda text: re.findall(r"\w{10,}", text))

##### <b>Frequency Plot</b>

In [None]:
# Horizontal bar chart of the 15 most frequent tokens
ax = freq_df.head(15).plot(kind='barh', width=0.8, figsize=(8,4))
# Flip the y-axis so the most frequent token is drawn at the top
ax.invert_yaxis()
ax.set(xlabel='Frequency', ylabel='Token', title='Top Words')

##### <b>Word Cloud</b>

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Text of the first row matching year 2015 and country 'USA'.
# Note: this rebinds `text`, which earlier held the short sample sentence.
text = df.query("year==2015 and country=='USA'")['text'].values[0]
text

In [None]:
# Build a word cloud of at most 100 words from the selected text, excluding the stop word set
wc = WordCloud(max_words=100, stopwords=stopwords)
wc.generate(text)
# Render the cloud as an image at 150 dpi and hide the axes
plt.figure(dpi=150)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')

##### <b>Keyword-in-Context Function</b>

In [None]:
from textacy.extract.kwic import keyword_in_context

In [None]:
from textacy.extract.kwic import keyword_in_context
import random

def kwic(doc_series, keyword, window=35, print_sample=5):
    """Collect keyword-in-context matches for ``keyword`` across a Series of texts.

    Parameters
    ----------
    doc_series : pandas.Series
        Texts to search.
    keyword : str
        Keyword passed to ``keyword_in_context``; matching ignores case.
    window : int, default 35
        Passed to ``keyword_in_context`` as ``window_width``.
    print_sample : int or None, default 5
        Number of random matches to print. When ``None`` or ``0`` nothing is
        printed and the full list of matches is returned instead.

    Returns
    -------
    list or None
        The list of all matches when ``print_sample`` is ``None`` or ``0``;
        otherwise ``None`` (the samples are printed).
    """

    def add_kwic(text):
        kwic_list.extend(keyword_in_context(text, keyword, ignore_case=True, window_width=window))

    kwic_list = []
    doc_series.map(add_kwic)

    if print_sample is None or print_sample == 0:
        return kwic_list

    # never ask for more samples than there are matches
    k = min(print_sample, len(kwic_list))
    print(f"{k} random samples out of {len(kwic_list)} "
          f"contexts for '{keyword}':")
    for sample in random.sample(kwic_list, k):
        # each sample is indexed as (left context, match, right context);
        # newlines and tabs are flattened so every sample prints on one line
        left = re.sub(r'[\n\t]', ' ', sample[0])
        right = re.sub(r'[\n\t]', ' ', sample[2])
        print(left + ' ' + sample[1] + ' ' + right)

In [None]:
# Print up to five random contexts of 'sdgs' from the texts of year 2005.
# NOTE(review): the word-cloud cell above uses year 2015 - confirm 2005 is the intended year here
kwic(df[df['year']==2005]['text'], 'sdgs', print_sample=5)

##### <b>Word Cloud Framework</b>

In [None]:
def wordcloud(word_freq, title=None, max_word=200, stopwords=None):
    wc = WordCloud(width=800, height=400,
                   background_color='black', colormap='Paired',
                   max_font_size=150, max_words=max_word)
    
    # convert data frame into dict