In [None]:
%%javascript
// Replace the output area's `_should_scroll` hook with a stub that answers
// `false` for every line count it is asked about.
// NOTE(review): presumably this keeps long cell outputs from being collapsed
// into a scrolling box; it relies on the `IPython` global being exposed by the
// notebook front end — confirm for the front end this notebook targets.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Data Loader
import loader 

# Import numpy and pandas
import numpy as np
import pandas as pd

# Import Plotting tools
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import collections as matcoll

# Utils
import tools
import math

%matplotlib inline

# Load the raw corpus once through the project-local `loader` module; the
# preprocessing cell further down derives its input from `raw`.
raw = loader.LoadRaw()

In [None]:
def hist(data, title = ''):
    """Plot a 15-bin histogram of ``data`` without grid lines.

    ``data`` is placed in a one-column DataFrame whose column is named
    ``title``; the value returned by ``DataFrame.hist`` is passed back
    unchanged.
    """
    frame = pd.DataFrame({title: data})
    return frame.hist(bins=15, grid=False)

def box(data):
    """Draw a grid-less box plot of ``data`` (anything ``pd.DataFrame`` accepts).

    The plot is requested with ``figsize=(5, 9)``; the value returned by
    ``DataFrame.boxplot`` is passed back unchanged.
    """
    frame = pd.DataFrame(data)
    return frame.boxplot(figsize=(5, 9), grid=False)

def printStats(data):
    """Print size statistics for a tokenised corpus and plot its length distributions.

    Args:
        data: sized sequence of documents; ``len(doc)`` is taken as the length
            of each document. It is also handed to ``tools.vocabCreater`` to
            build the vocabulary.

    Output:
        Prints document and token counts/lengths, shows two histograms
        (document lengths, token lengths) and then one figure holding two box
        plots of the same values with uniform noise in [-1, 1) added.
        When there are no documents, or the vocabulary is empty, only the
        counts that are still defined are printed and nothing is plotted
        (max/average are undefined in that case).
    """
    doc_lengths = [len(doc) for doc in data]
    n_docs = len(data)

    print("Number of documents {}".format(n_docs))
    print("Number of empyty documents {}".format(doc_lengths.count(0)))
    if n_docs == 0:
        # max() and the average below would raise on an empty corpus.
        return

    total_tokens = sum(doc_lengths)
    print("Longest document {}".format(max(doc_lengths)))
    print("Average length of document {}".format(total_tokens / n_docs))
    print("Number of tokens {}".format(total_tokens))

    vocab = tools.vocabCreater(data)
    print("Number of unique tokens {}".format(len(vocab)))

    # Length of every entry produced by iterating the vocabulary.
    token_lengths = [len(token) for token in vocab]
    if not token_lengths:
        # Every document was empty: no token statistics or plots to produce.
        return

    print("Longest token {}".format(max(token_lengths)))
    print("Average token length {}".format(sum(token_lengths) / len(token_lengths)))

    hist(doc_lengths, 'Histeogram over document lengths')
    hist(token_lengths, 'Histeogram over token lengths')
    plt.show()

    # Box plots of the same lengths with uniform noise added.
    # NOTE(review): presumably the noise is there to spread out identical
    # integer lengths in the plot — confirm the intent.
    plt.subplot(1, 2, 1)
    shaken_docs = np.asarray(doc_lengths) + np.random.uniform(-1, 1, n_docs)
    box({'Document Lengths (Shaken)': shaken_docs})
    plt.title('')

    plt.subplot(1, 2, 2)
    shaken_tokens = np.asarray(token_lengths) + np.random.uniform(-1, 1, len(token_lengths))
    box({'Token Lengths (shaken)': shaken_tokens})
    plt.title('')
    plt.show()
    
def PipeLineWords(data):
    """Run ``data`` through the four ``tools`` preprocessing stages in order.

    Each stage consumes the previous stage's output:
    ``tokenize`` -> ``stemDocument`` -> ``getMoreTokens(..., 2)`` ->
    ``cleanDoc(..., 3, 50)``.

    Returns:
        A 4-tuple with the result of every stage, in pipeline order.
    """
    tokenized = tools.tokenize(data)
    stems = tools.stemDocument(tokenized)
    expanded = tools.getMoreTokens(stems, 2)
    return tokenized, stems, expanded, tools.cleanDoc(expanded, 3, 50)

def cleanPersonData(data):
    """Raw-tokenise the text of every participant.

    Args:
        data: iterable with one entry per participant; each entry is passed
            as-is to ``tools.rawTokenize``.

    Returns:
        A list with one ``tools.rawTokenize`` result per participant, in
        input order.
    """
    # Only the raw tokenisation is returned, so the full PipeLineWords run
    # (tokenise, stem, n-grams, clean) whose results were never read is not
    # executed here.
    # NOTE(review): assumes the `tools` stages have no side effects that
    # rawTokenize depends on — confirm.
    return [tools.rawTokenize(person) for person in data]

def lineScatter(x, y):
    """Show a stem-style plot: a dot at every (x, y) joined to the x-axis.

    Args:
        x: indexable positions; also used as the x tick locations.
        y: indexable values, same length as ``x``.

    The y-axis runs from 0 to ``max(y) + min(y)``. The figure is shown
    immediately; nothing is returned.
    """
    # One vertical segment per point, from the axis (y = 0) up to the value.
    stems = [[(x[i], 0), (x[i], y[i])] for i in range(len(x))]

    _, ax = plt.subplots()
    ax.add_collection(matcoll.LineCollection(stems))
    ax.scatter(x, y)

    ax.set_xticks(x)
    ax.set_ylim(0, max(y) + min(y))

    plt.show()

def personalStats(data):
    """Plot and print per-participant sentence and token counts.

    Args:
        data: sized sequence with one entry per participant, forwarded to
            ``cleanPersonData``.

    For every participant the number of entries returned by
    ``cleanPersonData`` (reported as sentences) and the summed length of
    those entries (reported as tokens) are computed. Both series are plotted
    with ``lineScatter`` and then summarised on stdout, including the share
    of the total contributed by the three largest values of each series.
    """
    top_n = 3  # size of the "most active" group named in the printed labels

    per_person = cleanPersonData(data)
    if not per_person:
        # Averages and plot limits are undefined without participants.
        print("No participants to summarise")
        return

    sentence_counts = [len(sentences) for sentences in per_person]
    token_counts = [sum(len(sent) for sent in sentences) for sentences in per_person]
    participants = range(len(data))

    lineScatter(participants, sentence_counts)
    lineScatter(participants, token_counts)

    def top_share(values):
        # Fraction of the total held by the `top_n` largest values. They are
        # found by sorting instead of fixed participant indices, so the
        # figure stays correct for any data set and any number of participants.
        total = sum(values)
        if total == 0:
            return 0.0
        return sum(sorted(values, reverse=True)[:top_n]) / total

    print("-"*20)
    print("Average number of sentences for each participant")
    print(sum(sentence_counts)/len(data))
    print("Persentage of the 3 most active participants sentence split")
    print(top_share(sentence_counts))
    print("Persentage of the 3 most active participants tokens split")
    print(top_share(token_counts))
    print("Total Sum of tokens")
    print(sum(token_counts))
    print("-"*20)
    
def display(vocab, n = 5, start = 0, title = ""):
    """Show a bar chart of the most frequent vocabulary entries.

    Args:
        vocab: mapping of entry -> count.
        n: number of bars to draw.
        start: rank (0 = most frequent) of the first bar.
        title: figure title.

    Entries are ranked by descending count and ranks ``start`` to
    ``start + n - 1`` are drawn. The figure is shown immediately; nothing is
    returned.

    NOTE(review): inside a notebook this definition shadows the ``display``
    helper that IPython normally provides — keep in mind when rich-displaying
    objects in later cells.
    """
    ranked = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
    window = ranked[start:start + n]

    # Plain Python lists instead of a string ndarray round-trip: works for an
    # empty window and for keys that are not plain strings.
    labels = [str(entry) for entry, _ in window]
    counts = [count for _, count in window]
    positions = np.arange(len(window))

    # A single figure; no separate empty figure is created beforehand.
    _, ax = plt.subplots(figsize=(20, 10))
    ax.bar(positions, counts)

    # Bars are centred on their x position, so the ticks go there as well.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.set_ylabel('Count', fontsize=16)
    ax.set_title(title, fontsize=20)

    plt.show()
    
    
def top_words(dis, count, index, num = 5, start = 0):
    """Print the highest-valued entries of each selected cluster.

    Args:
        dis: indexable of mappings (entry -> value), one per cluster.
        count: indexable giving the document count printed for each cluster.
        index: iterable of cluster ids to report.
        num: number of entries to print per cluster.
        start: rank of the first entry printed (0 = highest value).
    """
    stop = start + num
    for cluster in index:
        ranked = list(dis[cluster].items())
        # Stable sort: entries with equal values keep their mapping order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        print("Cluster {} with {} documents : {}".format(cluster, count[cluster], ranked[start:stop]))
    
    
def formatTFIDFString(clust):
    """Render (word, value) pairs as ``"word: value"`` strings.

    The value is formatted with one decimal place. Returns a list with one
    string per pair, in input order.
    """
    return [f"{word}: {value:.1f}" for word, value in clust]


def tf_idf_creator(dis, index):
    """Re-weight per-cluster word counts by inverse cluster frequency.

    Args:
        dis: indexable of mappings (word -> count), one per cluster.
        index: iterable of the cluster ids to score.

    Returns:
        A list that can be indexed by every id in ``index``. Position ``i``
        holds a dict ``word -> count * log(N / df)`` for cluster ``i``, where
        ``N = len(index)`` and ``df`` is the number of selected clusters whose
        mapping contains the word. Positions not named in ``index`` hold an
        empty dict.
    """
    n_clusters = len(index)

    # Document frequency: in how many of the selected clusters each word occurs.
    # A plain integer per word replaces one dense vector per word.
    doc_freq = {}
    for i in set(index):
        for word in dis[i]:
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # The result is indexed by cluster id, so it must reach the largest id
    # even when `index` is not exactly 0..len(index)-1.
    size = max(n_clusters, max(index, default=-1) + 1)
    rescored = [{} for _ in range(size)]
    for i in index:
        rescored[i] = {
            word: count * math.log(n_clusters / doc_freq[word])
            for word, count in dis[i].items()
        }

    return rescored
    
    
# Simple tf-idf sorter
def top_tf_idf(dis, doc_count, index, num = 5, start = 0):
    """Print the best-scoring words of each selected cluster after re-weighting.

    Args:
        dis: per-cluster word -> count mappings, forwarded to ``tf_idf_creator``.
        doc_count: indexable giving the document count printed for each cluster.
        index: iterable of cluster ids to report.
        num: number of words to print per cluster.
        start: rank of the first word printed (0 = highest score).
    """
    scores = tf_idf_creator(dis, index)
    stop = start + num
    for cluster in index:
        ranked = list(scores[cluster].items())
        # Stable sort: words with equal scores keep their mapping order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        shown = formatTFIDFString(ranked[start:stop])
        print("Cluster {} with {} documents : {}".format(cluster, doc_count[cluster], shown))

        
def doc_tabel(data):
    """Build a table pairing each input sentence with its cleaned form.

    Args:
        data: indexable of sentences, forwarded to ``PipeLineWords``.

    Returns:
        A DataFrame with the columns ``"Sentence"`` (``data[i]``) and
        ``"Cleaned"`` (the i-th stemmed sentence after
        ``tools.getMoreTokens`` and ``tools.cleanSent``). Empty input gives
        an empty frame that still has both columns.

    NOTE(review): assumes the stemmed output has one entry per element of
    ``data``, in the same order — confirm against ``tools.tokenize``.
    """
    # Only the stemmed and the n-gram-extended stages are needed here.
    _, stemmed, added, _ = PipeLineWords(data)

    # Private copy of the vocabulary built from the n-gram-extended stage.
    vocab_count = dict(tools.vocabCreater(added).items())

    rows = [
        [data[i], tools.cleanSent(tools.getMoreTokens([sent])[0], vocab_count)]
        for i, sent in enumerate(stemmed)
    ]

    # Naming the columns in the constructor also works with zero rows;
    # assigning them afterwards fails with a length mismatch in that case.
    return pd.DataFrame(rows, columns=["Sentence", "Cleaned"])

In [None]:
# Build every preprocessing stage once; the cells below reuse these
# module-level names (tokens, stemmed, added, cleaned, rawTokens).
data = tools.getMoreSent(raw)
tokens, stemmed, added, cleaned = PipeLineWords(data)
# Raw tokenisation of the same documents, kept next to the pipeline stages.
rawTokens = tools.rawTokenize(data)
# Vocabulary items of the cleaned stage, ordered by descending second element
# (presumably the count — confirm against tools.vocabCreater).
cleanedVocab = sorted(tools.vocabCreater(cleaned).items(), key=lambda x: x[1], reverse=True)

In [None]:
# Corpus statistics: a header, then one banner + printStats report per stage.
print("Data Stats")
print("**"*24)
# NOTE(review): len(raw) is reported as "chars", which holds only if `raw`
# is a single string — confirm against loader.LoadRaw.
print("Total chars {}".format(len(raw)))

# (stars left of the label, label, stars right of the label, documents)
for left, stage_name, right, stage_docs in (
    (18, "Raw Tokens", 18, rawTokens),
    (20, "Tokens", 20, tokens),
    (19, "Stemmed", 20, stemmed),
    (17, "Added Tokens", 17, added),
    (19, "Cleaned", 20, cleaned),
):
    print("{} {} {}".format("*" * left, stage_name, "*" * right))
    printStats(stage_docs)

In [None]:
# Load the per-participant data through the project-local loader, then plot
# and print the per-participant sentence/token statistics.
personData = loader.loadDataParticipants()
personalStats(personData)

In [None]:
# Vocabularies used by the bar charts in the next cell.
# NOTE(review): `lookup` is not read by any cell visible here; presumably it
# maps stems back to their original tokens — confirm or drop.
lookup = tools.stemmedReverse(tokens)
# Vocabulary over the n-grams built from the raw tokens.
ngramVocab = tools.vocabCreater(tools.getNTokens(rawTokens))
# Vocabularies of the tokenised and of the stemmed stage.
vocab = tools.vocabCreater(tokens)
stemmed_vocab = tools.vocabCreater(stemmed)

In [None]:
# Top-20 bar charts, one per vocabulary.
display(vocab, 20, title = "Most commen words")
display(stemmed_vocab, 20, title = "Most common stemmed words")
# N-grams from the raw tokens vs. n-grams from the stemmed stage.
display(ngramVocab, 20, title="Most common bi-grams with stopwords")
display(tools.vocabCreater(tools.getNTokens(stemmed)), 20, title="Most common bi-grams without stopwords")
# NOTE(review): this ranks every raw token; the title assumes the top ranks
# are stop words — confirm.
display(tools.vocabCreater(rawTokens), 20, title="Most common stopwords")

In [None]:
# Show full cell contents (no column-width truncation) and allow up to 100
# rows, so the sentences in the table below can be read in full.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 100
# First ten sentences next to their cleaned form.
doc_tabel(data).head(10)