In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
import os
import re
import string

# DATA

In [5]:
df = pd.read_csv('./data/stackexchange_812k.csv', keep_default_na=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis op...,title
3,4,,,Assessing the significance of differences in d...,title
4,6,,,The Two Cultures: statistics vs. machine learn...,title


In [7]:
df['clean_text'] = df.text.apply(lambda s: s.lower())

In [8]:
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
0,1,,,Eliciting priors from experts,title,eliciting priors from experts
1,2,,,What is normality?,title,what is normality?
2,3,,,What are some valuable Statistical Analysis op...,title,what are some valuable statistical analysis op...
3,4,,,Assessing the significance of differences in d...,title,assessing the significance of differences in d...
4,6,,,The Two Cultures: statistics vs. machine learn...,title,the two cultures: statistics vs. machine learn...


## Clean punctuations

In [9]:
df.clean_text = df.clean_text.str.replace('[{}]'.format(string.punctuation), '')
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
0,1,,,Eliciting priors from experts,title,eliciting priors from experts
1,2,,,What is normality?,title,what is normality
2,3,,,What are some valuable Statistical Analysis op...,title,what are some valuable statistical analysis op...
3,4,,,Assessing the significance of differences in d...,title,assessing the significance of differences in d...
4,6,,,The Two Cultures: statistics vs. machine learn...,title,the two cultures statistics vs machine learning


## Tokenize

In [10]:
df.head()

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
0,1,,,Eliciting priors from experts,title,eliciting priors from experts
1,2,,,What is normality?,title,what is normality
2,3,,,What are some valuable Statistical Analysis op...,title,what are some valuable statistical analysis op...
3,4,,,Assessing the significance of differences in d...,title,assessing the significance of differences in d...
4,6,,,The Two Cultures: statistics vs. machine learn...,title,the two cultures statistics vs machine learning


# Data Exploration

In [11]:
df[df.category == 'title']

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
0,1,,,Eliciting priors from experts,title,eliciting priors from experts
1,2,,,What is normality?,title,what is normality
2,3,,,What are some valuable Statistical Analysis op...,title,what are some valuable statistical analysis op...
3,4,,,Assessing the significance of differences in d...,title,assessing the significance of differences in d...
4,6,,,The Two Cultures: statistics vs. machine learn...,title,the two cultures statistics vs machine learning
...,...,...,...,...,...,...
91747,279985,,,Long-term survivor analysis: Are usual methods...,title,longterm survivor analysis are usual methods s...
91748,279989,,,Multiple regression when one predictor is the ...,title,multiple regression when one predictor is the ...
91749,279990,,,Does significance level of individual coeffici...,title,does significance level of individual coeffici...
91750,279993,,,Choosing best discrete classifier in ROC analysis,title,choosing best discrete classifier in roc analysis


In [12]:
df[df.category == 'comment']

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
259056,1,,669,"Although I've accepted an answer, I would reco...",comment,although ive accepted an answer i would recomm...
259057,2,,494,did you try google/ wikipedia first ? \nhttp:/...,comment,did you try google wikipedia first \nhttpenwi...
259058,3,,1,Could be a poster child fo argumentative and s...,comment,could be a poster child fo argumentative and s...
259059,3,,15,"Maybe the focus shouldn't be on ""valuable"" but...",comment,maybe the focus shouldnt be on valuable but ra...
259060,3,,36,"Or maybe even ""How X will help you get Y done ...",comment,or maybe even how x will help you get y done f...
...,...,...,...,...,...,...
812127,279994,,536471,"It does run, and gives very valid looking esti...",comment,it does run and gives very valid looking estim...
812128,279998,,536439,It seems to me that you are correct; the doubl...,comment,it seems to me that you are correct the double...
812129,279998,,536514,It wouldn't be the first time a grader has mis...,comment,it wouldnt be the first time a grader has miss...
812130,279999,,536802,The basic idea is to compare the clustering co...,comment,the basic idea is to compare the clustering co...


In [13]:
df[df.category == 'post']

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
91752,1,,,<p>How should I elicit prior distributions fro...,post,phow should i elicit prior distributions from ...
91753,2,,,<p>In many different statistical methods there...,post,pin many different statistical methods there i...
91754,3,,,<p>What are some valuable Statistical Analysis...,post,pwhat are some valuable statistical analysis o...
91755,4,,,<p>I have two groups of data. Each with a dif...,post,pi have two groups of data each with a differ...
91756,5,3.0,,"<p>The R-project</p>\n\n<p><a href=""http://www...",post,pthe rprojectp\n\npa hrefhttpwwwrprojectorghtt...
...,...,...,...,...,...,...
259051,279990,,,<p>Does significance level of individual coeff...,post,pdoes significance level of individual coeffic...
259052,279993,,,<p>I'm trying to do ROC analysis on my dataset...,post,pim trying to do roc analysis on my datasetp\n...
259053,279994,279989.0,,"<p>Theoretically, it won't work. VIF will pop ...",post,ptheoretically it wont work vif will pop becau...
259054,279998,,,"<p>Let V have exponential(a) density, and let ...",post,plet v have exponentiala density and let w be ...


# Splitting data

In [14]:
training_data = df[df.category == 'title']
test_data = df[df.category != 'title']

# Build Ngram model

In [15]:
from collections import defaultdict
from collections import Counter
from nltk.util import ngrams

left_pad_symbol = "<s>"
right_pad_symbol = "</s>"

In [16]:
def get_ngram_sents(texts, n=2):
    """ return n-gram tuples
    """
    ngram_sents = []
    for text in texts:
        ngram_sents.extend(list(ngrams(text.split(), n)))
    return ngram_sents

def get_ngram_of_sent(sentence, n=2):
    """ return n-gram word tuples from a given sentence
    """
    return list(ngrams(sentence.split(), n))

In [17]:
def build_ngram_counter(ngram_sents, right_pad_symbol):
    ngram_dict = dict()
    for i, sent in enumerate(ngram_sents):
        idx = i+1
        if sent in ngram_dict:
            if idx < len(ngram_sents):
                ngram_dict[sent].update([ngram_sents[idx][1]])
            else:
                ngram_dict[sent].update([right_pad_symbol])
        else:
            ngram_dict[sent] = Counter()
            if idx < len(ngram_sents):
                ngram_dict[sent].update([ngram_sents[idx][1]])
            else:
                ngram_dict[sent].update([right_pad_symbol])
    return ngram_dict

In [18]:
trainig_sentences = training_data.clean_text
bigram_sents = get_ngram_sents(trainig_sentences)

In [19]:
bigram_counts = build_ngram_counter(bigram_sents, right_pad_symbol)

In [20]:
bigram_counts

{('eliciting', 'priors'): Counter({'from': 1, 'with': 1}),
 ('priors', 'from'): Counter({'experts': 1, 'same': 1, 'the': 1}),
 ('from', 'experts'): Counter({'is': 1}),
 ('what',
  'is'): Counter({'normality': 1,
          'a': 247,
          'the': 2059,
          'your': 5,
          'an': 62,
          'differenceindifferences': 1,
          'bayes': 1,
          'aic': 1,
          'wrong': 28,
          'regularization': 1,
          'it': 22,
          'my': 8,
          'behind': 2,
          'this': 49,
          'clinical': 1,
          'going': 5,
          'standard': 1,
          'missing': 1,
          'simple': 1,
          'perplexity': 1,
          'theta': 1,
          'heavy': 1,
          'sequential': 2,
          'compound': 1,
          'statistical': 3,
          'best': 5,
          'quantile': 1,
          'conditioning': 1,
          'determining': 1,
          'french': 1,
          'covxy': 2,
          'distribution': 3,
          'randomness': 1,
          

## Text generation function

In [21]:
def generate_text(bigram, ngram_counts):
    most_common_text, _ = ngram_counts[bigram].most_common(1).pop()
    text = ' '.join(bigram)
    return text + ' ' + most_common_text

In [24]:
# Run
generate_text(('some', 'valuable'), bigram_counts)

'some valuable statistical'

In [27]:
# Example
bigram_counts[('some', 'valuable')].most_common(1).pop()

('statistical', 1)

## Prob

In [28]:
def get_prob_of_word(bigram_counter, word):
    """
    counter: bigram counter
    word: token word
    """
    prefix_count = len(bigram_counter.keys())
    word_count = bigram_counter[word]
    return word_count/prefix_count

def get_sent_prob(sent_trigrams, twogram_counts):
    sent_prob = 1
    for trigram in sent_trigrams:
        bigram = (trigram[0], trigram[1])
        bigram_counter = twogram_counts[bigram]
        word = trigram[-1]
        sent_prob *= get_prob_of_word(bigram_counter, word)
    return sent_prob

def get_word_counter(sentences):
    """ return counter from sentences
    """
    c = Counter()
    for sent in sentences:
        c.update(sent.split(' '))
    return c

def get_perplexity(sentence, twogram_counts):
    sent_trigrams = get_ngram_of_sent(sentence, 3)
    sent_prob = get_sent_prob(sent_trigrams, twogram_counts)
    n = len(sentence.split(' '))
    perplexity = np.power(1/sent_prob, 1/n)
    
    return perplexity

## Laplace smoothing

In [32]:
corpus_counter = get_word_counter(trainig_sentences)

In [33]:
corpus_counter

Counter({'eliciting': 2,
         'priors': 150,
         'from': 4810,
         'experts': 12,
         'what': 6516,
         'is': 12567,
         'normality': 366,
         'are': 3757,
         'some': 599,
         'valuable': 5,
         'statistical': 1671,
         'analysis': 3064,
         'open': 35,
         'source': 62,
         'projects': 12,
         'assessing': 94,
         'the': 23255,
         'significance': 881,
         'of': 30508,
         'differences': 503,
         'in': 19780,
         'distributions': 1236,
         'two': 3759,
         'cultures': 1,
         'statistics': 1047,
         'vs': 2009,
         'machine': 877,
         'learning': 1620,
         'locating': 3,
         'freely': 4,
         'available': 96,
         'data': 9208,
         'samples': 991,
         'so': 281,
         'how': 13470,
         'many': 627,
         'staticians': 1,
         'does': 3202,
         'it': 2347,
         'take': 152,
         'to': 21783,
       

In [31]:
def get_laplace_smoothing(bigram_counter, prefix, corpus_counter, delta=0.001):
    prefix_count = len(bigram_counter.keys())
    token_count = bigram_counter[prefix]
    N = len(corpus_counter.keys())
    laplace_score = (token_count + delta)/(prefix_count + (delta * N))

    return laplace_score