In [None]:
# Install the third-party packages this notebook imports (run once per environment).
# NOTE(review): only tensorflow is version-pinned; the other packages resolve to
# whatever is current at install time, so results may not be reproducible.
!python3 -m pip install transformers
!python3 -m pip install seaborn
!python3 -m pip install statsmodels
!python3 -m pip install tensorflow==1.4.1
!python3 -m pip install torch
!python3 -m pip install nltk
import nltk
# Presumably required by the nltk sentence/word tokenizers used in the
# preprocessing cells - TODO confirm.
nltk.download('punkt')

In [None]:
# Presumably the model behind nltk.pos_tag, which part_of_speech() calls - TODO confirm.
nltk.download('averaged_perceptron_tagger')

# Dependencies

In [None]:
# Riemer
from mle import Mandelbrot
import tensorflow as tf
import torch
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Rachel
import numpy as np
import pandas as pd
import random
import string
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize
import math
import json
from nltk import tokenize
import collections
import re
import sys
import itertools
import time
import nltk
from scipy.stats import mannwhitneyu

import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel,\
        GenericLikelihoodModelResults

from statsmodels.nonparametric.smoothers_lowess import lowess

from scipy.special import zeta
from scipy.stats import binom

import pickle
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

lg = np.log10

from scipy.stats import chisquare

# Model Init

In [None]:
# Load the GPT-2 tokenizer and language model used by the generator cells.
# NOTE(review): "gpt2-small" must resolve to a local directory or a valid model
# identifier; verify it, as the smallest published checkpoint is usually named "gpt2".
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-small")

# add the EOS token as PAD token to avoid warnings
model = GPT2LMHeadModel.from_pretrained("gpt2-small", pad_token_id=tokenizer.eos_token_id)

# The device is hard-coded: this cell (and generate_section, which also moves its
# inputs to "cuda") requires a CUDA-capable GPU.
model.to("cuda")

# Load datasets

In [None]:
# Load the pickled GPT-2 dataset. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open("datasets/gpt_set.p", "rb") as gpt_file:
    gpt_set = pickle.load(gpt_file)

In [None]:
# Load the pickled human-written dataset. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open("datasets/human_set.p", "rb") as human_file:
    human_set = pickle.load(human_file)

In [None]:
# Read the whole Wikipedia dump; the context manager closes the file handle.
with open("datasets/Wiki.txt", "r") as wiki_file:
    Wiki = wiki_file.read()

# Documents are terminated by '</doc>'; the piece after the last terminator is dropped.
wiki_set = Wiki.split('</doc>')[:-1]

In [None]:
# Read the whole GPT-2 text dump; the context manager closes the file handle.
with open("datasets/GPT-2.txt", "r") as gpt_text_file:
    GPT = gpt_text_file.read()

# Documents are terminated by '</doc>'; the piece after the last terminator is dropped.
GPT_set = GPT.split('</doc>')[:-1]

# Preprocessing

In [None]:
# Pre-processing without part of speech tags
def remove_punctuation(text):
    """Lower-case `text` and strip all ASCII punctuation from it.

    Newlines are turned into spaces rather than deleted, so that the last word
    of one line and the first word of the next are not glued together when the
    caller later splits on whitespace.

    Args:
        text: the string to clean.

    Returns:
        The lower-cased string without any character of `string.punctuation`.
    """
    # string.punctuation is exactly the punctuation set the previous literal
    # spelled out by hand (which relied on the invalid escape sequence "\]").
    table = str.maketrans("\n", " ", string.punctuation)
    return text.lower().translate(table)


def preprocess(corpus, sent = True):
    """Tokenize a raw text after removing punctuation.

    Args:
        corpus: the text as a single string.
        sent: when truthy the text is first split into sentences and a list of
            token lists (one per sentence) is returned; otherwise one flat list
            of tokens is returned.

    Returns:
        A list of token lists, or a flat list of tokens (see `sent`).
    """
    if not sent:
        return remove_punctuation(corpus).split()

    sentences = tokenize.sent_tokenize(corpus)
    return [remove_punctuation(sentence).split() for sentence in sentences]

In [None]:
# Pre-processing with part of speech tags
def part_of_speech(corpus):
    """Tokenize a raw text into sentences of (word, PoS-tag) pairs.

    Each sentence is stripped of square brackets, word-tokenized and tagged with
    nltk.pos_tag. Punctuation is then removed from every token, tokens that
    become empty are dropped, and the remaining words are lower-cased.

    Args:
        corpus: the text as a single string.

    Returns:
        A list with one entry per sentence; each entry is a list of
        (lower-cased word, tag) tuples.
    """
    # Newlines become spaces (instead of being deleted) so that words on
    # adjacent lines are not merged into one token before tagging.
    sentence_table = str.maketrans("\n", " ", "[]")
    # Built once here rather than once per word inside the inner loop.
    word_table = str.maketrans("", "", "\n" + string.punctuation)

    new_corp = []
    for sent in tokenize.sent_tokenize(corpus):
        words_sent = tokenize.word_tokenize(sent.translate(sentence_table))
        new_sent = []
        for word, pos in nltk.pos_tag(words_sent):
            word = word.translate(word_table)
            if word:
                new_sent.append((word.lower(), pos))
        new_corp.append(new_sent)
    return new_corp

In [None]:
def make_file(corp, multi = True, sent = True, pos = False):
    """Run the full preprocessing pipeline on one or several texts.

    Args:
        corp: a single string (use multi=False) or a list of strings that are
            concatenated into one big corpus (use multi=True).
        multi: whether `corp` is a collection of strings to concatenate.
        sent: forwarded to preprocess(); only used when `pos` is falsy.
        pos: when truthy, tokens are returned with part-of-speech tags via
            part_of_speech(); otherwise preprocess() is used.

    Returns:
        Whatever part_of_speech() or preprocess() returns for the joined text.
    """
    text = ''.join(corp) if multi else corp
    return part_of_speech(text) if pos else preprocess(text, sent = sent)

# Rachel Code

## Subsampling

In [None]:
def _sample_tokens(corpus, k):
    """Collect exactly k tokens from distinct, randomly chosen sentences of `corpus`."""
    tokens = []
    # A random permutation visits every sentence at most once, which is the same
    # "sampling without replacement" as drawing indices and rejecting repeats,
    # but it terminates even when the corpus runs out of sentences.
    for index in np.random.permutation(len(corpus)):
        remaining = k - len(tokens)
        if remaining <= 0:
            break
        # Cut the last sentence so that the sample never exceeds k tokens.
        tokens += corpus[index][:remaining]

    if len(tokens) < k:
        raise ValueError(
            "corpus holds only {} tokens, cannot sample {}".format(len(tokens), k))
    return tokens


def subsampling(corpus, k = 1000000, m = 10, sent = True):
    """Draw `m` pairs of random sub-corpora of `k` tokens each.

    One list is meant for estimating ranks and the other for estimating
    frequencies, so that both are computed from independent samples.

    Args:
        corpus: with sent=True a list of tokenized sentences (each a sequence
            of tokens); with sent=False a flat sequence of tokens.
        k: number of tokens every sampled sub-corpus contains.
        m: number of sub-corpora to draw for ranks and for frequencies.
        sent: sample whole sentences (True) or individual tokens (False).

    Returns:
        A tuple (rank_corpera, freq_corpera); each is a list of `m` flat token
        lists of length `k`.

    Raises:
        ValueError: if the corpus contains fewer than `k` tokens.
    """
    rank_corpera = []
    freq_corpera = []

    for _ in range(m):
        if sent:
            rank_corpera.append(_sample_tokens(corpus, k))
            freq_corpera.append(_sample_tokens(corpus, k))
        else:
            rank_corpera.append(random.sample(corpus, k))
            freq_corpera.append(random.sample(corpus, k))

    return rank_corpera, freq_corpera

In [None]:
def calculate_freqs(freq_sents, norm=True, text=None):
    """Build a table of token frequencies, one column per corpus.

    Args:
        freq_sents: list of corpora, each a flat list of tokens.
        norm: when truthy, counts are divided by the corpus length.
        text: label used as prefix of the column names
            ("<text> c_frequency <i>").

    Returns:
        A DataFrame indexed by token; tokens missing from a corpus get 0.
    """
    columns = {}
    for idx, tokens in enumerate(freq_sents):
        counts = collections.Counter(tokens)
        if norm:
            total = len(tokens)
            counts = {token: count / total for token, count in counts.items()}
        columns['{} c_frequency {}'.format(text, idx)] = counts

    return pd.DataFrame(columns).fillna(0)

In [None]:
def mean_freqs(freqs_df):
    """Average the per-corpus frequencies of every token.

    Args:
        freqs_df: frequency DataFrame with one column per corpus.

    Returns:
        A Series holding the row-wise mean for every index entry.
    """
    row_means = freqs_df.mean(axis="columns")
    return row_means

In [None]:
def calculate_ranks(rank_sents, norm=False, text=None):
    """Build a table of frequency ranks, one column per corpus.

    Rank 1 is the most common token of a corpus. Tokens that do not occur in a
    corpus receive, in random order, the ranks directly after that corpus'
    highest observed rank.

    Args:
        rank_sents: list of corpora, each a flat list of tokens.
        norm: when truthy, counts are divided by the corpus length before
            ranking.
        text: label used as prefix of the column names ("<text> c_rank <i>").

    Returns:
        A DataFrame indexed by token with one rank column per corpus.
    """
    per_corpus = {}
    for idx, tokens in enumerate(rank_sents):
        counts = collections.Counter(tokens)
        if norm:
            total = len(tokens)
            for token in counts:
                counts[token] /= total
        ordered = counts.most_common()
        per_corpus['{} c_rank {}'.format(text, idx)] = {
            token: position for position, (token, _) in enumerate(ordered, 1)
        }

    ranks_df = pd.DataFrame(per_corpus)
    for column in ranks_df:
        missing = ranks_df[column].isnull()
        first_free = int(np.ceil(ranks_df[column].max() + 1))
        filler = list(range(first_free, first_free + int(missing.sum())))
        # Unseen tokens share the tail ranks in random order.
        random.shuffle(filler)
        ranks_df.loc[missing, column] = filler

    return ranks_df

In [None]:
def mean_ranks(ranks_df):
    """Average the per-corpus ranks of every token.

    Args:
        ranks_df: rank DataFrame with one column per corpus.

    Returns:
        A Series holding the row-wise mean for every index entry.
    """
    row_means = ranks_df.mean(axis="columns")
    return row_means

In [None]:
def ranks_freqs(freq_sents, rank_sents, text=None, norm=False):
    """Combine per-corpus ranks and frequencies into one DataFrame.

    Args:
        freq_sents: corpora (flat token lists) used for the frequencies.
        rank_sents: corpora (flat token lists) used for the ranks.
            Both lists are meant to come from subsampling().
        text: label forwarded to the column names.
        norm: forwarded to calculate_freqs() and calculate_ranks().

    Returns:
        A DataFrame with the per-corpus rank and frequency columns plus the
        mean columns 'Rank' and 'Frequency'. Tokens that occur in only one of
        the two inputs (and therefore lack a rank or a frequency) are dropped.
    """
    freq_table = calculate_freqs(freq_sents, text=text, norm=norm)
    freq_table['Frequency'] = mean_freqs(freq_table)

    rank_table = calculate_ranks(rank_sents, text=text, norm=norm)
    rank_table['Rank'] = mean_ranks(rank_table)

    return pd.concat([rank_table, freq_table], axis = 1).dropna()

## Zipf's law (param estimation)

In [None]:
def zipfs_law(df, print_stats = True):
    """Fit the Mandelbrot model to the mean ranks and frequencies in `df`.

    The fit starts from parameters [1.0, 1.0] and uses the "powell" method.
    The column 'Estimated frequency' with the model predictions for df['Rank']
    is added to `df` in place.

    Args:
        df: DataFrame with the columns 'Frequency' and 'Rank'.
        print_stats: when truthy, the fitted model prints its result summary.

    Returns:
        A tuple (df, [alpha, beta]) with the augmented frame and the two
        optimised parameters.
    """
    model = Mandelbrot(df['Frequency'], df['Rank'])
    fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                           method="powell", full_output=True, disp=0)
    model.register_fit(fit_result)

    if print_stats:
        model.print_result()

    fitted_params = model.optim_params
    alpha, beta = model.optim_params

    df['Estimated frequency'] = model.predict(fitted_params, df['Rank'])
    return df, [alpha, beta]

## Plot Zipf

In [None]:
def plot_zipf(ranks_freqs_df):
    """Sort the frame by 'Rank', fit Zipf's law and return the fitted frame.

    Despite its name this function currently draws nothing; it only sorts,
    calls zipfs_law() (which prints the fit statistics) and returns the result.

    Args:
        ranks_freqs_df: DataFrame as produced by ranks_freqs().

    Returns:
        The rank-sorted DataFrame including the estimated frequencies.
    """
    ordered = ranks_freqs_df.sort_values(by=['Rank'])
    zipf_df, _ = zipfs_law(ordered)
    return zipf_df

In [None]:
def sample_corpora(corpus, text, n=10, norm=True, subclasses=False, pos=True,size=10):
    """Estimate the Zipf-Mandelbrot parameters of a corpus from `n` sub-samples.

    The corpus is flattened by one nesting level, `n` pairs of sub-corpora of
    size*100 tokens are drawn with subsampling(), and the Mandelbrot model is
    fitted to their mean ranks and frequencies.

    Args:
        corpus: nested list that is flattened by one level before sampling.
            NOTE(review): subsampling() treats every flattened item as a
            "sentence"; verify the items are token sequences, not single tokens.
        text: label used in the column names of the intermediate frames.
        n: number of sub-corpora drawn for ranks and for frequencies.
        norm: forwarded to ranks_freqs().
        subclasses, pos: accepted for interface compatibility; unused here.
        size: every sub-corpus contains size*100 tokens.

    Returns:
        The fitted parameters [alpha, beta].
    """
    flat_corpus = [item for sublist in corpus for item in sublist]
    rank_corp, freq_corp = subsampling(flat_corpus, k=size*100, m=n)

    # Debug aid kept from the original; guarded so that n=0 does not raise here.
    if rank_corp and len(rank_corp[0]) == 0:
        print(flat_corpus)

    # Keywords make sure each sample reaches the parameter it was drawn for
    # (ranks_freqs expects the frequency sample first).
    ranks_freqs_df = ranks_freqs(freq_sents=freq_corp, rank_sents=rank_corp,
                                 text=text, norm=norm)
    _, params = zipfs_law(ranks_freqs_df, print_stats=False)

    return params

In [None]:
def mann_whitney_df(corpus, human, gpt, size, n=10, t=0, norm=True, subclasses=False):
    """Measure how far the Zipf parameters of `corpus` are from two references.

    The fitted parameters of `corpus`, `human` and `gpt` are obtained with
    sample_corpora(); the distances are Euclidean norms between the parameter
    vectors. No Mann-Whitney test is performed here, despite the name.

    Args:
        corpus: corpus under test.
        human: human-written reference corpus.
        gpt: machine-generated reference corpus.
        size, n, norm, subclasses: forwarded to sample_corpora().
        t: unused.

    Returns:
        [distance to the human parameters, distance to the gpt parameters].
    """
    shared = dict(n=n, norm=norm, subclasses=subclasses, size=size)
    reference = np.array(sample_corpora(corpus, text="C1", **shared))

    distances = []
    for candidate in (human, gpt):
        fitted = np.array(sample_corpora(candidate, text="C2", **shared))
        distances.append(np.linalg.norm(reference - fitted))

    return distances

In [None]:
def stats_dist2(corpus, human, gpt, times=10, n=10, t=0, norm=True, subclasses=False, pos=False):
    """Compare `corpus` against `times` consecutive slices of both references.

    `human` and `gpt` are cut into `times` equally sized, non-overlapping
    slices; for every slice pair mann_whitney_df() yields the distance of the
    corpus' Zipf parameters to the human and to the gpt slice.

    Args:
        corpus: corpus under test (used unchanged in every round).
        human: human-written reference documents.
        gpt: machine-generated reference documents.
        times: number of slices / comparison rounds.
        n, t, norm, subclasses: forwarded to mann_whitney_df().
        pos: unused.

    Returns:
        A list with `times` entries of the form [dist_human, dist_gpt].
    """
    if times <= 0:
        return []

    # The slice width follows `times`; it used to be fixed at a tenth of the
    # data, which was only right for the default times=10.
    len_corp = len(human) // times
    dists = []

    for i in range(times):
        window = slice(i*len_corp, (i+1)*len_corp)
        dist = mann_whitney_df(corpus, human[window], gpt[window], size=len_corp,
                               n=n, t=t, norm=norm, subclasses=subclasses)
        dists.append(dist)

    return dists

# Test (Zipf params)

In [None]:
def test(corpus, size, rep = 1, times = 10, n=10, sub=True):
    """Classify `corpus` as human-written or GPT-2-generated.

    The Zipf parameters of `corpus` are compared with `times` random slices of
    the global `human_set` and `gpt_set`; the corpus is attributed to the
    reference whose parameters are closer on average.

    Args:
        corpus: corpus under test, in the format sample_corpora() expects.
        size: number of reference documents per slice.
        rep, sub: unused.
        times: number of slices compared.
        n: number of sub-samples per parameter estimate.

    Returns:
        True when the corpus is closer to the human reference, else False.

    Raises:
        ValueError: if no distances could be computed (times <= 0).
    """
    # Shuffle copies so that the global datasets keep their order between calls.
    human = list(human_set)
    random.shuffle(human)
    gpt = list(gpt_set)
    random.shuffle(gpt)

    dists = stats_dist2(corpus, human[0:size*times], gpt[0:size*times], times=times, n=n)
    if not dists:
        raise ValueError("no distances computed; `times` must be positive")

    # Every entry is [dist_human, dist_gpt]; average per column. Comparing
    # dists[0] with dists[1] would compare two rounds, not human against gpt.
    dist_human, dist_machine = np.mean(np.asarray(dists, dtype=float), axis=0)

    if dist_human < dist_machine:
        print("corpus is human")
        return True
    else:
        print("corpus is gpt-2")
        return False


In [None]:
# Sanity check: classify the first ten entries of the human dataset.
test(human_set[0:10], 10)

In [None]:
# Classify the first ten raw (un-preprocessed) Wikipedia documents.
# NOTE(review): these are plain strings, unlike the preprocessed corpus used in
# the following cell - confirm test() is meant to accept this format.
test(wiki_set[0:10], 10)

In [None]:
# Preprocess the first ten Wikipedia documents with part-of-speech tags and
# classify the resulting corpus.
corpus = make_file(wiki_set[0:10], multi=True, pos=True)

test(corpus, 10)

# Generator

## Prompt maker

In [None]:
def make_prompt(article, begin=True):
    """Build a generation prompt from up to three sentences of `article`.

    Args:
        article: the source text.
        begin: when True the first (up to) three sentences are used, otherwise
            the last (up to) three.

    Returns:
        The selected sentences joined by single spaces; an empty string when
        the text contains no sentence at all (previously an IndexError).
    """
    article_sents = nltk.tokenize.sent_tokenize(article)

    # Slicing copes with texts of fewer than three sentences, so the separate
    # branches for one, two and three sentences are not needed.
    selected = article_sents[:3] if begin is True else article_sents[-3:]
    return ' '.join(selected)

## Section Generator

In [None]:
def generate_section(section, min_tokens, k, p, t, rep_pen, n_gram, seed):
    """Generate more than `min_tokens` words continuing the start of `section`.

    The first (up to) three sentences of `section` seed the model. Text is then
    generated in rounds of 128 new tokens; after every round the last (up to)
    three sentences of the output become the next prompt.

    Args:
        section: source text whose opening sentences form the first prompt.
        min_tokens: generation continues while the number of collected words
            is less than or equal to this value.
        k, p, t: top-k, top-p and temperature passed to model.generate().
        rep_pen: repetition penalty passed to model.generate().
        n_gram: passed as no_repeat_ngram_size to model.generate().
        seed: torch seed, re-applied before every generation round.

    Returns:
        The list of generated whitespace-separated words (prompt words of each
        round excluded), or False when the error check below triggers.
    """
    prompt = make_prompt(section, begin=True)
    prompt_len = len(prompt.split())

    in_ids = tokenizer.encode(prompt, return_tensors='pt')
    in_ids = in_ids.to("cuda")

    curr_id_len = len(in_ids[0])
    # Forcing min == max makes every round emit exactly 128 new tokens.
    max_len = min_len = curr_id_len+128

    out = []

    while len(out) <= min_tokens:

        torch.manual_seed(seed)

        out_ids = model.generate(
            in_ids,
            do_sample=True,
            max_length=max_len,
            min_length=min_len,
            top_k=k,
            top_p=p,
            temperature=t,
            repetition_penalty=rep_pen,
            no_repeat_ngram_size=n_gram
        )

        output = tokenizer.decode(out_ids[0], skip_special_tokens=True).split()
        # The prompt is stripped by word count, not by token count.
        out.extend(output[prompt_len:])

        prompt = make_prompt(" ".join(output), begin=False)
        prompt_len = len(prompt.split())

        in_ids = tokenizer.encode(prompt, return_tensors='pt')
        in_ids = in_ids.to("cuda")

        # Error control: the new prompt is as long as the whole previous round
        # (presumably no usable sentence boundary was produced), so give up.
        # The recovery code that used to follow this return was unreachable
        # and has been removed.
        if len(in_ids[0]) == curr_id_len+128:
            print('in_id error')
            return False

        curr_id_len = len(in_ids[0])
        max_len = min_len = curr_id_len+128

    return out

In [None]:
def generate(txt, k, p, t, rep_pen, n_gram, seed):
    """Generate a machine-written counterpart of `txt`, section by section.

    Sections are separated by blank lines ('\n\n'). Sections of ten words or
    fewer are skipped; every other section is replaced by generated text cut
    to the same number of words.

    Args:
        txt: the source document.
        k, p, t, rep_pen, n_gram, seed: forwarded to generate_section().

    Returns:
        The generated words joined by spaces, with "\n\n" after every section,
        or False as soon as generate_section() reports a failure.
    """
    pieces = []

    for section in txt.split('\n\n'):
        n_words = len(section.split())
        if n_words <= 10:
            continue

        generated = generate_section(
            section,
            min_tokens=n_words,
            k=k,
            p=p,
            t=t,
            rep_pen=rep_pen,
            n_gram=n_gram,
            seed=seed
        )
        if generated is False:
            return False

        pieces += generated[:n_words]
        pieces.append("\n\n")

    return " ".join(pieces)

# Rejection Sampling

### Zipf parameters Human set (1.000 token set)

In [None]:
# Reference Zipf-Mandelbrot parameters, estimated from ten sub-samples of
# 100 tokens drawn from the first ten Wikipedia documents.
corpus = make_file(wiki_set[0:10], multi=True, pos=False)

rank_corp, freq_corp = subsampling(corpus, k=100, m=10)
# Keywords route each sample to the parameter it was drawn for; positionally
# ranks_freqs() expects the frequency sample first.
ranks_freqs_df = ranks_freqs(freq_sents=freq_corp, rank_sents=rank_corp, text=None, norm=None)
ranks_freqs_df, params = zipfs_law(ranks_freqs_df, print_stats=False)

def probability(txt, params):
    """Score how well `txt` follows a Zipf-Mandelbrot law with given parameters.

    The text is preprocessed and sub-sampled, its mean ranks and frequencies
    are computed with ranks_freqs(), and the log-likelihood of these data is
    evaluated at `params`.

    Args:
        txt: raw text as a single string.
        params: the [alpha, beta] parameters to evaluate, e.g. those returned
            by zipfs_law() for a reference corpus.

    Returns:
        Whatever Mandelbrot.loglike() returns for the given parameters.
    """
    corpus = make_file(txt, multi=False, pos=False)
    rank_corp, freq_corp = subsampling(corpus, k=10, m=10)

    # The model is built from the 'Frequency' and 'Rank' columns, exactly as in
    # zipfs_law(); the raw token lists returned by subsampling() are not valid
    # model input.
    df = ranks_freqs(freq_sents=freq_corp, rank_sents=rank_corp, text=None, norm=None)
    mandel = Mandelbrot(df['Frequency'], df['Rank'])

    # NOTE(review): the fit is kept from the original in case loglike() relies
    # on a registered fit; it can presumably be dropped - confirm against mle.py.
    mandelbrot_fit = mandel.fit(start_params=np.asarray([1.0, 1.0]),
                                method="powell", full_output=True, disp=0)
    mandel.register_fit(mandelbrot_fit)
    return mandel.loglike(params, frequencies=df['Frequency'], ranks=df['Rank'])

In [None]:
def rejection(txt, threshold=float("-inf"), max_tries=21,
              k=None, p=0.95, t=1, rep_pen=1.0, n_gram=3):
    """Rejection-sample a generated counterpart of `txt`, section by section.

    For every section longer than ten words, candidates are generated with the
    seeds 0, 1, ..., max_tries-1 until one reaches the required log-likelihood
    under the module-level reference parameters `params`. Sections for which
    no candidate is accepted are left out of the result.

    Args:
        txt: source document; sections are separated by blank lines.
        threshold: minimum probability() score a candidate must reach. The
            default accepts the first successfully generated candidate; pass a
            real cut-off to enable rejection.
        max_tries: number of seeds tried per section.
        k, p, t, rep_pen, n_gram: sampling settings for generate_section().

    Returns:
        The accepted words joined by spaces, with "\n\n" after every section.
    """
    new_tokens = []

    for section in txt.split('\n\n'):
        sec_len = len(section.split())
        if sec_len <= 10:
            continue

        # The attempt counter is local to the section, so a hard section does
        # not use up the budget of the sections after it.
        for seed in range(max_tries):
            curr_snippet = generate_section(section, min_tokens=sec_len, k=k, p=p, t=t,
                                            rep_pen=rep_pen, n_gram=n_gram, seed=seed)
            if curr_snippet is False:
                # Generation failed for this seed; try the next one.
                continue

            # probability() expects a string, generate_section() yields words.
            if probability(" ".join(curr_snippet), params) >= threshold:
                new_tokens.extend(curr_snippet[:sec_len] + ["\n\n"])
                break

    return " ".join(new_tokens)

In [None]:
def generate_section(section, min_tokens, k, p, t, rep_pen, n_gram, seed):
    """Generate more than `min_tokens` words continuing the start of `section`.

    This definition replaces the earlier generate_section() of this notebook:
    instead of returning False when the error check triggers, it discards the
    most recent sentences and keeps generating.

    Args:
        section: source text whose opening sentences form the first prompt.
        min_tokens: generation continues while the number of collected words
            is less than or equal to this value.
        k, p, t: top-k, top-p and temperature passed to model.generate().
        rep_pen: repetition penalty passed to model.generate().
        n_gram: passed as no_repeat_ngram_size to model.generate().
        seed: torch seed, re-applied before every generation round.

    Returns:
        The list of generated whitespace-separated words.
    """
    
    prompt = make_prompt(section, begin=True)
    prompt_len = len(prompt.split())
        
    in_ids = tokenizer.encode(prompt, return_tensors='pt')
    in_ids = in_ids.to("cuda")
    
    curr_id_len = len(in_ids[0])
    # min == max forces every round to emit exactly 128 new tokens.
    max_len = min_len = curr_id_len+128

    out = []
    
    while len(out) <= min_tokens:
        
        # Same seed before every round, so a given prompt always yields the same text.
        torch.manual_seed(seed)
            
        out_ids = model.generate(
            in_ids,
            do_sample=True,
            max_length=max_len,
            min_length=min_len,
            top_k=k,
            top_p=p,
            temperature=t,
            repetition_penalty=rep_pen,
            no_repeat_ngram_size=n_gram
        )
        
        output = tokenizer.decode(out_ids[0], skip_special_tokens=True).split()      
        # The prompt is stripped by word count, not by token count.
        output_no_prompt = output[prompt_len:]
        out.extend(output_no_prompt)
        
        # The last (up to) three sentences of this round become the next prompt.
        output_full = " ".join(output)
        prompt = make_prompt(output_full, begin=False)
        prompt_len = len(prompt.split())
                
        in_ids = tokenizer.encode(prompt, return_tensors='pt')
        in_ids = in_ids.to("cuda")
        
        # Error control
        # Triggers when the new prompt is as long as the whole previous round
        # (presumably no usable sentence boundary was generated).
        if len(in_ids[0]) == curr_id_len+128:
            
            print('in_id error')
            
            # Drop the last two generated sentences and continue from what is left.
            # NOTE(review): with exactly two sentences new_sents becomes empty and
            # make_prompt() would presumably fail on it - confirm.
            out_sents = nltk.tokenize.sent_tokenize(" ".join(out))
            if len(out_sents) > 1:
                new_sents = " ".join(out_sents[:-2])
            else:
                new_sents = " ".join(out_sents)

            out = new_sents.split()
            
            prompt = make_prompt(new_sents, begin=False)
            prompt_len = len(prompt.split())
            
            in_ids = tokenizer.encode(prompt, return_tensors='pt')
            in_ids = in_ids.to("cuda")
            
        curr_id_len = len(in_ids[0])
        max_len = min_len = curr_id_len+128
    
    return out

In [None]:
def generate(txt, k, p, t, rep_pen, n_gram, seed):
    """Generate a machine-written counterpart of `txt`, section by section.

    This definition replaces the earlier generate() of this notebook and does
    not check for a False result from generate_section(). Sections (separated
    by '\n\n') of ten words or fewer are skipped; every other section is
    replaced by generated text cut to the same number of words.

    Args:
        txt: the source document.
        k, p, t, rep_pen, n_gram, seed: forwarded to generate_section().

    Returns:
        The generated words joined by spaces, with "\n\n" after every section.
    """
    sampling = dict(k=k, p=p, t=t, rep_pen=rep_pen, n_gram=n_gram, seed=seed)
    collected = []

    for section in txt.split('\n\n'):
        length = len(section.split())
        if length > 10:
            words = generate_section(section, min_tokens=length, **sampling)
            collected.extend(words[:length] + ["\n\n"])

    return " ".join(collected)

In [None]:
%%time

# NOTE(review): this module-level list is not read anywhere in this cell
# (test_batch() builds its own local `corpus`); it looks like a leftover.
corpus = []

def rejection(txt):
    """Return the first generated version of `txt` that test() accepts as human.

    Candidates are generated one after another with the seeds 0.0, 1.0, ...,
    20.0; generation stops at the first candidate for which test() is truthy.

    Args:
        txt: the source document.

    Returns:
        The accepted generated text, or False when all 21 seeds were rejected.
    """
    # Both generators are lazy, so every candidate is generated and tested
    # before the next seed is touched.
    candidates = (
        generate(txt, k=None, p=0.95, t=1, rep_pen=1.0, n_gram=3, seed=seed)
        for seed in np.linspace(0, 20, num=21)
    )
    return next((candidate for candidate in candidates if test(candidate, 1)), False)

def test_batch(texts):
    """Preprocess a batch of texts into one corpus and classify it with test().

    Args:
        texts: list of strings that are concatenated into a single corpus.

    Returns:
        The result of test() on the combined corpus with size 10.
    """
    return test(make_file(texts, multi=True, pos=False), 10)

# Collect generated texts that individually pass rejection() until a batch of
# ten also passes test_batch() as a whole.
texts = []
for txt in wiki_set:
  sample = rejection(txt)
  # rejection() returns False when no seed produced an accepted text.
  if sample is not False:
    texts.append(sample)
  if len(texts) == 10:
    if test_batch(texts):
      print(texts)
      break
    else:
      # The batch as a whole was rejected: discard it and start a new one.
      texts = []