In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import kenlm

%matplotlib notebook


In [2]:
# Locations of the dataset splits, relative to this notebook's directory.
# The train split exists both as JSON and as plain text; neither train path
# is referenced by the cells visible in this notebook.
PATH_TRAIN_J = '../../../../data/raw/Oscar_data/PP/pp_train_adult_2634.json'
PATH_TRAIN_T = '../../../../data/raw/Oscar_data/PP/pp_train_adult_2634.txt'
# Validation and test splits, loaded below with pd.read_json.
PATH_VAL_J = '../../../../data/raw/Oscar_data/PP/pp_val_adult_829.json'
PATH_TEST_J = '../../../../data/raw/Oscar_data/PP/pp_test_adult_829.json'

# Binary KenLM language model used to score every text.
PATH_MODEL_MINI = '../../../../models/English/PP_approach/kenlm_mini.binary'


In [3]:
def pp(log_score, length):
    """Turn a total log score into a per-token perplexity.

    The score is treated as a base-10 logarithm because it is exponentiated
    with base 10.

    Parameters
    ----------
    log_score : number
        Total log score of the text.
    length : number
        Number of tokens to normalise by. Must be non-zero; the caller is
        responsible for guarding against empty texts.

    Returns
    -------
    float
        10 raised to the negated average log score per token.
    """
    avg_neg_log_score = -log_score / length
    return 10.0 ** avg_neg_log_score

In [4]:
def get_pp(df, model=None):
    """Score every row of ``df['text']`` and store the perplexity in ``df['pp_score']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column of strings. It is modified in place:
        a ``'pp_score'`` column is added (or overwritten).
    model : object, optional
        Language model exposing ``score(text, bos=..., eos=...)``. When
        omitted, the notebook-level ``kenlm_mini`` model is used, so existing
        ``get_pp(df)`` calls keep working; passing the model explicitly
        removes the dependency on cell execution order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment.

    Notes
    -----
    * Texts without any whitespace-separated token get the sentinel score
      ``0`` (a real perplexity is never below 1); the model is not queried
      for them.
    * NOTE(review): the score is requested with ``eos=True`` but the token
      count used for normalisation does not include the end-of-sentence
      token. kenlm's own ``Model.perplexity`` appears to normalise by
      ``len(words) + 1`` -- confirm whether this difference is intended
      before comparing values across tools.
    """
    if model is None:
        # Backward-compatible fallback to the model loaded at notebook level.
        model = kenlm_mini

    def _perplexity(text):
        n_tokens = len(text.split())
        if n_tokens == 0:
            # Nothing to normalise by; keep the original sentinel value.
            return 0
        log_score = model.score(text, bos=True, eos=True)
        return pp(log_score, n_tokens)

    df['pp_score'] = [_perplexity(text) for text in df['text']]
    return df

In [5]:
def distributions_pp (df):
    """Summarise and plot ``pp_score`` separately for each ``annotation`` value.

    For annotation 1 and then annotation 0, prints a header and displays the
    transposed, 2-decimal-rounded ``describe()`` table of the matching rows.
    Afterwards draws a KDE of ``pp_score`` coloured by ``annotation`` and, in
    a new figure, a box plot of ``pp_score`` per ``annotation``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``'annotation'`` and ``'pp_score'``
        columns.
    """
    # Per-class summary statistics, positive class first.
    for label, header in ((1, '---- 1 -----'), (0, '\n---- 0 -----')):
        print(header)
        subset = df[df['annotation'] == label]
        display(subset.describe().T.round(2))

    # Density of the scores, one curve per annotation value.
    kde_data = df[['pp_score', 'annotation']]
    sns.displot(kde_data, x="pp_score", hue="annotation", kind="kde", palette="PRGn")
    plt.show()

    # Box plot goes into its own, freshly created figure.
    plt.figure()
    box_data = df[['annotation', 'pp_score']]
    sns.boxplot(x='annotation', y='pp_score', data=box_data, palette="PRGn")
    plt.show()

In [6]:
# Load the validation and test splits and renumber their rows from 0,
# discarding whatever index the JSON files carried.
df_val = pd.read_json(PATH_VAL_J).reset_index(drop=True)
df_test = pd.read_json(PATH_TEST_J).reset_index(drop=True)


In [7]:
# Load the KenLM model once; get_pp reads this notebook-level name when it
# is called, so this cell must run before any scoring cell.
kenlm_mini = kenlm.Model(PATH_MODEL_MINI)

In [8]:
# Validation split: add the 'pp_score' column, preview the first rows, then
# show per-annotation summary statistics and distribution plots.
df_val = get_pp (df_val)
display(df_val.head())
distributions_pp (df_val)

Unnamed: 0,text,annotation,pp_score
0,#ubuntu-qc 2011-09-05\n<qwebirc43563> BOnjours...,0,136306.340204
1,Anime is a phenomenon that has taken off like ...,1,1024.710015
2,Vasoconstrictive effects of human post-hemorrh...,0,9790.022533
3,Numerous infants born to women consuming foods...,0,8104.352443
4,A nonsurgical approach to low back pain.\nLow ...,0,6850.865305


---- 1 -----


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annotation,329.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
pp_score,329.0,1965.68,2031.81,1.47,862.51,1588.82,2604.13,18392.81



---- 0 -----


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annotation,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pp_score,500.0,47053.96,133169.4,0.0,2960.16,6041.88,15494.73,1071815.65


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Test split: same scoring, preview and per-annotation summary/plots as for
# the validation split.
df_test = get_pp (df_test)
display(df_test.head())
distributions_pp (df_test)

Unnamed: 0,text,annotation,pp_score
0,1. Field of the Invention\nThe present inventi...,0,4607.550898
1,247CoupleCams.com – One place for the collecti...,1,4.828216
2,Epidemia de gripe (debate) \nPresidente\nSegue...,0,223389.604669
3,The most random thing happened to me on Sunday...,1,1954.448474
4,Gustavo Cañete\n\nGustavo Cañete (born April 4...,0,171762.800343


---- 1 -----


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annotation,329.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
pp_score,329.0,1738.33,2319.46,1.36,629.1,1314.29,2304.01,33576.6



---- 0 -----


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
annotation,500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pp_score,500.0,114947.77,834583.6,0.0,3161.08,6438.12,17113.67,17481780.34


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>