# Reproduktion 'Mining the Peanut Gallary'

## Some Helper Functions

In [1]:
from builtins import enumerate, len
import pandas as pd
from ast import literal_eval
from copy import copy

In [2]:
dataset_paths = [ "../data/df_test_1.csv", "../data/df_test_2.csv"]
ngrams = [2,3]
repl_prod_name = [True, False]
tokens = ["common_tokens", "tokens"]

In [3]:
def load_dataset(path):
    df = pd.read_csv(path)

    train_idxes = df.test_idx.unique()
    print(train_idxes)

    res = []

    for train_idx in train_idxes:
        mask = df.test_idx == train_idx

        df_train = pd.read_csv(path)
        df_test = copy(df_train)

        df_train = df_train[~mask]
        df_test = df_test[mask]

        res += [(df_train, df_test)]

    return res




## scoring

In [4]:
from nltk import word_tokenize
import math
from random import random


_REPL_PROD_TITLE_ = "_REPL_PROD_TITLE"
def create_n_grams(tokens, n):
    res = []
    for it in range(0, len(tokens) -(n-1)):
        res.append(" ". join(tokens[it: it +n]))
    return res

def replace_prod_name_in_token(tokens, title_tokens):
    return [_REPL_PROD_TITLE_ if token in title_tokens else token for token in tokens]
        

def replace_prod_names(tokens, prod_titles):
    df_repl = pd.DataFrame(tokens, prod_titles)
    for it , title in enumerate(prod_titles):
        title_tokens = word_tokenize(title)
        tokens[it] = replace_prod_name_in_token(tokens[it], title_tokens)
        
    return pd.Series(tokens)


from nltk.stem.porter import *
ps = PorterStemmer()
def stem_doc(doc):
    return [ps.stem(t) for t in doc]

def score(df_train, n_gram=2, token_column="tokens", repl_prod_name=False, do_stem=False):
    eval_tokens = df_train[token_column].map(literal_eval)
    if repl_prod_name:
        #print("replace prod name")
        eval_tokens = replace_prod_names(eval_tokens.tolist(), df_train.product_title )
    if do_stem:
        eval_tokens = eval_tokens.apply(stem_doc)
    #print("create ngrans")
    ngrams = eval_tokens.apply(lambda row: create_n_grams(row, n_gram))
    #print("score")
    ratings = df_train.bool_rating.tolist()

    pos_total = ratings.count(True)
    neg_total = ratings.count(False)
    
    #print("pos total=", pos_total, " neg_total=", neg_total)
    scoring = {}
    document_frequency = {}


    for it, d in enumerate(ngrams):
        rating = ratings[it]
        for t in list(set(d)):
            try: 
                scoring[t][rating] += 1
            except:
                scoring[t] = {True:0, False:0}
                scoring[t][rating] += 1
            
            try: 
                document_frequency[t] += 1
            except:
                document_frequency[t] = 1
    
   # print("calculate scoring")
    
    for t in scoring:
        c_pos = scoring[t][True]
        c_neg = scoring[t][False]
        tdf = document_frequency[t]
        t_len = len(t)
        
        s_int = 0 if c_pos + c_neg is 0 else (c_pos /(c_pos + c_neg)) - (c_neg / (c_pos + c_neg))
        s_int_not_normalized = c_pos - c_neg
        scoring[t]["s_int"] = s_int
        scoring[t]["s_int_nn"] = s_int_not_normalized
        
        scoring[t]["s_int_df"] = s_int * tdf
        scoring[t]["s_int_df_len"] = s_int * tdf * t_len
        scoring[t]["s_int_log(df)"] = s_int * math.log(tdf)

        scoring[t]["s_int_len_log(df)"] = s_int * t_len * math.log(tdf)
        scoring[t]["s_int_df_log(len)"] = s_int * tdf * math.log(t_len)
        scoring[t]["s_rand"] = random() * 2 -1
    
    return scoring


    

    
# scoring = score(df_train)   

In [5]:
def classify_one(tokens, scoring, scoring_method):
    res = 0
    for t in tokens:
        try:
            res += scoring[t][scoring_method]
        except:
            continue
            print("did not found ",t)
    return res

def classify(df_test, scoring, n_gram=2, token_column="tokens", repl_prod_name=False, scoring_method="int", do_stem=False):
    #print("eval tokens")
    eval_tokens = df_test[token_column].map(literal_eval)
    if repl_prod_name:
        #print("replace prod name")
        eval_tokens = replace_prod_names(eval_tokens.tolist(), df_test.product_title )
    
    if do_stem:
        eval_tokens = eval_tokens.apply(stem_doc)


    #print("create ngrans")
    ngrams = eval_tokens.apply(lambda row: create_n_grams(row, n_gram))
    #print("score")
    bool_rating = df_test.bool_rating.tolist()
    doc_scoring = ngrams.apply(lambda tokens: classify_one(tokens, scoring, scoring_method))
    bool_scoring = doc_scoring.apply(lambda x: x > 0)
    
    #print("check results")
            
    tp, tn, fp, fn = 0,0,0,0
    for it, score in enumerate(bool_scoring.tolist()):
        if score and bool_rating[it]:
            tp += 1
        elif score and not bool_rating[it]:
            fp += 1
        elif not score and bool_rating[it]:
            fn += 1
        elif not score and not bool_rating[it]:
            tn += 1
    #print("pos", bool_scoring.value_counts())
    #print("tp=", tp,"tn=", tn,"fp=", fp,"fn=", fn)
    #print("scoring=", sum([tp, tn])/sum([tp, tn, fp, fn]))
    return sum([tp, tn])/sum([tp, tn, fp, fn]), sum([tp, tn]), sum([fp, fn])
    
#classify(df_test, scoring)




In [6]:
from tqdm import tnrange, tqdm

def test(datasets, n_gram=2, token_col="tokens", repl_prod_name=False, scoring_method="s_int", do_stem=False):
    print("ngram=", n_gram, "token=", token_col, "repl_prod_name", repl_prod_name, "scoring_method", scoring_method, "do_stem", do_stem)
    resses = []
    corrects = []
    wrongs = []
    scorings = []
    for it, dataset in tqdm(enumerate(datasets), desc="iteration"):
        df_train = dataset[0]

        #print("score")
        scoring = score(df_train, n_gram=n_gram, token_column=token_col, repl_prod_name=repl_prod_name, do_stem=do_stem) 
        scorings.append(scoring)
        #print("classify")
    if type(scoring_method) is str:
        print("single scoring")
        for it, dataset in tqdm(enumerate(datasets), desc="iteration"):
            df_test = dataset[1]
            res, correct, wrong = classify(df_test, scoring, n_gram=n_gram, token_column=token_col, do_stem=do_stem, repl_prod_name=repl_prod_name, scoring_method=scoring_method)
            resses.append(res)
            corrects.append(correct)
            wrongs.append(wrong)

        print("total_res=", sum(resses)/len(resses)) 
        print("alt_res=", sum(corrects)/(sum(corrects) + sum(wrongs)))
    
        return sum(resses)/len(resses), sum(corrects)/(sum(corrects) + sum(wrongs))
    else:
        print("list scoring")


        results = {}
        for scoring_method_it in scoring_method:
            print("==== scoring method=", scoring_method_it, "======")
            for it, dataset in tqdm(enumerate(datasets), desc="iteration"):
                df_test = dataset[1]
                res, correct, wrong = classify(df_test, scoring, n_gram=n_gram, token_column=token_col, do_stem=do_stem, repl_prod_name=repl_prod_name, scoring_method=scoring_method_it)
                resses.append(res)
                corrects.append(correct)
                wrongs.append(wrong)
            results[scoring_method_it] = {"res": resses, "correct": corrects, "wrongs": wrongs}

            print("total_res=", sum(resses)/len(resses)) 
            print("alt_res=", sum(corrects)/(sum(corrects) + sum(wrongs)))
        return results, None

## Table 3

In [7]:
for dataset_path in dataset_paths: 
    print("===== DATASET: ", dataset_path, "=======")
    for token_col in ["tokens", "common_tokens"]:
        for n in [2,3]:
            for repl_prod_name in [False, True]:
                dataset = load_dataset(dataset_path)
                print("token_col", token_col)
                test(dataset, n_gram=n, token_col=token_col, repl_prod_name=repl_prod_name)


[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 2 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 7it [00:08,  1.19s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 7it [00:01,  6.28it/s]


total_res= 0.9580188456674507
alt_res= 0.959705734826952
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 2 token= tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 7it [00:28,  4.18s/it]
iteration: 1it [00:00,  9.02it/s]

single scoring


iteration: 7it [00:03,  2.32it/s]


total_res= 0.9595632988575821
alt_res= 0.960040127069052
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 3 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 7it [00:09,  1.38s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 7it [00:01,  6.08it/s]


total_res= 0.9785270629991126
alt_res= 0.979769269352951
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 3 token= tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 7it [00:30,  4.38s/it]
iteration: 1it [00:00,  9.27it/s]

single scoring


iteration: 7it [00:03,  2.32it/s]


total_res= 0.9790594498669033
alt_res= 0.980270857716101
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 2 token= common_tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 7it [00:07,  1.10s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 7it [00:01,  5.96it/s]


total_res= 0.9213329413015868
alt_res= 0.9160675472329042
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 2 token= common_tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 7it [00:28,  4.06s/it]
iteration: 1it [00:00,  8.56it/s]

single scoring


iteration: 7it [00:03,  2.42it/s]


total_res= 0.9214860208676924
alt_res= 0.9162347433539542
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 3 token= common_tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 7it [00:09,  1.33s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 7it [00:01,  6.27it/s]


total_res= 0.977284826974268
alt_res= 0.9785988965056011
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 3 token= common_tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 7it [00:29,  4.28s/it]
iteration: 1it [00:00,  8.63it/s]

single scoring


iteration: 7it [00:03,  2.32it/s]


total_res= 0.977639751552795
alt_res= 0.978933288747701
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 2 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 10it [00:08,  1.15it/s]
iteration: 2it [00:00, 12.20it/s]

single scoring


iteration: 10it [00:00, 12.55it/s]


total_res= 0.9982142857142857
alt_res= 0.9982142857142857
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 2 token= tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 10it [00:31,  3.09s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:02,  3.49it/s]


total_res= 0.9982142857142857
alt_res= 0.9982142857142857
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 3 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 10it [00:10,  1.02s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:00, 11.16it/s]


total_res= 0.9984375
alt_res= 0.9984375
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col tokens
ngram= 3 token= tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 10it [00:32,  3.25s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:02,  3.32it/s]


total_res= 0.9970982142857142
alt_res= 0.9970982142857143
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 2 token= common_tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 10it [00:08,  1.22it/s]
iteration: 2it [00:00, 14.04it/s]

single scoring


iteration: 10it [00:00, 13.02it/s]


total_res= 0.9973214285714287
alt_res= 0.9973214285714286
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 2 token= common_tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 10it [00:30,  3.01s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:02,  3.42it/s]


total_res= 0.9970982142857142
alt_res= 0.9970982142857143
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 3 token= common_tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 10it [00:09,  1.00s/it]
iteration: 2it [00:00, 13.24it/s]

single scoring


iteration: 10it [00:00, 12.33it/s]


total_res= 0.9984375
alt_res= 0.9984375
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

token_col common_tokens
ngram= 3 token= common_tokens repl_prod_name True scoring_method s_int do_stem False


iteration: 10it [00:32,  3.18s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:02,  3.26it/s]

total_res= 0.9970982142857142
alt_res= 0.9970982142857143





## Table 4

In [8]:
for dataset_path in dataset_paths: 
    print("===== DATASET: ", dataset_path, "=======")
    for do_stem in [True, False]:
        dataset = load_dataset(dataset_path)
        test(dataset, do_stem=do_stem, n_gram=1)



[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

ngram= 1 token= tokens repl_prod_name False scoring_method s_int do_stem True


iteration: 7it [00:48,  7.18s/it]
iteration: 1it [00:00,  6.19it/s]

single scoring


iteration: 7it [00:07,  1.07it/s]


total_res= 0.8391676478991427
alt_res= 0.8373181742183581
[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

ngram= 1 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 7it [00:06,  1.07it/s]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 7it [00:00,  7.41it/s]


total_res= 0.8393075667462113
alt_res= 0.8374853703394082
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

ngram= 1 token= tokens repl_prod_name False scoring_method s_int do_stem True


iteration: 10it [00:49,  4.95s/it]
iteration: 0it [00:00, ?it/s]

single scoring


iteration: 10it [00:05,  1.75it/s]


total_res= 0.9613839285714285
alt_res= 0.9613839285714286
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

ngram= 1 token= tokens repl_prod_name False scoring_method s_int do_stem False


iteration: 10it [00:06,  1.53it/s]
iteration: 2it [00:00, 15.71it/s]

single scoring


iteration: 10it [00:00, 14.73it/s]

total_res= 0.9665178571428571
alt_res= 0.9665178571428571





## Table 6

In [9]:
for dataset_path in dataset_paths: 
    print("===== DATASET: ", dataset_path, "=======")

    scoring_methods = ["s_int", "s_rand", "s_int_df", "s_int_df_len", "s_int_log(df)", "s_int_len_log(df)", "s_int_df_log(len)"]
    dataset = load_dataset(dataset_path)
    test(dataset, n_gram=3, token_col="tokens", repl_prod_name=repl_prod_name, scoring_method=scoring_methods)



[0 1 2 3 4 5 6]


iteration: 0it [00:00, ?it/s]

ngram= 3 token= tokens repl_prod_name True scoring_method ['s_int', 's_rand', 's_int_df', 's_int_df_len', 's_int_log(df)', 's_int_len_log(df)', 's_int_df_log(len)'] do_stem False


iteration: 7it [00:29,  4.34s/it]
iteration: 1it [00:00,  8.62it/s]

list scoring


iteration: 7it [00:03,  2.23it/s]
iteration: 1it [00:00,  9.46it/s]

total_res= 0.9790594498669033
alt_res= 0.980270857716101


iteration: 7it [00:03,  2.34it/s]
iteration: 1it [00:00,  9.61it/s]

total_res= 0.7394713373615807
alt_res= 0.739257649222538


iteration: 7it [00:03,  2.32it/s]
iteration: 1it [00:00,  9.37it/s]

total_res= 0.8073841250715336
alt_res= 0.8095636181240595


iteration: 7it [00:03,  2.35it/s]
iteration: 1it [00:00,  9.42it/s]

total_res= 0.8425213493084811
alt_res= 0.8457197793011202


iteration: 7it [00:03,  2.30it/s]
iteration: 1it [00:00,  9.23it/s]

total_res= 0.8646958967396712
alt_res= 0.8686841665273366


iteration: 7it [00:03,  2.35it/s]
iteration: 1it [00:00,  9.50it/s]

total_res= 0.8802618177933679
alt_res= 0.8845232123948058


iteration: 7it [00:03,  2.31it/s]


total_res= 0.8895237661311943
alt_res= 0.8941409702152053
[0 1 2 3 4 5 6 7 8 9]


iteration: 0it [00:00, ?it/s]

ngram= 3 token= tokens repl_prod_name True scoring_method ['s_int', 's_rand', 's_int_df', 's_int_df_len', 's_int_log(df)', 's_int_len_log(df)', 's_int_df_log(len)'] do_stem False


iteration: 10it [00:32,  3.22s/it]
iteration: 0it [00:00, ?it/s]

list scoring


iteration: 10it [00:02,  3.44it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.9970982142857142
alt_res= 0.9970982142857143


iteration: 10it [00:02,  3.44it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.7431919642857142
alt_res= 0.7431919642857143


iteration: 10it [00:02,  3.34it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.8235119047619047
alt_res= 0.8235119047619047


iteration: 10it [00:02,  3.43it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.8647321428571428
alt_res= 0.8647321428571428


iteration: 10it [00:02,  3.42it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.8871875
alt_res= 0.8871875


iteration: 10it [00:02,  3.44it/s]
iteration: 0it [00:00, ?it/s]

total_res= 0.9019717261904762
alt_res= 0.9019717261904762


iteration: 10it [00:02,  3.41it/s]


total_res= 0.9140943877551019
alt_res= 0.914094387755102
