# Reproduction of 'Mining the Peanut Gallery'

## Some Helper Functions

In [None]:
from builtins import enumerate, len
import pandas as pd
from ast import literal_eval
from copy import copy

In [None]:
# Experiment configuration.
# CSV inputs; each file is split into folds by ``load_dataset``.
dataset_paths = [
    "../data/df_test_1.csv",
    "../data/df_test_2.csv",
]
# n-gram sizes.
ngrams = [2, 3]
# Settings for the "replace product name" switch.
repl_prod_name = [
    True,
    False,
]
# Names of the columns that hold the serialised token lists.
tokens_columns = [
    "tokens_no_rare",
    "tokens",
]

In [None]:
def load_dataset(path):
    """Load a CSV file and split it into one (train, test) pair per fold.

    The file must contain a ``test_idx`` column. For every distinct value of
    that column, the rows carrying the value form the test frame and all
    remaining rows form the training frame. The distinct values are printed.

    Args:
        path: Location of the CSV file, handed to ``pandas.read_csv``.

    Returns:
        A list of ``(df_train, df_test)`` tuples, one per distinct
        ``test_idx`` value, in order of first appearance. The frames keep
        the row index of the loaded file.
    """
    # Read the file a single time and derive every fold from the same frame.
    df = pd.read_csv(path)

    test_idxes = df.test_idx.unique()
    print(test_idxes)

    folds = []
    for test_idx in test_idxes:
        is_test = df.test_idx == test_idx
        # Boolean indexing returns new frames, so the folds do not alias
        # one another.
        folds.append((df[~is_test], df[is_test]))

    return folds




## scoring

In [None]:
from nltk import word_tokenize
import math
from random import random


# Placeholder written in place of tokens that occur in the product title.
_REPL_PROD_TITLE_ = "_REPL_PROD_TITLE"


def create_n_grams(tokens, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    The result is empty when ``tokens`` holds fewer than ``n`` items.
    """
    window_count = len(tokens) - (n - 1)
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]

def replace_prod_name_in_token(tokens, title_tokens):
    """Mask every token that also appears among ``title_tokens``.

    Args:
        tokens: Tokens of one document.
        title_tokens: Tokens of the product title; the items must be
            hashable (e.g. strings).

    Returns:
        A new list in which each token found in ``title_tokens`` is replaced
        by ``_REPL_PROD_TITLE_``; every other token is kept as is.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole title.
    title_vocab = set(title_tokens)
    return [_REPL_PROD_TITLE_ if token in title_vocab else token for token in tokens]
        

def replace_prod_names(tokens, prod_titles):
    """Mask product-title words in every document.

    Args:
        tokens: Sequence of token lists, one per document.
        prod_titles: Sized iterable of title strings, aligned with ``tokens``
            by position.

    Returns:
        A ``pandas.Series`` with the default integer index holding one new
        token list per document, in which every token that also occurs in
        the ``word_tokenize``-d title of the same document has been replaced
        by the placeholder. ``tokens`` itself is not modified.

    Raises:
        ValueError: If ``tokens`` and ``prod_titles`` differ in length.
    """
    if len(tokens) != len(prod_titles):
        raise ValueError(
            "tokens and prod_titles must have the same length "
            f"({len(tokens)} != {len(prod_titles)})"
        )

    masked = [
        replace_prod_name_in_token(doc_tokens, word_tokenize(title))
        for doc_tokens, title in zip(tokens, prod_titles)
    ]
    return pd.Series(masked)


from nltk.stem.porter import *
# Single Porter stemmer instance shared by all calls to ``stem_doc``.
ps = PorterStemmer()


def stem_doc(doc):
    """Return a new list holding the stem of every token in ``doc``."""
    return list(map(ps.stem, doc))

def score(df_train, n_gram=2, token_column="tokens", repl_prod_name=False, do_stem=False):
    """Compute per-n-gram scores from a labelled training frame.

    Each document contributes at most one count per distinct n-gram, so the
    positive/negative counts are document counts.

    Args:
        df_train: Frame with ``token_column`` (Python-literal string of a
            token list), a ``bool_rating`` column and, when
            ``repl_prod_name`` is set, a ``product_title`` column.
        n_gram: Number of consecutive tokens per n-gram.
        token_column: Name of the column holding the serialised tokens.
        repl_prod_name: Replace tokens that occur in the product title by a
            placeholder before the n-grams are built.
        do_stem: Stem every token before the n-grams are built.

    Returns:
        Dict mapping each n-gram string to a dict with the keys ``True`` and
        ``False`` (number of positive / negative documents containing it)
        plus one entry per scoring variant: ``"s_int"``, ``"s_int_nn"``,
        ``"s_int_df"``, ``"s_int_df_len"``, ``"s_int_log(df)"``,
        ``"s_int_len_log(df)"``, ``"s_int_len"``, ``"s_int_df_log(len)"``
        and ``"s_rand"`` (uniform random value in [-1, 1)).

    Raises:
        KeyError: If a rating is neither truthy-``True`` nor ``False``
            compatible as a dict key.
        ValueError: If an n-gram is the empty string (``math.log(0)``).
    """
    eval_tokens = df_train[token_column].map(literal_eval)
    if repl_prod_name:
        eval_tokens = replace_prod_names(eval_tokens.tolist(), df_train.product_title)
    if do_stem:
        eval_tokens = eval_tokens.apply(stem_doc)
    ngrams = eval_tokens.apply(lambda row: create_n_grams(row, n_gram))
    ratings = df_train.bool_rating.tolist()

    # Count, per n-gram, how many positive and negative documents contain it.
    scoring = {}
    for rating, doc_ngrams in zip(ratings, ngrams):
        # set(): a document counts once per n-gram, however often it repeats.
        for t in set(doc_ngrams):
            counts = scoring.setdefault(t, {True: 0, False: 0})
            counts[rating] += 1

    for t, counts in scoring.items():
        c_pos = counts[True]
        c_neg = counts[False]
        # Every n-gram is counted exactly once per containing document, so
        # the document frequency is the sum of both class counts.
        tdf = c_pos + c_neg
        # Length of the n-gram string in characters (spaces included).
        t_len = len(t)

        s_int = 0 if tdf == 0 else (c_pos / tdf) - (c_neg / tdf)
        counts["s_int"] = s_int
        counts["s_int_nn"] = c_pos - c_neg

        counts["s_int_df"] = s_int * tdf
        counts["s_int_df_len"] = s_int * tdf * t_len
        counts["s_int_log(df)"] = s_int * math.log(tdf)

        counts["s_int_len_log(df)"] = s_int * t_len * math.log(tdf)
        counts["s_int_len"] = s_int * t_len

        counts["s_int_df_log(len)"] = s_int * tdf * math.log(t_len)
        # Random baseline.
        counts["s_rand"] = random() * 2 - 1

    return scoring


    

    
# scoring = score(df_train)   

In [None]:
def classify_one(tokens, scoring, scoring_method):
    """Sum the ``scoring_method`` score of every known n-gram of a document.

    Args:
        tokens: Iterable of n-gram strings of one document.
        scoring: Dict mapping n-gram -> dict of scores, as built by ``score``.
        scoring_method: Key of the score variant to add up.

    Returns:
        The summed score; ``0`` when no n-gram contributes. N-grams missing
        from ``scoring``, or lacking a ``scoring_method`` entry, are skipped.
    """
    total = 0
    for t in tokens:
        term_scores = scoring.get(t)
        # Skip n-grams never seen in training (or without this score) rather
        # than hiding every possible error behind a bare ``except``.
        if term_scores is None or scoring_method not in term_scores:
            continue
        total += term_scores[scoring_method]
    return total

def classify(df_test, scoring, n_gram=2, token_column="tokens", repl_prod_name=False, scoring_method="s_int", do_stem=False):
    """Classify every document of ``df_test`` and report the accuracy.

    A document is predicted positive when the sum of its n-gram scores is
    greater than zero, and is counted as correct when that prediction agrees
    with the truthiness of its ``bool_rating``.

    Args:
        df_test: Frame with ``token_column`` (Python-literal string of a
            token list), a ``bool_rating`` column and, when
            ``repl_prod_name`` is set, a ``product_title`` column.
        scoring: Dict mapping n-gram -> dict of scores, as built by ``score``.
        n_gram: Number of consecutive tokens per n-gram.
        token_column: Name of the column holding the serialised tokens.
        repl_prod_name: Replace product-title tokens by a placeholder first.
        scoring_method: Key of the score variant to use. The default is
            ``"s_int"``; the former default ``"int"`` is not a key written by
            ``score`` and made every document score 0.
        do_stem: Stem every token before the n-grams are built.

    Returns:
        Tuple ``(accuracy, n_correct, n_wrong)``.

    Raises:
        ZeroDivisionError: If ``df_test`` contains no rows.
    """
    eval_tokens = df_test[token_column].map(literal_eval)
    if repl_prod_name:
        eval_tokens = replace_prod_names(eval_tokens.tolist(), df_test.product_title)
    if do_stem:
        eval_tokens = eval_tokens.apply(stem_doc)

    ngrams = eval_tokens.apply(lambda row: create_n_grams(row, n_gram))
    doc_scores = ngrams.apply(lambda doc: classify_one(doc, scoring, scoring_method))

    predictions = [doc_score > 0 for doc_score in doc_scores]
    labels = df_test.bool_rating.tolist()

    # True positives and true negatives are exactly the documents whose
    # prediction and label share the same truth value.
    correct = sum(
        1 for predicted, label in zip(predictions, labels)
        if bool(predicted) == bool(label)
    )
    wrong = len(predictions) - correct
    return correct / (correct + wrong), correct, wrong
    
#classify(df_test, scoring)




In [None]:
from tqdm import tnrange, tqdm

def _evaluate_folds(datasets, scorings, n_gram, token_col, repl_prod_name, scoring_method, do_stem):
    """Classify each fold's test frame with the scores learnt on that fold's training frame."""
    resses, corrects, wrongs = [], [], []
    for dataset, scoring in zip(tqdm(datasets, desc="iteration"), scorings):
        res, correct, wrong = classify(
            dataset[1],
            scoring,
            n_gram=n_gram,
            token_column=token_col,
            do_stem=do_stem,
            repl_prod_name=repl_prod_name,
            scoring_method=scoring_method,
        )
        resses.append(res)
        corrects.append(correct)
        wrongs.append(wrong)
    return resses, corrects, wrongs


def test(datasets, n_gram=2, token_col="tokens", repl_prod_name=False, scoring_method="s_int", do_stem=False):
    """Run a cross-validated evaluation over ``(train, test)`` folds.

    Scores are learnt on each fold's training frame and applied to the test
    frame of the same fold. Progress and results are printed.

    Args:
        datasets: Sequence of ``(df_train, df_test)`` pairs.
        n_gram: Number of consecutive tokens per n-gram.
        token_col: Name of the column holding the serialised tokens.
        repl_prod_name: Replace product-title tokens by a placeholder.
        scoring_method: Either one score key (``str``) or an iterable of
            score keys to evaluate one after another.
        do_stem: Stem tokens before building n-grams.

    Returns:
        For a single method: ``(mean_fold_accuracy, pooled_accuracy)``, where
        the pooled value is total correct / total classified.
        For several methods: ``(results, None)`` where ``results`` maps each
        method to ``{"res": [...], "correct": [...], "wrongs": [...]}`` with
        one list entry per fold.

    Raises:
        ZeroDivisionError: If ``datasets`` is empty.
    """
    print("ngram=", n_gram, "token=", token_col, "repl_prod_name", repl_prod_name, "scoring_method", scoring_method, "do_stem", do_stem)

    # One scoring table per fold, trained on that fold's training frame only.
    scorings = [
        score(dataset[0], n_gram=n_gram, token_column=token_col, repl_prod_name=repl_prod_name, do_stem=do_stem)
        for dataset in tqdm(datasets, desc="iteration")
    ]

    if isinstance(scoring_method, str):
        print("single scoring")
        resses, corrects, wrongs = _evaluate_folds(
            datasets, scorings, n_gram, token_col, repl_prod_name, scoring_method, do_stem
        )
        mean_accuracy = sum(resses) / len(resses)
        pooled_accuracy = sum(corrects) / (sum(corrects) + sum(wrongs))

        print("total_res=", mean_accuracy)
        print("alt_res=", pooled_accuracy)

        return mean_accuracy, pooled_accuracy

    print("list scoring")
    results = {}
    for scoring_method_it in scoring_method:
        print("==== scoring method=", scoring_method_it, "======")
        # Fresh lists per method, so figures are neither accumulated across
        # methods nor shared between the entries of ``results``.
        resses, corrects, wrongs = _evaluate_folds(
            datasets, scorings, n_gram, token_col, repl_prod_name, scoring_method_it, do_stem
        )
        results[scoring_method_it] = {"res": resses, "correct": corrects, "wrongs": wrongs}

        print("total_res=", sum(resses) / len(resses))
        print("alt_res=", sum(corrects) / (sum(corrects) + sum(wrongs)))
    return results, None

## Table 3

In [None]:
# Load the folds of the first dataset and look at the columns of the first
# fold's training frame. The bare expression on the last line has no effect
# beyond being shown as the cell result when run in a notebook.
dataset = load_dataset(dataset_paths[0])
dataset[0][0].columns

In [None]:
# Table 3: every combination of token column, n-gram size (2, 3) and
# product-name replacement, for each dataset.
for dataset_path in dataset_paths:
    print("===== DATASET: ", dataset_path, "=======")
    # The folds depend only on the file, so load them once per dataset
    # instead of once per parameter combination.
    dataset = load_dataset(dataset_path)
    for token_col in tokens_columns:
        for n in [2, 3]:
            # NOTE: this loop variable rebinds the module-level
            # ``repl_prod_name`` list; it is ``True`` once the loop finishes.
            for repl_prod_name in [False, True]:
                print("token_col", token_col)
                test(dataset, n_gram=n, token_col=token_col, repl_prod_name=repl_prod_name)


## Table 4

In [None]:
# Table 4: unigrams with and without stemming, for each dataset.
for dataset_path in dataset_paths:
    print("===== DATASET: ", dataset_path, "=======")
    # The folds depend only on the file, so load them once per dataset.
    dataset = load_dataset(dataset_path)
    for do_stem in [True, False]:
        test(dataset, do_stem=do_stem, n_gram=1)



## Table 6

In [None]:
# Table 6: compare the scoring variants on trigrams, for each dataset.
scoring_methods = ["s_int", "s_int_nn", "s_int_df", "s_int_df_len", "s_int_log(df)", "s_int_len", "s_int_len_log(df)", "s_int_df_log(len)"]

for dataset_path in dataset_paths:
    print("===== DATASET: ", dataset_path, "=======")

    dataset = load_dataset(dataset_path)
    # The flag is spelled out instead of reading the global
    # ``repl_prod_name``, whose value depended on which cells had run before
    # (the config list ``[True, False]`` or a leftover loop value); either
    # way it was truthy, so ``True`` keeps the results unchanged.
    # NOTE(review): confirm that replacement is intended for this table.
    test(dataset, n_gram=3, token_col="tokens", repl_prod_name=True, scoring_method=scoring_methods)



# Additional Experiment: Comparing n-Grams

In [None]:
# Additional experiment: n-gram sizes 1 to 5 with default settings, for each
# dataset.
for dataset_path in dataset_paths:
    print("===== DATASET: ", dataset_path, "=======")
    # The folds depend only on the file, so load them once per dataset.
    dataset = load_dataset(dataset_path)
    for n in [1, 2, 3, 4, 5]:
        test(dataset, n_gram=n)

