In [1]:
import importlib
import text_rank
importlib.reload(text_rank)
import os
import string
from tqdm.notebook import tqdm

In [2]:
def evaluate_keywords(predicted_keywords, labeled_keywords):
    tp = 0
    fp = 0
    fn = 0
    labeled_keywords_lower = [keyword.lower().strip() for keyword in labeled_keywords]
    predicted_keywords_lower = [keyword.lower().strip() for keyword in predicted_keywords]
    for keyword in predicted_keywords_lower:
        if keyword in labeled_keywords_lower:
            tp += 1
        else:
            fp += 1
    for keyword in labeled_keywords:
        if not keyword in predicted_keywords_lower:
            fn += 1
    return tp, fp, fn

def read_file(file_path):
    with open(file_path, 'r') as f:
        text = f.read()
        printable = set(string.printable)
        text = ''.join(filter(lambda x: x in printable, text))
        return text

In [3]:
abstracts_dir = 'Inspec/docsutf8'
keywords_dir = 'Inspec/keys'

tokenizer = text_rank.ArticleTokenizer()

def run_config_single(abstract_path, keywords_path, co_occurances, directed, max_keywords, max_keyword_length):
    abstract = read_file(abstract_path)
    labeled_keywords_raw = read_file(keywords_path)
    labeled_keywords = [keyword.strip() for keyword in labeled_keywords_raw.split('\n') if bool(keyword.strip())]

    
    candidate_keywords = tokenizer.extract_keywords(abstract, 1)
    tagged_candidate_keywords = tokenizer.tag_keywords(candidate_keywords)
    words_list = tokenizer.extract_words(abstract)

    word_graph = text_rank.WordGraph(candidate_keywords, words_list, max_distance=co_occurances, directed=directed)
    word_graph.calculate_pagerank()

    predicted_keywords = word_graph.postprocess_keywords_chronological(max_keywords, max_keyword_length)

    return predicted_keywords, labeled_keywords

def test_config(co_occurances, directed, max_keywords, max_keyword_length):
    tp = 0
    fp = 0
    fn = 0
    file_names = [file_path.split('.')[0] for file_path in os.listdir(keywords_dir)]
    for file_name in tqdm(file_names):
        abstract_path = os.path.join(abstracts_dir, file_name + '.txt')
        keywords_path = os.path.join(keywords_dir, file_name + '.key')

        predicted_keywords, labeled_keywords = run_config_single(abstract_path, keywords_path, co_occurances, directed, max_keywords, max_keyword_length)
        local_tp, local_fp, local_fn = evaluate_keywords(predicted_keywords, labeled_keywords)

        tp += local_tp
        fp += local_fp
        fn += local_fn
    return tp, fp, fn

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iraha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\iraha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iraha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iraha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [84]:
tunable_params = {
    'co_occurances': [2, 3, 5, 10],
    'directed': [0, 1],
    'max_keywords': [2, 3, 4, 5, 10, 15, 20],
    'max_keyword_length': [3]
}
def generate_results_list(tunable_params, idx):
    tmp_list = []
    keys = list(tunable_params.keys())
    for i in range(len(tunable_params[keys[idx]])):
        if idx + 1 < len(tunable_params.keys()):
            tmp_list.append(generate_results_list(tunable_params, idx + 1))
    return tmp_list

results = generate_results_list(tunable_params, 0)
for i, co_occurances in enumerate(tunable_params['co_occurances']):
    for j, directed in enumerate(tunable_params['directed']):
        for k, max_keywords in enumerate(tunable_params['max_keywords']):
            for l, max_keyword_length in enumerate(tunable_params['max_keyword_length']):
                tp, fp, fn = test_config(co_occurances, directed, max_keywords, max_keyword_length)
                results[i][j][k].append({
                    'co_occurances': co_occurances,
                    'directed': directed,
                    'max_keywords': max_keywords,
                    'max_keyword_length': max_keyword_length,
                    'results': {
                        'TP': tp,
                        'FP': fp,
                        'FN': fn
                    }
                })

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [85]:
print(results)

[[[[{'co_occurances': 2, 'directed': 0, 'max_keywords': 2, 'max_keyword_length': 3, 'results': {'TP': 1329, 'FP': 7786, 'FN': 26980}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 3, 'max_keyword_length': 3, 'results': {'TP': 1695, 'FP': 10334, 'FN': 26644}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 4, 'max_keyword_length': 3, 'results': {'TP': 2039, 'FP': 12438, 'FN': 26329}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 5, 'max_keyword_length': 3, 'results': {'TP': 2278, 'FP': 14320, 'FN': 26108}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 10, 'max_keyword_length': 3, 'results': {'TP': 3265, 'FP': 21198, 'FN': 25180}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 15, 'max_keyword_length': 3, 'results': {'TP': 4076, 'FP': 26036, 'FN': 24427}}], [{'co_occurances': 2, 'directed': 0, 'max_keywords': 20, 'max_keyword_length': 3, 'results': {'TP': 4790, 'FP': 30194, 'FN': 23761}}]], [[{'co_occurances': 2, 'directed': 1, 'max_keywords': 2,

In [15]:
import pandas as pd
import numpy as np

In [87]:
results_array = np.array(results).flatten()
results_array

array([{'co_occurances': 2, 'directed': 0, 'max_keywords': 2, 'max_keyword_length': 3, 'results': {'TP': 1329, 'FP': 7786, 'FN': 26980}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 3, 'max_keyword_length': 3, 'results': {'TP': 1695, 'FP': 10334, 'FN': 26644}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 4, 'max_keyword_length': 3, 'results': {'TP': 2039, 'FP': 12438, 'FN': 26329}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 5, 'max_keyword_length': 3, 'results': {'TP': 2278, 'FP': 14320, 'FN': 26108}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 10, 'max_keyword_length': 3, 'results': {'TP': 3265, 'FP': 21198, 'FN': 25180}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 15, 'max_keyword_length': 3, 'results': {'TP': 4076, 'FP': 26036, 'FN': 24427}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 20, 'max_keyword_length': 3, 'results': {'TP': 4790, 'FP': 30194, 'FN': 23761}},
       {'co_occurances': 

In [88]:
df = pd.DataFrame(columns=['Co-Occurances', 'Directed Graph', 'Top N Keywords', 'Max Keyword Phrase Length', 'True Positives', 'False Positives', 'False Negatives', 'Precision', 'Recall', 'F-Score'])
for i, result in enumerate(results_array):
    if result['directed']:
        directed = 'Yes'
    else:
        directed = 'No'
    precision = result['results']['TP'] / (result['results']['TP'] + result['results']['FP'])
    recall = result['results']['TP'] / (result['results']['TP'] + result['results']['FN'])
    f_score = 2 * precision * recall / (precision + recall)
    df.loc[i] = [
        result['co_occurances'],
        directed,
        result['max_keywords'],
        result['max_keyword_length'],
        result['results']['TP'],
        result['results']['FP'],
        result['results']['FN'],
        precision * 100,
        recall * 100,
        f_score
    ]
df

Unnamed: 0,Co-Occurances,Directed Graph,Top N Keywords,Max Keyword Phrase Length,True Positives,False Positives,False Negatives,Precision,Recall,F-Score
0,2,No,2,3,1329,7786,26980,14.580362,4.69462,0.071024
1,2,No,3,3,1695,10334,26644,14.090947,5.981157,0.083977
2,2,No,4,3,2039,12438,26329,14.08441,7.187676,0.09518
3,2,No,5,3,2278,14320,26108,13.724545,8.025083,0.10128
4,2,No,10,3,3265,21198,25180,13.346687,11.478291,0.123422
5,2,No,15,3,4076,26036,24427,13.536132,14.300249,0.139077
6,2,No,20,3,4790,30194,23761,13.691973,16.776996,0.150783
7,2,Yes,2,3,1229,5957,27117,17.1027,4.335709,0.069177
8,2,Yes,3,3,1677,7873,26706,17.560209,5.908466,0.088419
9,2,Yes,4,3,2038,9683,26391,17.387595,7.168736,0.101519


In [89]:
df.to_excel("no_filter.xlsx") 

In [90]:
def test_config_adj_noun(co_occurances, directed, max_keywords, max_keyword_length):
    tp = 0
    fp = 0
    fn = 0
    file_names = [file_path.split('.')[0] for file_path in os.listdir(keywords_dir)]
    for file_name in tqdm(file_names):
        abstract_path = os.path.join(abstracts_dir, file_name + '.txt')
        keywords_path = os.path.join(keywords_dir, file_name + '.key')

        predicted_keywords, labeled_keywords = run_config_single(abstract_path, keywords_path, co_occurances, directed, max_keywords, max_keyword_length)
        tagged_keywords = tokenizer.tag_keywords(predicted_keywords)
        predicted_keywords = list(set([tagged_keyword[0] for tagged_keyword in tagged_keywords if (tagged_keyword[1][0] == 'N' or tagged_keyword[1][0] == 'J') and len(tagged_keyword[0]) > 1]))
        
        local_tp, local_fp, local_fn = evaluate_keywords(predicted_keywords, labeled_keywords)

        tp += local_tp
        fp += local_fp
        fn += local_fn
    return tp, fp, fn

results = generate_results_list(tunable_params, 0)
for i, co_occurances in enumerate(tunable_params['co_occurances']):
    for j, directed in enumerate(tunable_params['directed']):
        for k, max_keywords in enumerate(tunable_params['max_keywords']):
            for l, max_keyword_length in enumerate(tunable_params['max_keyword_length']):
                tp, fp, fn = test_config_adj_noun(co_occurances, directed, max_keywords, max_keyword_length)
                results[i][j][k].append({
                    'co_occurances': co_occurances,
                    'directed': directed,
                    'max_keywords': max_keywords,
                    'max_keyword_length': max_keyword_length,
                    'results': {
                        'TP': tp,
                        'FP': fp,
                        'FN': fn
                    }
                })

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [91]:
results_array = np.array(results).flatten()
results_array

array([{'co_occurances': 2, 'directed': 0, 'max_keywords': 2, 'max_keyword_length': 3, 'results': {'TP': 1151, 'FP': 6023, 'FN': 27149}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 3, 'max_keyword_length': 3, 'results': {'TP': 1450, 'FP': 7847, 'FN': 26882}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 4, 'max_keyword_length': 3, 'results': {'TP': 1710, 'FP': 9368, 'FN': 26651}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 5, 'max_keyword_length': 3, 'results': {'TP': 1897, 'FP': 10720, 'FN': 26484}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 10, 'max_keyword_length': 3, 'results': {'TP': 2726, 'FP': 15747, 'FN': 25703}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 15, 'max_keyword_length': 3, 'results': {'TP': 3361, 'FP': 19264, 'FN': 25137}},
       {'co_occurances': 2, 'directed': 0, 'max_keywords': 20, 'max_keyword_length': 3, 'results': {'TP': 3970, 'FP': 22348, 'FN': 24572}},
       {'co_occurances': 2,

In [92]:
df = pd.DataFrame(columns=['Co-Occurances', 'Directed Graph', 'Top N Keywords', 'Max Keyword Phrase Length', 'True Positives', 'False Positives', 'False Negatives', 'Precision', 'Recall', 'F-Score'])
for i, result in enumerate(results_array):
    if result['directed']:
        directed = 'Yes'
    else:
        directed = 'No'
    precision = result['results']['TP'] / (result['results']['TP'] + result['results']['FP'])
    recall = result['results']['TP'] / (result['results']['TP'] + result['results']['FN'])
    f_score = 2 * precision * recall / (precision + recall)
    df.loc[i] = [
        result['co_occurances'],
        directed,
        result['max_keywords'],
        result['max_keyword_length'],
        result['results']['TP'],
        result['results']['FP'],
        result['results']['FN'],
        precision * 100,
        recall * 100,
        f_score
    ]
df

Unnamed: 0,Co-Occurances,Directed Graph,Top N Keywords,Max Keyword Phrase Length,True Positives,False Positives,False Negatives,Precision,Recall,F-Score
0,2,No,2,3,1151,6023,27149,16.044048,4.067138,0.064893
1,2,No,3,3,1450,7847,26882,15.596429,5.117888,0.077068
2,2,No,4,3,1710,9368,26651,15.435999,6.029407,0.086716
3,2,No,5,3,1897,10720,26484,15.03527,6.684049,0.092541
4,2,No,10,3,2726,15747,25703,14.756672,9.5888,0.116242
5,2,No,15,3,3361,19264,25137,14.855249,11.79381,0.131487
6,2,No,20,3,3970,22348,24572,15.084733,13.909327,0.144732
7,2,Yes,2,3,1061,4694,27269,18.436142,3.745146,0.062256
8,2,Yes,3,3,1440,6104,26928,19.088017,5.076142,0.080196
9,2,Yes,4,3,1736,7434,26681,18.931298,6.109019,0.092372


In [93]:
df.to_excel("filter.xlsx") 

In [12]:
def run_config_single_damp(abstract_path, keywords_path, co_occurances, directed, max_keywords, max_keyword_length, damp_factor):
    abstract = read_file(abstract_path)
    labeled_keywords_raw = read_file(keywords_path)
    labeled_keywords = [keyword.strip() for keyword in labeled_keywords_raw.split('\n') if bool(keyword.strip())]

    
    candidate_keywords = tokenizer.extract_keywords(abstract, 1)
    tagged_candidate_keywords = tokenizer.tag_keywords(candidate_keywords)
    words_list = tokenizer.extract_words(abstract)

    word_graph = text_rank.WordGraph(candidate_keywords, words_list, max_distance=co_occurances, directed=directed)
    word_graph.calculate_pagerank(damp_factor=damp_factor)

    predicted_keywords = word_graph.postprocess_keywords_chronological(max_keywords, max_keyword_length)

    return predicted_keywords, labeled_keywords

def test_config_adj_noun(co_occurances, directed, max_keywords, max_keyword_length, damp_factor):
    tp = 0
    fp = 0
    fn = 0
    file_names = [file_path.split('.')[0] for file_path in os.listdir(keywords_dir)]
    for file_name in tqdm(file_names):
        abstract_path = os.path.join(abstracts_dir, file_name + '.txt')
        keywords_path = os.path.join(keywords_dir, file_name + '.key')

        predicted_keywords, labeled_keywords = run_config_single_damp(abstract_path, keywords_path, co_occurances, directed, max_keywords, max_keyword_length, damp_factor)
        tagged_keywords = tokenizer.tag_keywords(predicted_keywords)
        predicted_keywords = list(set([tagged_keyword[0] for tagged_keyword in tagged_keywords if (tagged_keyword[1][0] == 'N' or tagged_keyword[1][0] == 'J') and len(tagged_keyword[0]) > 1]))
        
        local_tp, local_fp, local_fn = evaluate_keywords(predicted_keywords, labeled_keywords)

        tp += local_tp
        fp += local_fp
        fn += local_fn
    return tp, fp, fn

results = []
for i in tqdm(range(0, 11)):
    tp, fp, fn = test_config_adj_noun(2, 1, 3, 3, float(i)/100 + 0.8)
    results.append({
        'co_occurances': 2,
        'directed': 'Yes',
        'max_keywords': 3,
        'max_keyword_length': 3,
        'damp_factor': float(i)/100 + 0.8,
        'results': {
            'TP': tp,
            'FP': fp,
            'FN': fn
        }
    })

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [13]:
results

[{'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 0.8,
  'results': {'TP': 1442, 'FP': 6199, 'FN': 26921}},
 {'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 0.9,
  'results': {'TP': 1444, 'FP': 6168, 'FN': 26921}},
 {'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 1.0,
  'results': {'TP': 1443, 'FP': 6152, 'FN': 26928}},
 {'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 1.1,
  'results': {'TP': 1433, 'FP': 6137, 'FN': 26934}},
 {'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 1.2000000000000002,
  'results': {'TP': 1437, 'FP': 6132, 'FN': 26932}},
 {'co_occurances': 2,
  'directed': 'Yes',
  'max_keywords': 3,
  'max_keyword_length': 3,
  'damp_factor': 1.3,
  'results': {'TP': 1433, 'FP': 6108, 'FN'

In [17]:
df = pd.DataFrame(columns=['Co-Occurances', 'Directed Graph', 'Top N Keywords', 'Max Keyword Phrase Length', 'Damp Factor', 'True Positives', 'False Positives', 'False Negatives', 'Precision', 'Recall', 'F-Score'])
results_array = np.array(results).flatten()
for i, result in enumerate(results_array):
    if result['directed']:
        directed = 'Yes'
    else:
        directed = 'No'
    precision = result['results']['TP'] / (result['results']['TP'] + result['results']['FP'])
    recall = result['results']['TP'] / (result['results']['TP'] + result['results']['FN'])
    f_score = 2 * precision * recall / (precision + recall)
    df.loc[i] = [
        result['co_occurances'],
        directed,
        result['max_keywords'],
        result['max_keyword_length'],
        result['damp_factor'],
        result['results']['TP'],
        result['results']['FP'],
        result['results']['FN'],
        precision * 100,
        recall * 100,
        f_score
    ]
df

Unnamed: 0,Co-Occurances,Directed Graph,Top N Keywords,Max Keyword Phrase Length,Damp Factor,True Positives,False Positives,False Negatives,Precision,Recall,F-Score
0,2,Yes,3,3,0.8,1442,6199,26921,18.871875,5.084088,0.080102
1,2,Yes,3,3,0.9,1444,6168,26921,18.970047,5.090781,0.080274
2,2,Yes,3,3,1.0,1443,6152,26928,18.999342,5.08618,0.080242
3,2,Yes,3,3,1.1,1433,6137,26934,18.929987,5.051645,0.079751
4,2,Yes,3,3,1.2,1437,6132,26932,18.985335,5.065388,0.079971
5,2,Yes,3,3,1.3,1433,6108,26937,19.002785,5.05111,0.079808
6,2,Yes,3,3,1.4,1421,6103,26941,18.886231,5.010225,0.079195
7,2,Yes,3,3,1.5,1430,6093,26938,19.008374,5.040891,0.079686
8,2,Yes,3,3,1.6,1421,6063,26943,18.987173,5.009872,0.079279
9,2,Yes,3,3,1.7,1429,6044,26942,19.122173,5.036833,0.079734


In [18]:
df.to_excel("damp.xlsx") 