### Overview of the Generated QA Dataset
This notebook gives an overview of the generated QA dataset: its columns and how the questions are distributed across use cases, retrieval operations, taxonomy categories, and required hops.

#### Columns of the Dataset

In [1]:
import os
import pandas as pd

# The CSV is looked up in the 'full' folder under the current working directory.
dataset_path = os.path.join(
    os.getcwd(),
    'full',
    'deep_distributed_graph_dataset.csv',
)
dataset_df = pd.read_csv(dataset_path)

# Show the column names of the loaded dataset as a plain list.
list(dataset_df.columns)

['uid',
 'question',
 'golden_answer',
 'source_ids',
 'golden_doc_chunks',
 'golden_triples',
 'is_generated_with',
 'topic_entity_id',
 'topic_entity_value',
 'hops',
 'based_on_template',
 'use_case',
 'retrieval_operation',
 'semi-typed',
 'updated template',
 'graph_representation',
 'answer_format',
 'answer_type',
 'condition_type',
 'used_in_reduced',
 'notes']

#### Distribution of Semi-Typed Questions

In [3]:
# Number of rows per value of the 'semi-typed' column, most frequent first.
typed_counts = dataset_df.loc[:, 'semi-typed'].value_counts()
typed_counts

semi-typed
True     87
False    83
Name: count, dtype: int64

#### Distribution of Use Cases

In [4]:
# Number of rows per use case, ordered by the use-case value rather than by frequency.
use_case_counts = (
    dataset_df['use_case']
    .value_counts()
    .sort_index()
)
use_case_counts

use_case
1    24
2    24
3    32
4    28
5    33
6    29
Name: count, dtype: int64

#### Distribution of Retrieval Operations

In [5]:
# Number of rows per retrieval operation, most frequent first.
retrieval_operation_counts = dataset_df.loc[:, 'retrieval_operation'].value_counts()
retrieval_operation_counts

retrieval_operation
aggregation     24
counting        24
ranking         24
comparative     24
relationship    24
basic           18
negation        16
superlative     16
Name: count, dtype: int64

#### Distribution of Use Case to Retrieval Operation Pairs

In [6]:
# Long-form table: one row per (use_case, retrieval_operation) pair present in
# the data, with the number of rows for that pair in the 'count' column.
use_case_and_retrieval_operation_counts = (
    dataset_df
    .groupby(['use_case', 'retrieval_operation'])
    .size()
    .reset_index(name='count')
    .sort_values(by=['use_case', 'retrieval_operation'])
)

# Wide-form matrix: retrieval operations as rows, use cases as columns.
# Pairs that never occur come out of the pivot as NaN, so they are filled
# with 0 and the matrix is cast back to integers.
use_case_and_retrieval_operation_matrix = (
    use_case_and_retrieval_operation_counts
    .pivot(index='retrieval_operation', columns='use_case', values='count')
    .fillna(0)
    .astype(int)
)
use_case_and_retrieval_operation_matrix

use_case,1,2,3,4,5,6
retrieval_operation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
aggregation,4,4,4,4,4,4
basic,4,4,4,1,4,1
comparative,4,4,4,4,4,4
counting,4,4,4,4,4,4
negation,0,0,4,4,4,4
ranking,4,4,4,4,4,4
relationship,4,4,4,4,4,4
superlative,0,0,4,3,5,4


#### Distribution of Condition Type - Taxonomy Category

In [7]:
# Number of rows per value of the 'condition_type' column, most frequent first.
condition_type_counts = dataset_df.loc[:, 'condition_type'].value_counts()
condition_type_counts

condition_type
named entity              133
named entity, temporal     37
Name: count, dtype: int64

#### Distribution of Answer Format - Taxonomy Category

In [8]:
# Number of rows per value of the 'answer_format' column, most frequent first.
answer_format_counts = dataset_df.loc[:, 'answer_format'].value_counts()
answer_format_counts

answer_format
enumerative    61
simple         58
explanatory    51
Name: count, dtype: int64

#### Distribution of Graph Representation - Taxonomy Category

In [9]:
# Number of rows per value of the 'graph_representation' column, most frequent first.
graph_representation_counts = dataset_df.loc[:, 'graph_representation'].value_counts()
graph_representation_counts

graph_representation
multi fact     152
single fact     18
Name: count, dtype: int64

#### Distribution of Answer Type - Taxonomy Category

In [10]:
# Number of rows per value of the 'answer_type' column, most frequent first.
answer_type_counts = dataset_df.loc[:, 'answer_type'].value_counts()
answer_type_counts

answer_type
named entity                           86
description, quantitative              24
quantitative                           20
description, quantitative, temporal    13
named entity, temporal                  9
description, named entity, temporal     8
description, named entity               6
other                                   2
boolean                                 2
Name: count, dtype: int64

#### Distribution of Hops Required

In [11]:
# Number of rows per hop count, ordered by the hop value rather than by frequency.
hop_amount_counts = (
    dataset_df['hops']
    .value_counts()
    .sort_index()
)
hop_amount_counts

hops
1      1
2      6
3     24
4      5
5     33
6    101
Name: count, dtype: int64