<a href="https://colab.research.google.com/github/Willyoung2017/Team_Hotpot_11777/blob/main/Data-Analysis/TextVQA_TextAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# further down in this notebook live under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

In [20]:
import json
import csv
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sb
import spacy

In [21]:
# Load English language pretrained model from spacy
# `nlp` is the shared pipeline object: later cells call nlp.pipe(...) on the
# question and answer strings to turn them into spaCy docs
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load json data
# Note: may need to edit the file paths
# The files are opened with an explicit UTF-8 encoding so that decoding does not
# depend on the locale default of the machine running the notebook
with open('/content/drive/MyDrive/11777/TextVQA_0.5.1_val.json', encoding='utf-8') as val, \
    open('/content/drive/MyDrive/11777/TextVQA_0.5.1_train.json', encoding='utf-8') as train, \
    open('/content/drive/MyDrive/11777/TextVQA_0.5.1_test.json', encoding='utf-8') as test:
    val_list = json.load(val)
    train_list = json.load(train)
    test_list = json.load(test)

In [4]:
# Flatten the records stored under the 'data' key of each loaded json object
# into one pandas df per split (val, train, test -- in that order)
val_data, train_data, test_data = (
    pd.json_normalize(raw_split, record_path='data')
    for raw_split in (val_list, train_list, test_list)
)

In [5]:
# val_data.info()

In [13]:
# For this analysis, select text-related columns from the _data dfs to create _text dfs
# .copy() makes every _text df an independent frame: later cells add columns to
# these dfs, and assigning into a bare selection of another df raises pandas'
# SettingWithCopyWarning
# The test split is selected without the 'answers' column
labelled_text_cols = ['question_id', 'question', 'question_tokens', 'answers', 'set_name']
unlabelled_text_cols = ['question_id', 'question', 'question_tokens', 'set_name']
val_text = val_data[labelled_text_cols].copy()
train_text = train_data[labelled_text_cols].copy()
test_text = test_data[unlabelled_text_cols].copy()

In [None]:
# Add a column on question length, and another column that caps question length at 20
for text_df in [val_text, train_text, test_text]:
    # question_len = number of entries in the question_tokens list of each row
    text_df['question_len'] = text_df.question_tokens.apply(len)
    # Vectorised cap: lengths above 20 become 20, everything else is unchanged
    # (same values as a row-by-row "x if x <= 20 else 20", without a Python call per row)
    text_df['capped_question_len'] = text_df['question_len'].clip(upper=20)

In [None]:
# Count the entries in each question's answers list and store it as answer_num
# (val and train only -- test_text is built without an answers column)
for answered_df in (val_text, train_text):
    answered_df['answer_num'] = answered_df['answers'].map(len)

In [None]:
# This cell and the next confirm that each question has 10 answers:
# in the summary table, the min and max rows of answer_num should both read 10
val_text.describe()

In [None]:
# Same summary for the training split (check the answer_num min/max here too)
train_text.describe()

In [22]:
# Run every question string of each split through the spaCy pipeline and keep
# the resulting docs in a list (one list per split: val, train, test)
val_questions, train_questions, test_questions = (
    list(nlp.pipe(split_df.question))
    for split_df in (val_data, train_data, test_data)
)

In [None]:
# type(val_questions[0])

In [18]:
# Spread each question's answers list across columns (one answer per column),
# giving the _10_answers dfs
val_10_answers = val_text.answers.apply(pd.Series)
train_10_answers = train_text.answers.apply(pd.Series)
# Melt to long format: the expanded_ dfs hold every answer in a single `value`
# column, next to a `variable` column with the label of the column it came from
expanded_val_answers = pd.melt(val_10_answers)
expanded_train_answers = pd.melt(train_10_answers)

In [None]:
# This function converts an entire df to a list of docs
def df_to_docs(df):
    """Run every column of ``df`` through ``nlp.pipe`` and return all docs in one list.

    Columns are processed in the order ``df.items()`` yields them, so the
    result holds all docs of the first column, then all docs of the second,
    and so on.
    """
    return [doc for _, column in df.items() for doc in nlp.pipe(column)]

In [None]:
# Turn the val and train answer dfs into flat lists of spaCy docs
val_answers, train_answers = (
    df_to_docs(answers_df) for answers_df in (val_10_answers, train_10_answers)
)

In [None]:
# print(len(val_answers))
# print(val_10_answers.shape)

In [None]:
# This function extracts the token data of interest from a doc
def extract_tokens_plus_meta(doc):
    """Return one ``(text, i, pos_)`` tuple for each token yielded by ``doc``."""
    token_rows = []
    for token in doc:
        token_rows.append((token.text, token.i, token.pos_))
    return token_rows

In [None]:
# This function uses a list of docs to tabulate token data info of interest to a df
def tab_token_data(docs):
    """Extract tokens and metadata from list of spaCy docs.

    Returns a DataFrame with one row per token and the columns
    ``doc_id`` (position of the doc in ``docs``), ``token``, ``token_order``
    and ``pos`` (the three values produced by ``extract_tokens_plus_meta``).
    The row index restarts at 0 for every doc.

    Docs that yield no tokens contribute no rows, and an empty ``docs`` list
    gives an empty DataFrame that still has the four columns.
    """
    cols = [
        "doc_id", "token", "token_order", "pos"
    ]

    # Collect plain tuples and build a single DataFrame at the end instead of
    # creating (and concatenating) one small DataFrame per doc.
    rows = []
    row_index = []
    for ix, doc in enumerate(docs):
        for position, token_meta in enumerate(extract_tokens_plus_meta(doc)):
            rows.append((ix, *token_meta))
            # Position of the token within its own doc; used as the row label
            row_index.append(position)

    return pd.DataFrame(rows, columns=cols, index=row_index)

In [None]:
# Tabulate token data for the question docs of every split ...
val_q_token_data, train_q_token_data, test_q_token_data = (
    tab_token_data(question_docs)
    for question_docs in (val_questions, train_questions, test_questions)
)
# ... and for the answer docs of the val and train splits
val_a_token_data, train_a_token_data = (
    tab_token_data(answer_docs) for answer_docs in (val_answers, train_answers)
)

In [None]:
# val_q_token_data.query("pos != 'PUNCT'").pos.value_counts()

In [None]:
# For each dataset, analyze questions and create graphs and metrics files
# Specifically, question length, the most frequent questions, question starting word, and the most frequent word classes in questions

def analyze_questions(text_df, token_data, prefix, path):
    """Write question metrics and four question plots for one dataset split.

    Every output file is named ``path + prefix + <fixed name>``; the strings
    are concatenated as-is, so ``path`` has to bring its own trailing separator.

    Files written:
      * ``question_metrics.txt`` - mean, median and std dev of ``question_len``
      * ``question_length.png`` - histogram of ``capped_question_len``
      * ``10_most_frequent_questions.png``
      * ``distribution_of_10_most_frequent_question_starting_words.png``
      * ``distribution_of_10_most_frequent_word_classes_in_questions.png``

    Args:
        text_df: DataFrame with ``question``, ``question_len`` and
            ``capped_question_len`` columns.
        token_data: DataFrame with ``token``, ``token_order`` and ``pos``
            columns (one row per token).
        prefix: String put in front of every output file name.
        path: String put in front of ``prefix``.

    Each figure is closed right after it is saved, so repeated calls do not
    accumulate open matplotlib figures.
    """

    def save_and_close(fig, file_name):
        # Save through the Figure object rather than pyplot's "current figure",
        # then release it.
        fig.savefig(path + prefix + file_name)
        plt.close(fig)

    question_len = text_df['question_len']
    metrics = [
        ('avg question length: ', question_len.mean()),
        ('median question length: ', question_len.median()),
        ('std dev of question length: ', question_len.std()),
    ]
    # newline='' is what the csv module asks for on files handed to csv.writer;
    # without it, platforms that translate newlines write an extra '\r' per row.
    with open(path + prefix + 'question_metrics.txt', 'w', newline='') as metrics_file:
        tsv_writer = csv.writer(metrics_file, delimiter='\t')
        for label, value in metrics:
            tsv_writer.writerow([label + "%.2f" % value])

    # Histogram of question length (capped column, 20 bins over 0..20)
    fig, ax = plt.subplots(figsize=(14, 7))
    text_df.capped_question_len.hist(ax=ax, range=(0, 20), color="red", alpha=.4, bins=20)
    save_and_close(fig, 'question_length.png')

    # The 10 most frequent question strings, by raw count
    fig, ax = plt.subplots(figsize=(30, 14))
    text_df.question.value_counts().head(10).plot(kind="barh", ax=ax, color='green', alpha=.7)
    ax.tick_params(axis='both', labelsize=15)
    save_and_close(fig, '10_most_frequent_questions.png')

    # Share of each first token (token_order == 0) among all first tokens
    first_word = token_data[token_data['token_order'] == 0].token.value_counts()
    first_word = first_word / first_word.sum()
    fig, ax = plt.subplots(figsize=(24, 14))
    first_word.head(10).plot(kind="barh", ax=ax, alpha=.7)
    ax.invert_yaxis()  # most frequent entry at the top
    save_and_close(fig, 'distribution_of_10_most_frequent_question_starting_words.png')

    # Share of each part-of-speech tag among the tokens not tagged PUNCT
    word_token = token_data.query("pos != 'PUNCT'").pos.value_counts()
    word_token = word_token / word_token.sum()
    fig, ax = plt.subplots(figsize=(24, 14))
    word_token.head(10).plot(kind="barh", ax=ax, color='orange', alpha=.7)
    ax.invert_yaxis()  # most frequent entry at the top
    save_and_close(fig, 'distribution_of_10_most_frequent_word_classes_in_questions.png')
    
    
# Note: may need to edit the file path
path = '/content/drive/MyDrive/11777/'
# One (file-name prefix, _text df, question token df) entry per split
question_splits = [
    ('val_', val_text, val_q_token_data),
    ('train_', train_text, train_q_token_data),
    ('test_', test_text, test_q_token_data),
]
for split_prefix, split_text, split_tokens in question_splits:
    analyze_questions(split_text, split_tokens, split_prefix, path)

In [None]:
# For each dataset, analyze answers and create graphs and metrics files
# Specifically, answer length, the most frequent answers, and the most frequent word classes in answers

def analyze_answers(expanded_answers, text_df, token_data, prefix, path):
    """Write answer metrics and two answer plots for one dataset split.

    Every output file is named ``path + prefix + <fixed name>``; the strings
    are concatenated as-is, so ``path`` has to bring its own trailing separator.

    Files written:
      * ``answer_metrics.txt`` - mean, median and std dev of the number of
        token rows per ``doc_id``
      * ``10_most_frequent_answers.png``
      * ``distribution_of_10_most_frequent_word_classes_in_answers.png``

    Args:
        expanded_answers: DataFrame with a ``value`` column holding one answer
            per row.
        text_df: Not used by this function; kept so existing calls keep working.
        token_data: DataFrame with ``doc_id`` and ``pos`` columns (one row per
            token).
        prefix: String put in front of every output file name.
        path: String put in front of ``prefix``.

    Each figure is closed right after it is saved, so repeated calls do not
    accumulate open matplotlib figures.
    """

    def save_and_close(fig, file_name):
        # Save through the Figure object rather than pyplot's "current figure",
        # then release it.
        fig.savefig(path + prefix + file_name)
        plt.close(fig)

    # Tokens per answer doc, computed once and reused for all three metrics
    answer_len = token_data.groupby('doc_id').size()
    metrics = [
        ('avg answer length: ', answer_len.mean()),
        ('median answer length: ', answer_len.median()),
        ('std dev of answer length: ', answer_len.std()),
    ]
    # newline='' is what the csv module asks for on files handed to csv.writer;
    # without it, platforms that translate newlines write an extra '\r' per row.
    with open(path + prefix + 'answer_metrics.txt', 'w', newline='') as metrics_file:
        tsv_writer = csv.writer(metrics_file, delimiter='\t')
        for label, value in metrics:
            tsv_writer.writerow([label + "%.2f" % value])

    # Count answers from the frame passed in by the caller, so every split gets
    # its own plot (reading a module-level frame here would plot the same data
    # for every split).
    counts = expanded_answers.value.value_counts()
    # Scaled by 10 -- presumably the number of answers per question; confirm
    counts = counts / 10
    fig, ax = plt.subplots(figsize=(32, 14))
    counts.head(10).plot(kind="barh", ax=ax, color='green', alpha=.7)
    ax.tick_params(axis='both', labelsize=10)
    save_and_close(fig, '10_most_frequent_answers.png')

    # Share of each part-of-speech tag among the tokens not tagged PUNCT
    word_token = token_data.query("pos != 'PUNCT'").pos.value_counts()
    word_token = word_token / word_token.sum()
    fig, ax = plt.subplots(figsize=(24, 14))
    word_token.head(10).plot(kind="barh", ax=ax, color='orange', alpha=.7)
    ax.invert_yaxis()  # most frequent entry at the top
    save_and_close(fig, 'distribution_of_10_most_frequent_word_classes_in_answers.png')
    
    
# Note: may need to edit the file path
path = '/content/drive/MyDrive/11777/'
# One (file-name prefix, long-format answers df, _text df, answer token df) entry per split
answer_splits = [
    ('val_', expanded_val_answers, val_text, val_a_token_data),
    ('train_', expanded_train_answers, train_text, train_a_token_data),
]
for split_prefix, split_answers, split_text, split_tokens in answer_splits:
    analyze_answers(split_answers, split_text, split_tokens, split_prefix, path)

In [None]:
# spacy.explain('PART')