# CompSem material
Notebook to keep at hand all CompSem related functions that we might need for our final project.
To try these functions we have to use the Uni's JupyterHub server, so they won't work here. This is just a reference place to keep all useful code, preferably with the example outputs taken from executing it on the server.

Proposed workflow:
* Make sure that you are using the latest version of this file
* Upload this file to the server
* Edit, add and run whatever new material you found
* Download the file and save it here, deleting the previous version. If many changes were made, keep the previous version but add the current date to its name, so that it is clearly marked as a deprecated version of the file

## Setup

In [None]:
# imports
# The NLTK resources downloaded here are needed by the tokenizer and the POS tagger used further down.

import nltk
nltk.download('punkt')  # tokenizer models
nltk.download('averaged_perceptron_tagger')  # POS tagger model
# NOTE(review): a __future__ import is only legal as the first statement of a plain .py module;
# keep that in mind if this cell is ever exported as a script.
from __future__ import division
import codecs
import json
from itertools import chain, permutations, combinations
from collections import Counter, defaultdict
import configparser
import os
import random
from textwrap import fill
import scipy
import sys
from copy import deepcopy
import spacy
from annoy import AnnoyIndex
from nltk.parse import CoreNLPParser
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# render matplotlib figures inline in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

from IPython.display import Latex, display

# show up to 250 characters per DataFrame cell instead of truncating long texts
pd.set_option('max_colwidth', 250)

In [None]:
# Load up config file (needs path; adapt env var if necessary); local imports
#
# The config path is taken from the environment variable VISCONF. When that is
# unset or empty, a default location relative to this notebook is tried.
# The paths read from the config are then used to make the project-specific
# modules importable.

config_path = os.environ.get('VISCONF')
if not config_path:
    # try default location, if not in environment
    default_path_to_config = '../../clp-vision/Config/default.cfg'
    if os.path.isfile(default_path_to_config):
        config_path = default_path_to_config

# Explicit check rather than `assert`: it also rejects an empty VISCONF value
# and is not stripped when Python runs with -O.
if not config_path:
    raise RuntimeError('You need to specify the path to the config file via environment variable VISCONF.')

config = configparser.ConfigParser()
with open(config_path, 'r', encoding='utf-8') as f:
    config.read_file(f)

corpora_base = config.get('DEFAULT', 'corpora_base')
preproc_path = config.get('DSGV-PATHS', 'preproc_path')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')


# Helpers that live in the dsgv checkout (located through `dsgv_home`)
sys.path.append(os.path.join(dsgv_home, 'Utils'))
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b
sys.path.append(os.path.join(dsgv_home, 'WACs', 'WAC_Utils'))
from wac_utils import create_word2den, is_relational
sys.path.append(os.path.join(dsgv_home, 'Preproc'))
from sim_preproc import load_imsim, n_most_sim

# Helpers shared inside this project (path is relative to the notebook)
sys.path.append('../Common')
from data_utils import load_dfs, plot_rel_by_relid, get_obj_bb, compute_distance_objs
from data_utils import get_obj_key, compute_relpos_relargs_row, get_all_predicate
from data_utils import compute_distance_relargs_row, get_rel_type, get_rel_instances
from data_utils import compute_obj_sizes_row

### Selecting which DataFrames to use

In [None]:
# Load the preprocessed DataFrames. Slow!
# They are the result of pre-processing the original corpus data,
# as per dsg-vision/Preprocessing/preproc.py
#
# Only the names listed in `df_names` are loaded. Other names that were
# listed here (currently disabled):
#   saiapr_bbdf, mscoco_bbdf, mscoco_catsdf, cococapdf,
#   vgregdf, vgimgdf, vgreldf, vgobjdf,
#   flickr_bbdf, flickr_capdf, flickr_objdf,
#   ade_imgdf, ade_objdf,
#   cub_bbdf, cub_attrdf, cub_partdf, cub_capdf
df_names = [
    'vgpardf',
]
df = load_dfs(preproc_path, df_names)

# A derived DF, containing only those region descriptions which could be
# resolved (disabled; needs 'vgregdf' to be loaded):
# df['vgpregdf'] = df['vgregdf'][df['vgregdf']['pphrase'].notnull() &
#                                (df['vgregdf']['pphrase'] != '')]

# Corpus analysis
Getting an idea of the NL/V characteristics of the corpus: sizes, distributions, general statistics, etc.

In [None]:
# Select the 'vgpardf' entry as the DataFrame inspected in the following cells
current_df = df['vgpardf']

### Description

In [None]:
# Preview the first rows of the selected DataFrame
current_df.head()

In [None]:
# Print a summary of the selected DataFrame (presumably a pandas DataFrame: columns, dtypes, non-null counts)
current_df.info()

### Language Corpus

In [None]:
# Collect the raw paragraph texts as an array and report how many there are.
# `df` is indexed by DataFrame name (see `df['vgpardf']`), so the 'paragraph'
# column has to be read from the selected DataFrame, not from `df` itself.
full_sentences = np.asarray(current_df['paragraph'])
print(len(full_sentences))

In [None]:
# Tokenize the texts: lower-case each one, then split it into word tokens with NLTK
lowercased_texts = (text.lower() for text in full_sentences)
tokenized_sentences = list(map(nltk.tokenize.word_tokenize, lowercased_texts))

In [None]:
# Distribution of text lengths (in tokens), with mean and median marked
sent_lens = [len(sentence) for sentence in tokenized_sentences]
mean = np.mean(sent_lens)
median = np.median(sent_lens)

# Plain matplotlib histogram: `sns` is never imported in this notebook's setup
# cell, so the seaborn call raised a NameError (and `distplot` is deprecated).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(sent_lens, bins='auto')
ax.set(xlabel='sentence length', ylabel='')

# Label the lines themselves, so that the legend entries are tied to the right
# artists instead of depending on the order in which artists were added.
ax.axvline(mean, color='r', linestyle='--', label=f'Mean = {mean.round()}')
ax.axvline(median, color='g', linestyle='--', label=f'Median = {median.round()}')
ax.legend()

plt.show()

In [None]:
# Word frequency analysis: flatten the tokenized texts into a single token list
# and count how often each token occurs
words = [token for sentence in tokenized_sentences for token in sentence]
fdist = nltk.FreqDist(words)

In [None]:
# Hapaxes: tokens that occur exactly once according to the frequency distribution
hapaxes = fdist.hapaxes()
# number of hapaxes (displayed as the cell output)
len(hapaxes)

In [None]:
# Show a table of four independent random draws of ten hapaxes each (one draw per row)
pd.DataFrame([random.sample(hapaxes, 10) for _ in range(4)])

In [None]:
# Getting POS tags: tag every tokenized text and flatten the result into one
# list of (token, tag) pairs.
# `pos_tag_sents` tags all texts in a single call, so the tagger is set up once
# rather than once per text as with repeated `pos_tag` calls.
pos_sents = list(chain.from_iterable(nltk.pos_tag_sents(tokenized_sentences)))

In [None]:
# Split the tagged tokens by word class, keyed on the first letter of the tag:
# 'N' -> nouns, 'J' -> adjectives, 'V' -> verbs. Other tags are ignored.
nouns, adjectives, verbs = [], [], []
buckets = {'N': nouns, 'J': adjectives, 'V': verbs}
for token, tag in pos_sents:
    bucket = buckets.get(tag[0])
    if bucket is not None:
        bucket.append(token)

# The 15 most frequent tokens of each class, as (token, count) pairs
top_n = 15
nouns_mc = nltk.FreqDist(nouns).most_common(top_n)
verbs_mc = nltk.FreqDist(verbs).most_common(top_n)
adjectives_mc = nltk.FreqDist(adjectives).most_common(top_n)

In [None]:
# Pretty plots: one bar chart each for the most frequent nouns, adjectives and verbs
fig, axes = plt.subplots(3)
fig.set_size_inches(15, 10)

# Drawn with matplotlib: `sns` is never imported in this notebook's setup cell,
# so the seaborn calls raised a NameError.
panels = [('Nouns', nouns_mc), ('Adjectives', adjectives_mc), ('Verbs', verbs_mc)]
for ax, (title, most_common) in zip(axes, panels):
    ax.bar([word for word, _ in most_common], [count for _, count in most_common])
    # title each panel so that the three charts can be told apart
    ax.set_title(title)

# keep the panel titles from overlapping the tick labels of the chart above
fig.tight_layout()
plt.show()

### Image Corpus