In [1]:
# First meeting about Sherlock: taking a closer look at the Sherlock dataset.

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# reloaded automatically before each cell's code is executed.
%load_ext autoreload
%autoreload 2

In [3]:
# Standard library
from datetime import datetime
from os.path import join


# Third-party
import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile


# sherlock package
from sherlock import helpers
from sherlock.features.preprocessing import (
    convert_string_lists_to_lists,
    load_parquet_values,
)
# NOTE(review): wildcard import -- it is not visible from this notebook which names
# it brings into the namespace; consider importing the required names explicitly.
from sherlock.functional import *

import altair as alt
# Select the altair renderer registered under the name 'default'. Being the last
# expression of the cell, the call's return value is shown as the cell output.
alt.renderers.enable('default')

# Disabled import, kept for reference:
# from functional import pseq

RendererRegistry.enable('default')

# Download the data
If the Sherlock data has not been downloaded yet, it is downloaded into the `../data/data/` directory.

In [4]:
# Fetch the raw Sherlock data through the project's helper; the recorded output of
# this cell reports ../data/data/ as the download target.
helpers.download_data()

Downloading the raw data into ../data/data/.
Data was downloaded.


# Loading the datasets

In [5]:
# Directories holding the raw parquet files and the processed data.
path_raw = "../data/data/raw/"
path_processed = "../data/data/processed/"

# Load every raw values file with the project's load_parquet_values helper and
# time the whole load.
start = datetime.now()
temp_data = load_parquet_values(join(path_raw, "temporary.parquet"))
test_data = load_parquet_values(join(path_raw, "test_values.parquet"))
train_data = load_parquet_values(join(path_raw, "train_values.parquet"))
validation_data = load_parquet_values(join(path_raw, "val_values.parquet"))

# A timedelta renders as H:MM:SS.ffffff, which is misleading in front of the word
# "seconds"; convert it to a plain number of seconds so the unit is correct.
elapsed_seconds = (datetime.now() - start).total_seconds()
print(f'Successfully loaded the data in {elapsed_seconds:.2f} seconds')

Successfully loaded the data in 0:00:07.197848 seconds


# Analysing the datasets
First, inspecting the loaded parquet values in the debugger shows that they are of type `pyarrow.ChunkedArray`.

In [6]:
# Report the number of entries in each loaded dataset, one "name: length" line each.
print("\n".join(
    f"{label}: {len(dataset)}"
    for label, dataset in (
        ("temp_data", temp_data),
        ("test_data", test_data),
        ("train_data", train_data),
        ("validation_data", validation_data),
    )
))

temp_data: 2
test_data: 137353
train_data: 412059
validation_data: 137353


## Parquet file inspection

In [7]:
# Open the raw test-values parquet file so that its metadata and schema can be inspected.
filename = "test_values.parquet"
pf = ParquetFile(join(path_raw, filename))

In [8]:
# Keep the file-level metadata of the opened parquet file and display it.
metadata = pf.metadata
metadata

<pyarrow._parquet.FileMetaData object at 0x7f61c549c470>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 2
  num_rows: 137353
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2099

In [9]:
# Show the metadata of row group 0 (the file metadata above reports a single row group).
metadata.row_group(0)

<pyarrow._parquet.RowGroupMetaData object at 0x7f61c549cb30>
  num_columns: 2
  num_rows: 137353
  total_byte_size: 92885782

In [10]:
# Display the parquet schema of the opened file.
pf.schema

<pyarrow._parquet.ParquetSchema object at 0x7f61c54ab050>
required group field_id=0 schema {
  optional binary field_id=1 values (String);
  optional int64 field_id=2 __index_level_0__;
}

In [11]:
# Read row group 0 and pull out its two columns: the stringified value lists and
# the original index stored under '__index_level_0__'.
row_df = pf.read_row_group(0)
values, indices = row_df['values'], row_df['__index_level_0__']

In [12]:
# Peek at the first three entries of the values column.
values.slice(0, 3)

<pyarrow.lib.ChunkedArray object at 0x7f61c54a77d0>
[
  [
    "['Central Missouri', 'unattached', 'unattached', 'Kansas State University', 'unattached', 'North Dakota State', 'Nike']",
    "[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, 84, 90, 76, 93, 70, 80, 80, 82]",
    "['Katie Crews', 'Christian Hiraldo', 'Alex Estrada', 'Fredy Peltroche', 'Xavier Perez', 'Gustavo Larrosa', 'Jose Montano', 'Angel Cruz (7)', 'J Acosta']"
  ]
]

In [13]:
# Peek at the first three entries of the index column.
indices.slice(0,3)

<pyarrow.lib.ChunkedArray object at 0x7f61c54a7bf0>
[
  [
    20368,
    664102,
    366813
  ]
]

## Load values and labels using pandas dataframes for further investigation

In [14]:
start = datetime.now()
print(f'Started at {start}')

# Split to load; the file names are built as '<values_type>_values.parquet' and
# '<values_type>_labels.parquet'.
values_type = 'train'

# Raw data
values = pd.read_parquet(join(path_raw, f'{values_type}_values.parquet'))
labels = pd.read_parquet(join(path_raw, f'{values_type}_labels.parquet'))

# Training and validation of Sherlock are done using a flat array of the types, so
# the labels frame is flattened here (the same applies to the processed features
# of the values).
labels_flatten = labels.values.flatten()

# Report the split that was actually loaded rather than a hard-coded name, and print
# the duration as a number of seconds (a raw timedelta renders as H:MM:SS.ffffff).
elapsed_seconds = (datetime.now() - start).total_seconds()
print(f'Load data ({values_type}) process took {elapsed_seconds:.2f} seconds.')

# TODO (important for processing GitTables later on, not right now): check that the
# distinct column dtypes of the processed features are all float32, e.g.
# set(values_processed.dtypes).

Started at 2022-05-30 11:23:37.068351
Load data (train) process took 0:00:10.726090 seconds.


## Dataset investigation

In [15]:
# Prepare the data shown in the graphs: pair every label with its column values by
# joining the two frames on their index.
joined_df = labels.merge(values, left_index=True, right_index=True)

# Type occurrence: number of samples per type, as a two-column frame for plotting.
type_occurrence_series = joined_df['type'].value_counts()
type_occurrence_df = pd.DataFrame(
    {
        'type': type_occurrence_series.index,
        'count': type_occurrence_series.to_numpy(),
    }
)

In [16]:
# Preview the first ten rows of the merged labels/values frame.
joined_df.head(10)

Unnamed: 0,type,values
55030,area,"['Global', 'United States', 'Australia']"
167000,collection,"['Fiction, Adult - Non-Floating', 'Fiction, Ad..."
638282,team Name,"['', '', 'University of Puerto Rico - Rio Pied..."
232298,credit,"['Laughology', 'MTV', 'With Intent to Kill', '..."
316158,gender,"['Mare', 'Gelding', 'Gelding', 'Gelding', 'Gel..."
467776,position,"['V.P., General Counsel & Sec.', 'V.P., Genera..."
149640,club,"['GAJA', 'OREG', 'UCS', 'WCM', 'SLAM', 'ARIZ',..."
23556,affiliation,"['Applied Mathematics, University of Notre Dam..."
263802,description,['wakeup time in seconds for pbid to run its c...
476881,position,"[35.0, 4.0, 52.0, 0.0, 30.0, 64.0, 84.0, None]"


In [17]:
# Bar chart of the number of samples per semantic type, most frequent type first.
(
    alt.Chart(type_occurrence_df)
    .mark_bar(size=8)
    .encode(
        x=alt.X(
            'type:O',
            title='Semantic Types',
            sort=alt.EncodingSortField(field="count", order="descending"),
        ),
        y=alt.Y('count', title='Number of Samples'),
    )
    .properties(width=800, height=200)
)

In [18]:
# joined_df.agg({'values': ['sum', 'min']})

## Combining the datasets to validate paper information

In [19]:
# Raw data: read the values frame and the labels frame of every split.
test_values, test_labels = (
    pd.read_parquet(join(path_raw, f'test_{part}.parquet')) for part in ('values', 'labels')
)
train_values, train_labels = (
    pd.read_parquet(join(path_raw, f'train_{part}.parquet')) for part in ('values', 'labels')
)
validation_values, validation_labels = (
    pd.read_parquet(join(path_raw, f'val_{part}.parquet')) for part in ('values', 'labels')
)

# Join labels and values of each split on their index.
test_joined_df = test_labels.merge(test_values, left_index=True, right_index=True)
train_joined_df = train_labels.merge(train_values, left_index=True, right_index=True)
validation_joined_df = validation_labels.merge(validation_values, left_index=True, right_index=True)

# Stack the three splits; the dict keys become the outer index level 'usage' and the
# original row index becomes the inner level 'idx'.
combined_df_pieces = {"train": train_joined_df, "validation": validation_joined_df, "test": test_joined_df}
combined_df = pd.concat(combined_df_pieces, names=['usage', 'idx'])

# Separate per-split type-occurrence frames are not needed: the counts can be derived
# from combined_df directly, e.g.
# combined_df.groupby('type').count().reset_index()

In [20]:
# Number of samples per type over all splits, ordered from most to least frequent.
chart_type_df = (
    combined_df.groupby('type')
    .count()
    .reset_index()
    .sort_values(by=['values'], axis=0, ascending=False)
    .reset_index(drop=True)
)

# Every fourth type (in that order) is used for the axis values and gets the
# highlight colour.
shown_types = chart_type_df['type'].to_list()[::4]
print(shown_types)

(
    alt.Chart(chart_type_df)
    .mark_bar(size=10)
    .encode(
        x=alt.X(
            'type:O',
            title='Semantic Types',
            sort=alt.EncodingSortField(field="values", order="descending"),
            axis=alt.Axis(values=shown_types),
        ),
        y=alt.Y('values', title='Number of Samples'),
        color=alt.condition(
            alt.Predicate(alt.FieldOneOfPredicate(field='type', oneOf=shown_types)),
            alt.value('orange'),
            alt.value('steelblue'),
        ),
    )
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .properties(width=800, height=200)
)

['address', 'description', 'name', 'sex', 'type', 'city', 'code', 'format', 'notes', 'symbol', 'plays', 'isbn', 'gender', 'credit', 'range', 'collection', 'ranking', 'file Size', 'sales', 'director']


In [21]:
# alt.Chart(combined_type_occurrence_df).mark_bar(size=8).encode(
#     x = alt.X('type:O',
#               title = 'Semantic Types',
#               sort=alt.EncodingSortField(
#                 field="count",  
#                 order="descending")),
#     y = alt.Y('count', title='Number of Samples')    
# ).properties(width=800,height=200)

In [22]:
# Per-type sample counts, with bars coloured by the split ('usage') the samples belong to.
(
    alt.Chart(combined_df.groupby(['usage', 'type']).count().reset_index())
    .mark_bar(size=8)
    .encode(
        # facet='usage',  (disabled)
        x=alt.X(
            'type',
            title='Semantic Types',
            sort=alt.EncodingSortField(field="values", order="descending"),
        ),
        y=alt.Y('sum(values)', title='Number of Samples'),
        color='usage',
    )
    .properties(width=800, height=200)
)

In [23]:
# Table of sample counts with one row per usage and one column per type; the columns
# are ordered by the values of the 'train' row, largest first.
(
    combined_df
    .groupby(['usage', 'type'])
    .count()
    .unstack()
    .sort_values(by='train', axis=1, ascending=False)
)

Unnamed: 0_level_0,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values
type,type,rank,class,location,company,age,county,weight,club,sex,...,religion,file Size,education,sales,capacity,requirement,organisation,director,continent,affiliate
usage,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
train,9088,9058,9057,9056,9048,9041,9029,9024,9022,9017,...,1013,1009,1006,1004,976,954,852,742,737,584
validation,3003,2959,2972,2995,2911,2926,3012,3013,3001,2986,...,313,365,304,295,304,289,243,246,253,180
test,2909,2983,2971,2949,3041,3033,2959,2963,2977,2997,...,340,361,313,322,362,300,262,225,227,204
