# Sanity check
In this notebook we check whether any feature in the datasets has an artificially strong correlation with the label, which might explain the very high performance of the model on both the training and the evaluation sets.

In [1]:
%matplotlib inline
# standard library
import itertools
import sys, os

# pandas
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask


# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# tensorflow
import tensorflow as tf

from tensorflow.contrib.hooks import ProfilerHook
# local imports
sys.path.append(os.path.join(os.getcwd(), "../src"))

from tf_utils import make_csv_pipeline, make_csv_col_tensors, csv_dataset, csv_to_tf_types, tfrecord_dataset

# this styling is purely my preference
# less chartjunk
# Everything goes through a single sns.set(...) call: calling sns.set() *after*
# sns.set_context() re-applies the default context (font_scale=1) and silently
# discards the earlier context settings. The matplotlib rc key is
# 'lines.linewidth' (plural); 'line.linewidth' is not a recognised key.
sns.set(context='notebook', style='ticks', palette='Set2',
        font_scale=1.5, rc={'lines.linewidth': 2.5})

In [2]:
# Load every CSV matching dom-*.csv under the cleaneval directory
# into a single dask dataframe.
ddf = dd.read_csv(
    '../data/final/cleaneval/dom-*.csv',
)

In [3]:
# compute the pearson correlation matrix (highly expensive operation)
# must do it on a sample, it tanks the memory on the entire dataset
#
# The fraction is passed as `frac=`: dask's `sample` does not support a row
# count, and a positional argument is bound to `n` in current dask releases.
# `random_state` pins the sample so the correlations below are reproducible.
df = (
    ddf.drop(['url', 'path'], axis=1)  # presumably non-numeric identifier columns
    .sample(frac=0.4, random_state=42)
    .astype(float)
    .compute()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,depth,sibling_pos,no_classes,id_len,class_len,no_children,text_len,descendant1_no_nodes,descendant1_no_children_avg,descendant1_id_len_avg,...,ancestor5_tag_content_area,ancestor5_tag_layer,ancestor5_tag_defanged_meta,ancestor5_tag_storysumm,ancestor5_tag_beginlock,ancestor5_tag_endlock,ancestor5_tag_dt,ancestor5_tag_noedit,ancestor5_tag_small,content_label
0,5.0,0.0,0.0,0.0,0.0,1.0,20.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.0,1.0,0.0,0.0,0.0,1.0,25.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.0,0.0,0.0,0.0,0.0,1.0,1096.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Pairwise correlation between all columns of the sampled frame
# (Pearson is the pandas default; spelled out here for the reader).
corr = df.corr(method='pearson')

In [5]:
# Rank the columns by the magnitude of their correlation with the label,
# strongest first; columns whose correlation is NaN are left out.
label_corr = corr['content_label'].dropna()
label_corr.abs().sort_values(ascending=False)

content_label                1.000000
tag_p                        0.394501
ancestor2_text_len           0.179105
tag_br                       0.157079
ancestor3_text_len           0.154664
ancestor1_text_len           0.151503
ancestor4_text_len           0.117555
tag_font                     0.106212
tag_img                      0.101715
tag_b                        0.091179
tag_a                        0.090687
tag_td                       0.088709
ancestor1_tag_tr             0.088302
tag_h3                       0.086527
ancestor5_text_len           0.085796
ancestor1_tag_blockquote     0.084335
descendant1_tags_b           0.081654
ancestor2_tag_table          0.080805
ancestor1_no_children        0.079205
descendant1_tags_font        0.074444
tag_html_comment             0.073805
tag_h2                       0.072860
descendant1_tags_i           0.070126
sibling_pos                  0.066303
ancestor4_depth              0.065758
ancestor5_depth              0.065136
ancestor3_de

Also, check that there is no overlap between the validation, test and training sets.

In [3]:
# One dask dataframe per split, in the order train / validation / test.
train_ddf, valid_ddf, test_ddf = (
    dd.read_csv(pattern)
    for pattern in (
        '../data/final/cleaneval/dom-full-train-*.csv',
        '../data/final/cleaneval/dom-full-validation-*.csv',
        '../data/final/cleaneval/dom-full-test-*.csv',
    )
)

# Materialise the distinct values of the `url` column for each split.
train_urls, valid_urls, test_urls = (
    split_ddf['url'].unique().compute()
    for split_ddf in (train_ddf, valid_ddf, test_ddf)
)

In [11]:
# The note above asks for no overlap between *all three* splits, so verify
# every pair instead of only train/test; a shared URL would mean the same page
# is present in more than one split.
train_set, valid_set, test_set = set(train_urls), set(valid_urls), set(test_urls)
assert not (train_set & valid_set), 'train/validation URL overlap'
assert not (valid_set & test_set), 'validation/test URL overlap'
# Displayed result: URLs shared by train and test (expected to be empty).
train_set & test_set

set()

In [None]:
# NOTE(review): this cell has no execution count, i.e. it was never run. It
# rebinds `ddf` and points at a directory rather than a file glob like the
# other read_csv calls in this notebook — confirm the intended path or remove
# the cell.
ddf = dd.read_csv('../data/final/')