### Import Modules

In [1]:
import pandas as pd
import numpy as np
import spacy
import string
from scipy import stats
from matplotlib import pyplot as plt
from progress.bar import Bar

### Data Preparation

In [2]:
# filepath = '../data/mytrain.csv'
filepath = 'toy_set.csv' # a small set of 2000 questions for testing
df_data = pd.read_csv(filepath)

nlp = spacy.load('en_core_web_sm')

PUNCT_DICT = {'all_punctuation': string.punctuation, 'commas': ',', \
'periods': '.', 'quotation_marks': '\'\"', 'question_marks': '?', \
'exclamation_marks': '!', 'other_punctuations': [s for s in string.punctuation if s not in ',.\'\"?!']}

POS_LIST = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB', 'ADP', 'AUX', \
'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ', 'SYM', 'X']
# Reference: https://universaldependencies.org/u/pos/

ENT_LIST = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', \
'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY',
'ORDINAL', 'CARDINAL']
#reference: https://spacy.io/api/annotation#section-named-entities

### Build dictionaries for statistical information for one sentence

- Currently this covers 3 aspects: punctuation, pos, and entities
- The statistical information are all counts of occurrances 
    - **Might need more statistics?**
- The lists of pos and entity types are cited below
- Note that spacy provides a PUNCT tag in the pos-tagger, but here we define finer-grained categorization

In [3]:
def build_count_dict(sentence):
	""" Return count dictionaries for sentence mapping from labels to the count of words
	that satisfies a corresponding condition: 
	1 - char in char_type (punctuation)
	2 - taken.pos_ == pos (part of speech)
	3 - ent.label_ == ent (named entites)
	"""
	punc = {key: 0 for key in PUNCT_DICT.keys()}
	pos = {pos: 0 for pos in POS_LIST}
	ent = {ent: 0 for ent in ENT_LIST}

	doc = nlp(sentence)
	ents = doc.ents

	for word in sentence:
		for key, value in PUNCT_DICT.items():
			if word in value:
				punc[key] += 1

	for token in doc:
		if token.pos_ in POS_LIST:
			pos[token.pos_] += 1

	for e in ents:
		if e.label_ in ENT_LIST:
			ent[e.label_] += 1

	return punc, pos, ent

### Collect statistical data for all sentences

In [4]:
def data_collection(dataframe):
	""" Return statistical data of sentences with label, which is 0 for negative
	and 1 for positive. 
	"""
	sentences = dataframe['question_text'].values
	# punctuations
	punc = dict((key, []) for key in PUNCT_DICT.keys())
	# punc_count = dict((key, 0) for key in PUNCT_DICT.keys())
	pos = dict((pos, []) for pos in POS_LIST)
	# pos_count = dict((pos, 0) for pos in POS_LIST)
	ent = dict((ent, []) for ent in ENT_LIST)

	data_container = [punc, pos, ent]

	bar = Bar("Collecting data over sentences", max=len(sentences))
	for s in sentences:
		# punctuations
		punc_dict, pos_dict, ent_dict = build_count_dict(s)

		data = [punc_dict, pos_dict, ent_dict]

		for i in range(len(data)):
			for key, value in data[i].items():
				data_container[i][key].append(value)

		bar.next()
	bar.finish()
    
	for container in data_container:
		for key, value in container.items():
			dataframe[key] = pd.Series(value, index=dataframe.index) 

### Two Sample KS Testing

The purpose is to extract features of which the distributions in positive and negative datasets are significantly different. 

P-value threshold: 0.01 (standard for two-tailed test)

In [6]:
def ks_test(set1, set2, theme):
	""" Conduct KS test to compare set1 and set2. Print the results and return
	True iff set1 and set2 are significantly different at 0.001 level.  

	Theme is a text label for the comparison. 
	"""

	ks_test_score, ks_p_value = stats.ks_2samp(set1, set2)

	print("===== KS test for {} =====".format(theme))

	print("KS statistic: {}\np-value: {}".format(ks_test_score, ks_p_value))

	# Since it is a two-tailed test, the difference is considered significant
	# when p value is smaller thatn 0.01
	if ks_p_value < 0.01:
		print("The two distributions are significantly different. ")
		return True

	return False

### Executing the methods: main()

Getting raw data from data_collection function

In [None]:
	data_collection(df_data)
	df_positive, df_negative = df_data[df_data['target']==1], df_data[df_data['target'] == 0]

Containers for punctuation marks/PoS/entities counts

In [None]:
features = {'punctuation':[PUNCT_DICT.keys(), []], 'pos_tag':[POS_LIST, []], 'ent':[ENT_LIST, []]}

Getting statistical info

In [None]:
for key, value in features.items():
		for label in value[0]:
			if ks_test(df_positive[label].values, df_negative[label].values, label):
				value[1].append(label)
		df = df_data[value[1]]
		df['target'] = df_data['target']
# 		filename = '{}.csv'.format(key)
# 		df.to_csv(filename, index=0)

print test results

In [None]:
for key, value in features.items():
		print('{} test results: {}'.format(key, value[1]))

In [None]:
===== KS test for all_punctuation =====
KS statistic: 0.20649401214523827
p-value: 0.0
The two distributions are significantly different. 
===== KS test for commas =====
KS statistic: 0.14935060881751216
p-value: 0.0
The two distributions are significantly different. 
===== KS test for periods =====
KS statistic: 0.060830283120621775
p-value: 1.1036355412102202e-195
The two distributions are significantly different. 
===== KS test for quotation_marks =====
KS statistic: 0.0896970770949378
p-value: 0.0
The two distributions are significantly different. 
===== KS test for question_marks =====
KS statistic: 0.0675250841708418
p-value: 4.996541012816196e-241
The two distributions are significantly different. 
===== KS test for exclamation_marks =====
KS statistic: 0.005223242826926211
p-value: 0.07267448091854482
===== KS test for other_punctuations =====
KS statistic: 0.044723076204157164
p-value: 5.71709290637383e-106
The two distributions are significantly different. 
corpus_stats.py:134: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['target'] = df_data['target']
===== KS test for ADJ =====
KS statistic: 0.1935908580556876
p-value: 0.0
The two distributions are significantly different. 
===== KS test for ADV =====
KS statistic: 0.2741942028242801
p-value: 0.0
The two distributions are significantly different. 
===== KS test for INTJ =====
KS statistic: 0.005810648571668464
p-value: 0.03306683461969816
===== KS test for NOUN =====
KS statistic: 0.07901515248192403
p-value: 0.0
The two distributions are significantly different. 
===== KS test for PROPN =====
KS statistic: 0.19957844130372066
p-value: 0.0
The two distributions are significantly different. 
===== KS test for VERB =====
KS statistic: 0.23490756135964508
p-value: 0.0
The two distributions are significantly different. 
===== KS test for ADP =====
KS statistic: 0.1425457752970124
p-value: 0.0
The two distributions are significantly different. 
===== KS test for AUX =====
KS statistic: 0.0
p-value: 1.0
===== KS test for CCONJ =====
KS statistic: 0.14024186182060316
p-value: 0.0
The two distributions are significantly different. 
===== KS test for DET =====
KS statistic: 0.0981772568175312
p-value: 0.0
The two distributions are significantly different. 
===== KS test for NUM =====
KS statistic: 0.028908604027122253
p-value: 1.5941384267652604e-44
The two distributions are significantly different. 
===== KS test for PART =====
KS statistic: 0.09337191361933339
p-value: 0.0
The two distributions are significantly different. 
===== KS test for PRON =====
KS statistic: 0.06995257073260086
p-value: 1.2265959395833205e-258
The two distributions are significantly different. 
===== KS test for SCONJ =====
KS statistic: 0.0
p-value: 1.0
===== KS test for SYM =====
KS statistic: 0.004367102937732015
p-value: 0.19690057964334792
===== KS test for X =====
KS statistic: 0.0034428522295403274
p-value: 0.4674693650568761
===== KS test for PERSON =====
KS statistic: 0.09407880548837111
p-value: 0.0
The two distributions are significantly different. 
===== KS test for NORP =====
KS statistic: 0.3093578779584073
p-value: 0.0
The two distributions are significantly different. 
===== KS test for FAC =====
KS statistic: 0.0001372413203811762
p-value: 0.9999999999999998
===== KS test for ORG =====
KS statistic: 0.013272179404682394
p-value: 1.0137461597272936e-09
The two distributions are significantly different. 
===== KS test for GPE =====
KS statistic: 0.08024333595431177
p-value: 0.0
The two distributions are significantly different. 
===== KS test for LOC =====
KS statistic: 0.01593505246154603
p-value: 7.978103928117553e-14
The two distributions are significantly different. 
===== KS test for PRODUCT =====
KS statistic: 0.003329168579452646
p-value: 0.5110730082239134
===== KS test for EVENT =====
KS statistic: 0.002687291471993425
p-value: 0.7726293163529684
===== KS test for WORK_OF_ART =====
KS statistic: 0.0018686404864821649
p-value: 0.9837798597348894
===== KS test for LAW =====
KS statistic: 0.002270596561190663
p-value: 0.9127969713412695
===== KS test for LANGUAGE =====
KS statistic: 0.0013589432188911843
p-value: 0.9998745643907627
===== KS test for DATE =====
KS statistic: 0.016170799755453547
p-value: 3.180582105070732e-14
The two distributions are significantly different. 
===== KS test for TIME =====
KS statistic: 0.001761013430913283
p-value: 0.9917271436649813
===== KS test for PERCENT =====
KS statistic: 0.002277878972795744
p-value: 0.9108623253635588
===== KS test for MONEY =====
KS statistic: 0.001213247931561634
p-value: 0.9999914531426718
===== KS test for QUANTITY =====
KS statistic: 0.0010873408185718691
p-value: 0.9999996752966367
===== KS test for ORDINAL =====
KS statistic: 0.003573301528227546
p-value: 0.4198663662283268
===== KS test for CARDINAL =====
KS statistic: 0.011329958638504944
p-value: 3.367636618202617e-07
The two distributions are significantly different. 
punctuation test results: ['all_punctuation', 'commas', 'periods', 'quotation_marks', 'question_marks', 'other_punctuations']
pos_tag test results: ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB', 'ADP', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON']
ent test results: ['PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'DATE', 'CARDINAL']