In [38]:
import os
import tarfile
import pandas
import json
import nltk.data
import nltk
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
import itertools
import string
import array
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, average_precision_score, fbeta_score, recall_score

In [3]:
sourceJ = 'data/araucaria/json'
sourceT = 'data/araucaria/txt'

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def read_files():
    for root, dir_names, file_names in os.walk(sourceJ):
        for file_name in file_names:
            file = open(os.path.join(sourceT, file_name[:-5]+'.txt'))
            content = tokenizer.tokenize(file.read())
            vals = []
            for line in open(os.path.join(root, file_name)):
                for node in json.loads(line)['nodes']:
                    vals.append(node['text'])

            r_val = [1]*len(vals)
            args= []
            non_args = []
            for con in content:
                is_args = False
                i = 0
                for val in vals:
                    if val in con:
                        is_args = True
                        r_val[i] = 0
                        break
                    i+=1
                if(is_args):
                    args.append(con)
                else:
                    non_args.append(con)
            
            i = 0
            for val in vals:
                if(r_val[i] and val!='RA'):
                    args.append(val)
                i+=1
                
            yield file_name, args, non_args

In [4]:
"""
file = open('output.txt', 'w')
for file_name, args, non_args in read_files():
    s = file_name, ':', args, ':', non_args
    file.write(str(s)+"\n")
"""

headings = ['arguments', 'non arguments']
index = []
data = []
for file_name, args, non_args in read_files():
    index.append(file_name)
    data.append([args, non_args])
    
df = pandas.DataFrame(index = index, data = data, columns = headings)

In [5]:
df.shape

(661, 2)

In [6]:
adverbs = ['RB', 'RBR', 'RBS']
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
verbs_remove = ['to be', 'to do', 'to have']
#nltk.help.upenn_tagset()

In [7]:
def initialize():
    global word_list_1, word_list_2, word_list_3, adverb_list, verb_list, cnt_1, cnt_2, cnt_3, cnt_v, cnt_adv, cnt_c_2
#     word_couples
    word_list_1 = dict([])
    word_list_2 = dict([])
    word_list_3 = dict([])
    adverb_list = dict([])
    verb_list = dict([])
#     word_couples = dict([])
    cnt_1 = 0
    cnt_2 = 0
    cnt_3 = 0
    cnt_v = 0
    cnt_adv = 0
    cnt_c_2 = 0

In [8]:
initialize()
def initialize_dicts_s(data):
    global word_list_1, word_list_2, word_list_3, adverb_list, verb_list, word_couples, cnt_1, cnt_2, cnt_3, cnt_v, cnt_adv, cnt_c_2
    words = nltk.word_tokenize(data)
    for i, word in enumerate(words):
        if word not in word_list_1:
            word_list_1[word] = cnt_1
            cnt_1 += 1
        if i>0:
            word2 = '{} {}'.format(words[i-1], words[i])
            if word2 not in word_list_2:
                word_list_2[word2] = cnt_2
                cnt_2 += 1
        if i>1:
            word3 = '{} {} {}'.format(words[i-2], words[i-1], words[i])
            if word3 not in word_list_3:
                word_list_3[word3] = cnt_3
                cnt_3 += 1

    cnt_adv = 0
    cnt_v = 0
    pos_tags = nltk.pos_tag(words)
    for word, tag in pos_tags:
        if tag in adverbs:
            if word not in adverb_list:
                adverb_list[word] = cnt_adv
                cnt_adv += 1
        if tag in verbs and word not in verbs_remove:
            if word not in verb_list:
                verb_list[word] = cnt_v
                cnt_v += 1
"""
    cnt_c_2 = 0
    length = len(words)
    for i in range(length-1):
        for j in range(i+1, length):
            word_c_2 = '{} {}'.format(words[i], words[j])
            if word_c_2 not in word_couples:
                word_couples[word_c_2] = cnt_c_2
                cnt_c_2 += 1
"""

"\n    cnt_c_2 = 0\n    length = len(words)\n    for i in range(length-1):\n        for j in range(i+1, length):\n            word_c_2 = '{} {}'.format(words[i], words[j])\n            if word_c_2 not in word_couples:\n                word_couples[word_c_2] = cnt_c_2\n                cnt_c_2 += 1\n"

In [9]:
def initialize_dicts_all():
    initialize()
    for heading in headings:
        for content in df[heading]:
            for sentence in content:
                if(sentence != ''):
                    initialize_dicts_s(sentence)

In [10]:
def feature_vector(sentence):
    global word_list_1, word_list_2, word_list_3, adverb_list, verb_list, word_couples, cnt_1, cnt_2, cnt_3, cnt_v, cnt_adv, cnt_c_2

    feature_1 = csr_matrix([0]*(len(word_list_1)))
    feature_2 = csr_matrix([0]*len(word_list_2))
    feature_3 = csr_matrix([0]*len(word_list_3))
    feature_adv = csr_matrix([0]*len(adverb_list))
    feature_v = csr_matrix([0]*len(verb_list))
#     feature_couples = [0]*len(word_couples)
    
    words = nltk.word_tokenize(sentence)
    for i, word in enumerate(words):
        feature_1[0, word_list_1[word]] = 1
        if word in adverb_list:
            feature_adv[0, adverb_list[word]] = 1
        if word in verb_list:
            feature_v[0, verb_list[word]] = 1
        if i>0:
            word2 = '{} {}'.format(words[i-1], words[i])
            feature_2[0, word_list_2[word2]] = 1
        if i>1:
            word3 = '{} {} {}'.format(words[i-2], words[i-1], words[i])
            feature_3[0, word_list_3[word3]] = 1

#     feature_couples = ''.join(map(str, feature_couples))
#     feature = feature_1 + feature_2 + feature_3 + feature_adv + feature_v + feature_couples
    feature = csr_matrix([feature_1, feature_2,  feature_3,  feature_adv,  feature_v])
    return feature

"""
    feature_1 = hash(''.join(map(str, feature_1)))
    feature_2 = hash(''.join(map(str, feature_2)))
    feature_3 = hash(''.join(map(str, feature_3)))
    feature_adv = hash(''.join(map(str, feature_adv)))
    feature_v = hash(''.join(map(str, feature_v)))
"""
"""
    length = len(words)
    for i in range(length-1):
        for j in range(i+1, length):
            word_c_2 = '{} {}'.format(words[i], words[j])
            feature_couples[word_couples[word_c_2]] = 1
"""

"\n    length = len(words)\n    for i in range(length-1):\n        for j in range(i+1, length):\n            word_c_2 = '{} {}'.format(words[i], words[j])\n            feature_couples[word_couples[word_c_2]] = 1\n"

In [11]:
def train_data():
    initialize_dicts_all()
    feature_vector_list = []
    targets = []
    for heading in headings:
        for content in df[heading]:
            for sentence in content:
                if sentence != '':
                    feature_vector_list.append(feature_vector(sentence))
                    targets.append(heading)
    
    return feature_vector_list, targets

In [15]:
feature_vector_list, targets = train_data()



SystemError: PyEval_EvalFrameEx returned a result with an error set

In [33]:
len(targets)

6702

In [35]:
len(feature_vector_list)

6702

In [1]:
a = [1, 2, 3, 4]
a = str(a)

In [12]:
def _make_int_array():
    return array.array(str("i"))

In [13]:
def _ngrams(tokens):
    o_tokens = tokens
    tokens = []
    n_o_tokens = len(o_tokens)
    
    for n in range(2,4):
        for j in range(n_o_tokens - n + 1):
            yield ' '.join(o_tokens[j:j+n])

In [14]:
def _couples(tokens):
    length = len(tokens)
    for i in range(length-1):
        for j in range(i+1, length):
            yield 'c_{} {}'.format(tokens[i], tokens[j])

In [15]:
def _adverbs(pos_tags):
    p = False
    for word, tag in pos_tags:
        if tag in adverbs:
            p = True
            yield 'adv_' + word

    if not p:
        yield 'no_adverbs'

In [16]:
def _verbs(pos_tags):
    p = False
    for word, tag in pos_tags:
        if tag in verbs and word not in verbs_remove:
            p = True
            yield 'v_' + word
            
    if not p:
        yield 'no_verbs'

In [17]:
def _modal_aux(pos_tags):
    for word, tag in pos_tags:
        if tag == 'MD':
            return 'MD_1', 1
    return 'MD_1', 0

In [18]:
def _sen_len(tokens):
    yield 's_len', len(tokens)

In [19]:
def _avg_word_len(tokens):
    c_len = 0
    for token in tokens:
        c_len += len(token)
    yield 'avg_word_len', round(c_len/len(tokens))

In [20]:
def _punc_data(s):
    punc_list = list(filter(lambda c: c in s, string.punctuation))
    punc_len = len(punc_list)
    yield 'punc_len', punc_len
    single_punc = []
    i = 0
    while i < punc_len:
        single_punc.append(punc_list[i])
        last = punc_list[i]
        i += 1
        while(i < punc_len and punc_list[i] == last):
            i += 1
            
    yield ''.join(single_punc), 1

In [21]:
def initialize_new():
    global features_
    features_ = defaultdict()
    features_.default_factory = features_.__len__


In [47]:
def _feature_dict(sentence, fixed_features):
    global features_
    typeF = ''
    if fixed_features == True:
        features = dict(features_)
        typeF = 'dict'
    else:
        features = features_
        typeF = 'defaultDict'

    feature_dict = {}
    tokens = nltk.word_tokenize(sentence)
    if len(tokens) > 2:
        pos_tags = nltk.pos_tag(tokens)

        tag, val = _modal_aux(pos_tags)
        if typeF == 'dict':
            if tag in features:
                feature_num = features[tag]
                feature_dict[feature_num] = val
        else:
            feature_num = features[tag]
            feature_dict[feature_num] = val

        for tag, val in itertools.chain(_sen_len(tokens), _avg_word_len(tokens), _punc_data(sentence)):
            if typeF == 'dict':
                if tag in features:
                    feature_num = features[tag]
                    feature_dict[feature_num] = val
            else:
                feature_num = features[tag]
                feature_dict[feature_num] = val

        for token in itertools.chain(_adverbs(pos_tags), _verbs(pos_tags), tokens, _ngrams(tokens), _couples(tokens)):
            if typeF == 'dict':
                if token in features:
                    feature_num = features[token]
                    if feature_num not in feature_dict:
                        feature_dict[feature_num] = 1
                    else:
                        feature_dict[feature_num] += 1
            else:
                feature_num = features[token]
                if feature_num not in feature_dict:
                    feature_dict[feature_num] = 1
                else:
                    feature_dict[feature_num] += 1

    return feature_dict

In [48]:
def fit_transform(dataFrame, fixed_features=False):
    global features_
    j_indices = []
    indptr = _make_int_array()
    values = []
    indptr.append(0)
    targets = []
    for i, heading in enumerate(headings):
        for content in dataFrame[heading]:
            for sentence in content:
                feature_dict = _feature_dict(sentence, fixed_features)
                # print(sentence, feature_dict)
                if len(feature_dict) != 0:
                    targets.append(i)
                    j_indices.extend(feature_dict.keys())
                    values.extend(feature_dict.values())
                    indptr.append(len(j_indices))

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    targets = np.asarray(targets, dtype=np.intc)
    X = csr_matrix((values, j_indices, indptr), shape = (len(indptr) - 1, len(features_)), dtype=np.int64)
    return X, targets

In [49]:
initialize_new()
X, targets = fit_transform(df)

In [50]:
type(X)

scipy.sparse.csr.csr_matrix

In [51]:
len(targets)

6543

In [52]:
X.shape

(6543, 701401)

In [53]:
X.data

array([ 1, 23,  5, ...,  1,  1,  1])

In [54]:
classifier = MultinomialNB()
classifier.fit(X, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
next(df.iterrows())[1]

arguments        [Indonesia cannot afford to become a haven for...
non arguments                                                   []
Name: nodeset190.json, dtype: object

In [56]:
df['arguments'][0]

['Indonesia cannot afford to become a haven for an Islamic radicalism which would wreck the economy and tear the archipelago apart.',
 'The war on terror will be intensified.',
 'If Indonesia wants to avoid becoming a haven for Islamic radicalism the war on terror has to be intensified',
 'There is a possibility for Indonesia to become a haven for Islamic radicalism',
 'which would wreck the economy and tear the archipelago apart.']

In [57]:
df['non arguments'][4]

['{8} During the last thirty years, scientists have discovered that the existence of intelligent life depends upon a complex and delicately balanced set of initial conditions simply given in the Big Bang itself.',
 'We now know that life-prohibiting universes are vastly more probable than life- permitting universes like ours.',
 'How much more probable?',
 'Well, before I give you an estimation, let me just give you some numbers to give you a feel for the odds.',
 "The number of seconds in the history of the universe is about 1018, that's ten followed by eighteen zeros.",
 'The number of subatomic particles in the entire universe is about1080.',
 'Now with those numbers in mind, consider the following.',
 "Donald Page, one of America's eminent cosmologists, has calculated the odds of our universe existing as on the order of one chance out of 1010(123), a number which is so inconceivable that to call it astronomical would be a wild understatement!",
 '{9} Robert Jastrow, the head of NAS

In [58]:
type(df.iloc[[0]])

pandas.core.frame.DataFrame

In [59]:
# j_indices = []
# values = []
# indptr = _make_int_array()
# indptr.append(0)

# feature_dict = _feature_dict('Indonesia cannot afford to become a haven for an Islamic radicalism which would wreck the economy and tear the archipelago apart.')
# j_indices = np.asarray(j_indices, dtype=np.intc)
# indptr = np.frombuffer(indptr, dtype=np.intc)
# targets = np.asarray(targets, dtype=np.intc)

X, targets = fit_transform(df.iloc[[0]])

classifier.predict(X)

array([0, 0, 0, 0, 0], dtype=int32)

In [60]:
X.shape

(5, 701401)

In [61]:
len(targets)

5

In [69]:
def kFoldTest(data, classifierT = 'multinomialNB'):
#     pipeline = Pipeline([
#             ('vectorizer', Vectorize()),
#             ('classifier', classifier)
#         ])

    kFold = KFold(n = len(data), n_folds = 4)
    scores = []
    p_scores = []
    r_scores = []
    confusionMatrix = np.array([[0]*2]*2)
#     vectorizer = Vectorize()
    #classifier = MultinomialNB()
    #classifier = BernoulliNB()
    #classifier = SVC()
    #classifier = RandomForestClassifier()
    for train_indices, test_indices in kFold:
        train_data = data.iloc[train_indices]
        test_data = data.iloc[test_indices]

        if(classifierT == 'multinomialNB'):
            classifier = MultinomialNB()
        elif(classifierT == 'maxent'):
            classifier = LogisticRegression()

        initialize_new()

        X, targets = fit_transform(train_data)        
        train_y = targets
        classifier.fit(X, train_y)

        print(X.shape)
        
        X, targets = fit_transform(test_data, fixed_features=True)

        print(X.shape)

        predictions = classifier.predict(X)
        test_y = targets

        # print(test_y.shape)
        # print(predictions.shape)
        # print(confusion_matrix(test_y, predictions))
        confusionMatrix += confusion_matrix(test_y, predictions)
        score = f1_score(test_y, predictions, average='binary')
        p_score = precision_score(test_y, predictions, average='binary')
        r_score = recall_score(test_y, predictions, average='binary')
        
        p_scores.append(p_score)
        r_scores.append(r_score)        
        scores.append(score)

    print('Score:', sum(scores)/len(scores))
    print('Precision Score:', sum(p_scores)/len(p_scores))
    print('Recall Score:', sum(r_scores)/len(r_scores))
    print('Confusion matrix:')
    print(confusionMatrix)

In [67]:
kFoldTest('multinomialNB', df)

(4978, 583322)
(1565, 583322)
(4861, 580951)
(1682, 580951)
(4868, 563002)
(1675, 563002)
(4922, 573278)
(1621, 573278)
Score: 0.73806027188
Precision Score: 0.767051730881
Recall Score: 0.71167835531
Confusion matrix:
[[3484  543]
 [ 727 1789]]


In [70]:
kFoldTest(df, classifierT='maxent')

(4978, 583322)
(1565, 583322)
(4861, 580951)
(1682, 580951)
(4868, 563002)
(1675, 563002)
(4922, 573278)
(1621, 573278)
Score: 0.825608135503
Precision Score: 0.827634422406
Recall Score: 0.823856576752
Confusion matrix:
[[3596  431]
 [ 444 2072]]


In [170]:
def try1():
    yield 1, 2
    yield 3, 4
    
def try2():
    return [1, 2]

for i, j in itertools.chain(try1(), try2()):
    print(i, j)

1 2
3 4


TypeError: 'int' object is not iterable

In [171]:
s = 'a,b;c.'.translate(string.punctuation)
s

'a,b;c.'

In [172]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [173]:
list(filter(lambda c: c in s, string.punctuation))

[',', '.', ';']

In [174]:
for s in _ngrams(nltk.word_tokenize('hi I am Varun Raval')):
    print(s)

hi I
I am
am Varun
Varun Raval
hi I am
I am Varun
am Varun Raval


In [42]:
d = defaultdict()
d.default_factory = None
d[1] = 'a'

In [44]:
type(d)

collections.defaultdict

In [45]:
e = {}

In [46]:
type(e)

dict