In [1]:
import array
import itertools
import json
import os
import string
import tarfile
from collections import defaultdict

import nltk
import nltk.data
import numpy as np
import pandas
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, average_precision_score, fbeta_score, recall_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

try:  # scikit-learn >= 0.18 ships KFold in model_selection
    from sklearn.model_selection import KFold
except ImportError:  # older releases only provide the legacy module
    from sklearn.cross_validation import KFold

In [91]:
def get_pos_tags(sentence):
    """Tokenize `sentence` with NLTK and return its list of (token, tag) pairs."""
    tokens = nltk.word_tokenize(sentence)
    return nltk.pos_tag(tokens)

In [75]:
def _read_with_fallback(path, parse, encodings):
    """Return ``parse(handle)`` for `path`, trying each encoding in turn.

    Only decoding failures trigger the next encoding; any other error
    (missing file, malformed JSON, ...) propagates immediately.  `parse` is
    re-run from scratch on every attempt, so a failure halfway through a file
    never leaves partial results behind.
    """
    last_error = None
    for encoding in encodings:
        try:
            with open(path, encoding=encoding) as handle:
                return parse(handle)
        except UnicodeDecodeError as error:
            last_error = error
    raise last_error


def _node_texts(handle):
    """Collect the 'text' of every node from a file holding one JSON object per line."""
    return [node['text']
            for line in handle
            for node in json.loads(line)['nodes']]


def read_files(sources):
    """Yield, per annotated document, its argument and non-argument sentences.

    Parameters
    ----------
    sources : iterable of (json_dir, txt_dir) pairs
        ``json_dir`` is walked recursively for ``*.json`` annotation files; the
        raw text of each is read from ``txt_dir/<same base name>.txt``.

    Yields
    ------
    (file_name, args, non_args, pos_args, pos_non_args)
        ``args`` / ``non_args`` are lists of strings and ``pos_args`` /
        ``pos_non_args`` the matching lists of NLTK (token, tag) lists.
        A sentence of the text counts as an argument when it contains the text
        of at least one annotation node.  Node texts that were not matched by
        any sentence are appended to ``args`` as well, except the literal
        'RA' (presumably a relation marker rather than real text - confirm).

    Raises
    ------
    FileNotFoundError
        If the text file for an annotation file does not exist.
    UnicodeDecodeError
        If a file cannot be decoded with any of the attempted encodings.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for source in sources:
        print(source)
        json_dir, txt_dir = source[0], source[1]
        for root, dir_names, file_names in os.walk(json_dir):
            for file_name in file_names:
                base_name, extension = os.path.splitext(file_name)
                if extension != '.json':
                    # Stray files (e.g. OS metadata) have no matching text file.
                    continue

                # The text is first read with the platform default encoding,
                # then with windows-1252, mirroring the original fallback.
                content = _read_with_fallback(
                    os.path.join(txt_dir, base_name + '.txt'),
                    lambda handle: tokenizer.tokenize(handle.read()),
                    (None, 'windows-1252'))
                vals = _read_with_fallback(
                    os.path.join(root, file_name),
                    _node_texts,
                    ('utf-8', 'windows-1252'))

                unmatched = [True] * len(vals)
                args = []
                non_args = []
                pos_args = []
                pos_non_args = []
                for con in content:
                    # Only the first node contained in the sentence is marked
                    # as matched; later nodes stay eligible for the pass below.
                    hit = next((i for i, val in enumerate(vals) if val in con), None)
                    pos_con = nltk.pos_tag(nltk.word_tokenize(con))
                    if hit is not None:
                        unmatched[hit] = False
                        args.append(con)
                        pos_args.append(pos_con)
                    else:
                        non_args.append(con)
                        pos_non_args.append(pos_con)

                # Annotation texts that never appeared inside a sentence still
                # count as arguments in their own right.
                for val, is_unmatched in zip(vals, unmatched):
                    if is_unmatched and val != 'RA':
                        args.append(val)
                        pos_args.append(nltk.pos_tag(nltk.word_tokenize(val)))

                yield file_name, args, non_args, pos_args, pos_non_args

In [76]:
def get_df(sources):
    """Load every document from `sources` into two parallel DataFrames.

    Parameters
    ----------
    sources : iterable of (json_dir, txt_dir) pairs
        Passed through unchanged to ``read_files``.

    Returns
    -------
    (df, pos_df) : tuple of pandas.DataFrame
        Both frames are indexed by annotation file name and have the columns
        'arguments' and 'non arguments'.  Each cell of ``df`` is a list of
        sentences; the corresponding cell of ``pos_df`` is the list of
        (token, tag) lists for those same sentences.
    """
    headings = ['arguments', 'non arguments']
    index, data, pos_data = [], [], []
    for file_name, args, non_args, pos_args, pos_non_args in read_files(sources):
        index.append(file_name)
        data.append([args, non_args])
        pos_data.append([pos_args, pos_non_args])

    df = pandas.DataFrame(index = index, data = data, columns = headings)
    pos_df = pandas.DataFrame(index = index, data = pos_data, columns = headings)
    return df, pos_df

In [77]:
sources = [['data/araucaria/json', 'data/araucaria/txt']]
df, pos_df = get_df(sources)

['data/araucaria/json', 'data/araucaria/txt']


In [78]:
df.shape

(661, 2)

In [5]:
# POS tags that _adverbs treats as adverbs.
adverbs = ['RB', 'RBR', 'RBS']
# POS tags that _verbs treats as verbs.
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Words that _verbs excludes from the verb features.
# NOTE(review): these entries are multi-word phrases, but _verbs compares them
# against single tagged tokens, so this filter probably never matches - confirm
# whether inflected forms ('is', 'was', 'does', 'has', ...) were intended.
verbs_remove = ['to be', 'to do', 'to have']
# Uncomment to print the tag set the names above are taken from:
#nltk.help.upenn_tagset()

In [6]:
def _make_int_array():
    return array.array(str("i"))

In [7]:
def _ngrams(tokens):
    o_tokens = tokens
    tokens = []
    n_o_tokens = len(o_tokens)
    
    for n in range(2,4):
        for j in range(n_o_tokens - n + 1):
            yield ' '.join(o_tokens[j:j+n])

In [8]:
def _couples(tokens):
    length = len(tokens)
    for i in range(length-1):
        for j in range(i+1, length):
            yield 'c_{} {}'.format(tokens[i], tokens[j])

In [9]:
def _adverbs(pos_tags):
    p = False
    for word, tag in pos_tags:
        if tag in adverbs:
            p = True
            yield 'adv_' + word

    if not p:
        yield 'no_adverbs'

In [10]:
def _verbs(pos_tags):
    p = False
    for word, tag in pos_tags:
        if tag in verbs and word not in verbs_remove:
            p = True
            yield 'v_' + word
            
    if not p:
        yield 'no_verbs'

In [11]:
def _modal_aux(pos_tags):
    for word, tag in pos_tags:
        if tag == 'MD':
            return 'MD_1', 1
    return 'MD_1', 0

In [12]:
def _sen_len(tokens):
    yield 's_len', len(tokens)

In [13]:
def _avg_word_len(tokens):
    c_len = 0
    for token in tokens:
        c_len += len(token)
    yield 'avg_word_len', round(c_len/len(tokens))

In [14]:
def _punc_data(s):
    punc_list = list(filter(lambda c: c in s, string.punctuation))
    punc_len = len(punc_list)
    yield 'punc_len', punc_len
    single_punc = []
    i = 0
    while i < punc_len:
        single_punc.append(punc_list[i])
        last = punc_list[i]
        i += 1
        while(i < punc_len and punc_list[i] == last):
            i += 1
            
    yield ''.join(single_punc), 1

In [15]:
def initialize_new():
    """Reset the global `features_` vocabulary to an empty auto-numbering mapping.

    Looking up an unseen key stores and returns the current size of the
    mapping, so feature names receive consecutive indices 0, 1, 2, ...
    """
    global features_
    vocabulary = defaultdict()
    vocabulary.default_factory = vocabulary.__len__
    features_ = vocabulary


In [16]:
def _feature_dict(sentence, pos_tags, fixed_features):
    global features_
    typeF = ''
    if fixed_features == True:
        features = dict(features_)
        typeF = 'dict'
    else:
        features = features_
        typeF = 'defaultDict'

    feature_dict = {}
    tokens = nltk.word_tokenize(sentence)
    if len(tokens) > 2:
#         pos_tags = nltk.pos_tag(tokens)

        tag, val = _modal_aux(pos_tags)
        if typeF == 'dict':
            if tag in features:
                feature_num = features[tag]
                feature_dict[feature_num] = val
        else:
            feature_num = features[tag]
            feature_dict[feature_num] = val

        for tag, val in itertools.chain(_sen_len(tokens), _avg_word_len(tokens), _punc_data(sentence)):
            if typeF == 'dict':
                if tag in features:
                    feature_num = features[tag]
                    feature_dict[feature_num] = val
            else:
                feature_num = features[tag]
                feature_dict[feature_num] = val

        for token in itertools.chain(_adverbs(pos_tags), _verbs(pos_tags), tokens, _ngrams(tokens), _couples(tokens)):
            if typeF == 'dict':
                if token in features:
                    feature_num = features[token]
                    if feature_num not in feature_dict:
                        feature_dict[feature_num] = 1
                    else:
                        feature_dict[feature_num] += 1
            else:
                feature_num = features[token]
                if feature_num not in feature_dict:
                    feature_dict[feature_num] = 1
                else:
                    feature_dict[feature_num] += 1

    return feature_dict

In [102]:
def get_features(feature_dict):
    """Expand a sparse ``{feature index: value}`` dict into a dense (1, n) float row.

    ``n`` is the current size of the global `features_` vocabulary; indices
    missing from `feature_dict` are left at zero.
    """
    dense = np.zeros(len(features_))
    for column, value in feature_dict.items():
        dense[column] = value

    return dense.reshape(1, -1)

In [17]:
def fit_transform(dataFrame, pos_dataFrame, fixed_features=False,
                  headings=('arguments', 'non arguments')):
    """Vectorize every sentence of `dataFrame` into a sparse feature matrix.

    Parameters
    ----------
    dataFrame : mapping of column name -> sequence of sentence lists
        Typically the first frame returned by ``get_df``.
    pos_dataFrame : same layout as `dataFrame`
        Holds the (word, tag) list of each sentence at the same position.
    fixed_features : bool, default False
        Forwarded to ``_feature_dict``: False grows the global `features_`
        vocabulary, True restricts the output to already known features.
    headings : sequence of str, default ('arguments', 'non arguments')
        Columns to read.  The position of a column in this sequence is the
        class label assigned to its sentences.  This used to be read from an
        undefined global, which raised NameError on a fresh kernel.

    Returns
    -------
    (X, targets)
        ``X`` is a ``scipy.sparse.csr_matrix`` of dtype int64 with one row per
        kept sentence and ``len(features_)`` columns; ``targets`` is the
        matching array of class labels.  Sentences for which
        ``_feature_dict`` returns an empty dict are skipped.
    """
    j_indices = []
    values = []
    targets = []
    indptr = _make_int_array()
    indptr.append(0)

    for label, heading in enumerate(headings):
        # Walking both columns in lock-step pairs rows by position and avoids
        # integer lookups on the file-name index.
        for content, pos_content in zip(dataFrame[heading], pos_dataFrame[heading]):
            for sentence, pos_tags in zip(content, pos_content):
                feature_dict = _feature_dict(sentence, pos_tags, fixed_features)
                if len(feature_dict) == 0:
                    continue
                targets.append(label)
                j_indices.extend(feature_dict.keys())
                values.extend(feature_dict.values())
                indptr.append(len(j_indices))

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    targets = np.asarray(targets, dtype=np.intc)
    X = csr_matrix((values, j_indices, indptr),
                   shape = (len(indptr) - 1, len(features_)), dtype=np.int64)
    return X, targets

In [18]:
initialize_new()
X, targets = fit_transform(df, pos_df)

In [19]:
type(X)

scipy.sparse.csr.csr_matrix

In [20]:
len(targets)

6543

In [21]:
X.shape

(6543, 701401)

In [22]:
X.data

array([ 1, 23,  5, ...,  1,  1,  1])

In [23]:
classifier = MultinomialNB()
classifier.fit(X, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
next(df.iterrows())[1]

arguments        [Indonesia cannot afford to become a haven for...
non arguments                                                   []
Name: nodeset190.json, dtype: object

In [43]:
df['arguments'][0]

['Indonesia cannot afford to become a haven for an Islamic radicalism which would wreck the economy and tear the archipelago apart.',
 'The war on terror will be intensified.',
 'If Indonesia wants to avoid becoming a haven for Islamic radicalism the war on terror has to be intensified',
 'There is a possibility for Indonesia to become a haven for Islamic radicalism',
 'which would wreck the economy and tear the archipelago apart.']

In [44]:
df['non arguments'][4]

['{8} During the last thirty years, scientists have discovered that the existence of intelligent life depends upon a complex and delicately balanced set of initial conditions simply given in the Big Bang itself.',
 'We now know that life-prohibiting universes are vastly more probable than life- permitting universes like ours.',
 'How much more probable?',
 'Well, before I give you an estimation, let me just give you some numbers to give you a feel for the odds.',
 "The number of seconds in the history of the universe is about 1018, that's ten followed by eighteen zeros.",
 'The number of subatomic particles in the entire universe is about1080.',
 'Now with those numbers in mind, consider the following.',
 "Donald Page, one of America's eminent cosmologists, has calculated the odds of our universe existing as on the order of one chance out of 1010(123), a number which is so inconceivable that to call it astronomical would be a wild understatement!",
 '{9} Robert Jastrow, the head of NAS

In [45]:
type(df.iloc[[0]])

pandas.core.frame.DataFrame

In [104]:
# j_indices = []
# values = []
# indptr = _make_int_array()
# indptr.append(0)

# feature_dict = _feature_dict('Indonesia cannot afford to become a haven for an Islamic radicalism which would wreck the economy and tear the archipelago apart.')
# j_indices = np.asarray(j_indices, dtype=np.intc)
# indptr = np.frombuffer(indptr, dtype=np.intc)
# targets = np.asarray(targets, dtype=np.intc)

X, targets = fit_transform(df.iloc[[0]], pos_df.iloc[[0]], fixed_features=True)
print(X.shape)
print(classifier.predict(X))

sentence = 'We now know that life-prohibiting universes are vastly more probable than life- permitting universes like ours.'
features = get_features(_feature_dict(sentence, get_pos_tags(sentence), fixed_features=True))
print(classifier.predict(features))
features.shape

(5, 701401)
[0 0 0 0 0]
[1]


(1, 701401)

In [52]:
X.shape

(5, 701402)

In [50]:
len(targets)

5

In [59]:
def kFoldTest(data, pos_data, classifierT = 'multinomialNB'):
    """Run an unshuffled 4-fold cross-validation over documents and print the scores.

    Parameters
    ----------
    data, pos_data : pandas.DataFrame
        The two frames returned by ``get_df``; folds are formed over their rows.
    classifierT : {'multinomialNB', 'maxent'}
        'multinomialNB' trains ``MultinomialNB``, 'maxent' trains
        ``LogisticRegression``.

    Raises
    ------
    ValueError
        If `classifierT` is not one of the supported names.

    Notes
    -----
    Each fold calls ``initialize_new()``, so the global `features_` vocabulary
    is rebuilt from that fold's training rows only; after the function returns
    it holds the vocabulary of the last fold.  Prints the mean F1, precision
    and recall over the folds and the summed confusion matrix.
    """
    classifier_factories = {
        'multinomialNB': MultinomialNB,
        'maxent': LogisticRegression,
    }
    if classifierT not in classifier_factories:
        raise ValueError('Unknown classifierT {!r}; expected one of {}'.format(
            classifierT, sorted(classifier_factories)))

    try:
        # sklearn.model_selection API
        folds = KFold(n_splits=4).split(data)
    except TypeError:
        # legacy sklearn.cross_validation API: the object itself is iterable
        folds = KFold(n = len(data), n_folds = 4)

    scores = []
    p_scores = []
    r_scores = []
    confusionMatrix = np.zeros((2, 2), dtype=np.int64)

    for train_indices, test_indices in folds:
        train_data = data.iloc[train_indices]
        test_data = data.iloc[test_indices]
        train_pos_data = pos_data.iloc[train_indices]
        test_pos_data = pos_data.iloc[test_indices]

        classifier = classifier_factories[classifierT]()

        # Start from an empty vocabulary so nothing leaks in from test rows.
        initialize_new()

        train_X, train_y = fit_transform(train_data, train_pos_data)
        classifier.fit(train_X, train_y)
        print(train_X.shape)

        test_X, test_y = fit_transform(test_data, test_pos_data, fixed_features=True)
        print(test_X.shape)

        predictions = classifier.predict(test_X)

        # Fixing the labels keeps the matrix 2x2 even if a fold lacks a class.
        confusionMatrix += confusion_matrix(test_y, predictions, labels=[0, 1])
        scores.append(f1_score(test_y, predictions, average='binary'))
        p_scores.append(precision_score(test_y, predictions, average='binary'))
        r_scores.append(recall_score(test_y, predictions, average='binary'))

    print('Score:', sum(scores)/len(scores))
    print('Precision Score:', sum(p_scores)/len(p_scores))
    print('Recall Score:', sum(r_scores)/len(r_scores))
    print('Confusion matrix:')
    print(confusionMatrix)

In [60]:
kFoldTest(df, pos_df, 'multinomialNB')

(4978, 583323)
(1565, 583323)
(4861, 580952)
(1682, 580952)
(4868, 563002)
(1675, 563002)
(4922, 573279)
(1621, 573279)
Score: 0.73806027188
Precision Score: 0.767051730881
Recall Score: 0.71167835531
Confusion matrix:
[[3484  543]
 [ 727 1789]]


In [70]:
# pos_data is a required argument of kFoldTest; omitting it raises TypeError.
kFoldTest(df, pos_df, classifierT='maxent')

(4978, 583322)
(1565, 583322)
(4861, 580951)
(1682, 580951)
(4868, 563002)
(1675, 563002)
(4922, 573278)
(1621, 573278)
Score: 0.825608135503
Precision Score: 0.827634422406
Recall Score: 0.823856576752
Confusion matrix:
[[3596  431]
 [ 444 2072]]


In [170]:
# Scratch experiment: what can be passed to itertools.chain and unpacked as pairs?
def try1():
    # Generator of 2-tuples: each chained item unpacks cleanly into (i, j).
    yield 1, 2
    yield 3, 4
    
def try2():
    # Returns ONE flat list, so chain() hands out its elements 1 and 2
    # individually instead of a single (1, 2) pair.
    return [1, 2]

# The first two iterations come from try1(); the third item is the bare int 1
# from try2(), which cannot be unpacked into `i, j` - hence the TypeError in
# the output below.
for i, j in itertools.chain(try1(), try2()):
    print(i, j)

1 2
3 4


TypeError: 'int' object is not iterable

In [171]:
# Scratch experiment: string.punctuation is passed directly as the translation
# table, and the output below shows the string comes back unchanged.
# NOTE(review): to actually strip punctuation, build a table with
# str.maketrans('', '', string.punctuation).
s = 'a,b;c.'.translate(string.punctuation)
s

'a,b;c.'

In [172]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [173]:
list(filter(lambda c: c in s, string.punctuation))

[',', '.', ';']

In [174]:
for s in _ngrams(nltk.word_tokenize('hi I am Varun Raval')):
    print(s)

hi I
I am
am Varun
Varun Raval
hi I am
I am Varun
am Varun Raval


In [42]:
d = defaultdict()
d.default_factory = None
d[1] = 'a'

In [44]:
type(d)

collections.defaultdict

In [45]:
e = {}

In [46]:
type(e)

dict

In [54]:
nltk.pos_tag(nltk.word_tokenize('my friends are a, b, c, and d.'))

[('my', 'PRP$'),
 ('friends', 'NNS'),
 ('are', 'VBP'),
 ('a', 'DT'),
 (',', ','),
 ('b', 'NN'),
 (',', ','),
 ('c', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('d', 'NN'),
 ('.', '.')]

### Using this classifier on another dataset

In [79]:
# read_files expects one [json_dir, txt_dir] pair per corpus.
sources = [['data/schemes/json', 'data/schemes/txt']]
df1, pos_df1 = get_df(sources)

['data/schemes/json', 'data/schemes/txt']


FileNotFoundError: [Errno 2] No such file or directory: 'data/schemes/txt/nodeset1783.txt'