In [2]:
import os, tarfile, pandas, json, itertools, string, array, pickle
import nltk.data, nltk
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, average_precision_score, fbeta_score, recall_score



In [2]:
def get_pos_tags(sentence):
    """Tokenize ``sentence`` with NLTK and return the POS tagging of its tokens.

    The result is whatever ``nltk.pos_tag`` returns for the output of
    ``nltk.word_tokenize(sentence)``.
    """
    word_tokens = nltk.word_tokenize(sentence)
    return nltk.pos_tag(word_tokens)

In [3]:
def _read_text_sentences(path, tokenizer):
    """Return the sentences of the text file at ``path`` as split by ``tokenizer``.

    The file is first read with the platform default encoding; if its bytes
    cannot be decoded that way, it is re-read as windows-1252.
    """
    try:
        with open(path) as handle:
            text = handle.read()
    except UnicodeDecodeError:
        with open(path, encoding='windows-1252') as handle:
            text = handle.read()
    return tokenizer.tokenize(text)


def _load_node_texts(path, encodings=('utf-8', 'windows-1252')):
    """Collect ``node['text']`` for every node of every JSON line in ``path``.

    Each encoding is tried in turn. If none of them yields decodable, valid
    JSON, the error from the last attempt is raised instead of retrying
    forever.
    """
    last_error = None
    for encoding in encodings:
        # Start from scratch on every attempt so that a failure half-way
        # through the file cannot leave duplicated entries behind.
        texts = []
        try:
            with open(path, encoding=encoding) as handle:
                for line in handle:
                    for node in json.loads(line)['nodes']:
                        texts.append(node['text'])
            return texts
        except ValueError as error:
            # Covers UnicodeDecodeError and json.JSONDecodeError.
            last_error = error
    raise last_error


def read_files(sources):
    """Yield the argument / non-argument sentences of every annotated document.

    Parameters
    ----------
    sources : iterable of (json_dir, txt_dir) pairs
        ``json_dir`` is walked recursively; for each file found there, the
        text file with the same name minus its last five characters plus
        ``'.txt'`` is read from ``txt_dir`` (this assumes a ``.json`` suffix).

    Yields
    ------
    (file_name, args, non_args, pos_args, pos_non_args)
        ``args`` holds every sentence of the text file that contains at least
        one node text, followed by every node text that was not found in any
        sentence (except the literal ``'RA'``). ``non_args`` holds the
        remaining sentences. ``pos_args`` / ``pos_non_args`` hold the
        ``nltk.pos_tag`` output for the corresponding entries.

    Raises
    ------
    ValueError
        If an annotation file cannot be decoded/parsed with any of the
        supported encodings.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for source in sources:
        print(source)
        json_dir, text_dir = source[0], source[1]
        for root, dir_names, file_names in os.walk(json_dir):
            for file_name in file_names:
                text_path = os.path.join(text_dir, file_name[:-5] + '.txt')
                content = _read_text_sentences(text_path, tokenizer)
                vals = _load_node_texts(os.path.join(root, file_name))

                # matched[i] becomes True once vals[i] is the first node text
                # found inside some sentence.
                matched = [False] * len(vals)
                args = []
                non_args = []
                pos_args = []
                pos_non_args = []
                for con in content:
                    hit = next((i for i, val in enumerate(vals) if val in con), None)
                    pos_con = nltk.pos_tag(nltk.word_tokenize(con))
                    if hit is not None:
                        matched[hit] = True
                        args.append(con)
                        pos_args.append(pos_con)
                    else:
                        non_args.append(con)
                        pos_non_args.append(pos_con)

                # Node texts that never matched a sentence are still kept as
                # arguments. NOTE(review): 'RA' is presumably a relation-node
                # label rather than real text — confirm against the corpus.
                for val, was_matched in zip(vals, matched):
                    if not was_matched and val != 'RA':
                        args.append(val)
                        pos_args.append(nltk.pos_tag(nltk.word_tokenize(val)))

                yield file_name, args, non_args, pos_args, pos_non_args

In [4]:
"""
file = open('output.txt', 'w')
for file_name, args, non_args in read_files():
    s = file_name, ':', args, ':', non_args
    file.write(str(s)+"\n")
"""
def get_df(sources):
    """Build the sentence and POS-tag data frames for ``sources``.

    Side effect: (re)binds the module-level ``headings`` list, which names the
    two columns of both frames.

    Returns
    -------
    (df, pos_df)
        Two frames indexed by file name. ``df`` holds the argument and
        non-argument sentence lists yielded by ``read_files``; ``pos_df``
        holds the matching POS-tag lists.
    """
    global headings
    headings = ['arguments', 'non arguments']

    records = list(read_files(sources))
    index = [record[0] for record in records]
    data = [[record[1], record[2]] for record in records]
    pos_data = [[record[3], record[4]] for record in records]

    df = pandas.DataFrame(index=index, data=data, columns=headings)
    pos_df = pandas.DataFrame(index=index, data=pos_data, columns=headings)
    return df, pos_df

In [5]:
# Each entry is a [json_dir, txt_dir] pair, which is how read_files unpacks it.
sources = [['data/araucaria/json', 'data/araucaria/txt']]
# df holds the sentences, pos_df the matching POS tags (same index/columns).
df, pos_df = get_df(sources)

['data/araucaria/json', 'data/araucaria/txt']


In [6]:
# POS tags that _adverbs treats as adverbs (presumably Penn Treebank tags,
# see the upenn_tagset hint below).
adverbs = ['RB', 'RBR', 'RBS']
# POS tags that _verbs treats as verbs.
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
# Words that _verbs skips.
# NOTE(review): these entries contain spaces, while _verbs compares them with
# single tagged words — the filter likely never matches; confirm the intent.
verbs_remove = ['to be', 'to do', 'to have']
#nltk.help.upenn_tagset()

In [7]:
def _make_int_array():
    return array.array(str("i"))

In [8]:
def _ngrams(tokens):
    o_tokens = tokens
    tokens = []
    n_o_tokens = len(o_tokens)
    
    for n in range(2,4):
        for j in range(n_o_tokens - n + 1):
            yield ' '.join(o_tokens[j:j+n])

In [9]:
def _couples(tokens):
    length = len(tokens)
    for i in range(length-1):
        for j in range(i+1, length):
            yield 'c_{} {}'.format(tokens[i], tokens[j])

In [10]:
def _adverbs(pos_tags):
    """Yield ``'adv_<word>'`` for each (word, tag) pair whose tag is in ``adverbs``.

    If no pair qualifies, the single marker ``'no_adverbs'`` is yielded
    instead.
    """
    found_any = False
    for word, tag in pos_tags:
        if tag not in adverbs:
            continue
        found_any = True
        yield 'adv_' + word

    if not found_any:
        yield 'no_adverbs'

In [11]:
def _verbs(pos_tags):
    """Yield ``'v_<word>'`` for each (word, tag) pair tagged as a verb.

    A pair qualifies when its tag is in ``verbs`` and its word is not in
    ``verbs_remove``. If no pair qualifies, the single marker ``'no_verbs'``
    is yielded instead.
    """
    found_any = False
    for word, tag in pos_tags:
        # NOTE(review): verbs_remove holds multi-word entries ('to be', ...)
        # while `word` is presumably a single token, so this exclusion likely
        # never fires — confirm the intended filter.
        if tag not in verbs or word in verbs_remove:
            continue
        found_any = True
        yield 'v_' + word

    if not found_any:
        yield 'no_verbs'

In [12]:
def _modal_aux(pos_tags):
    for word, tag in pos_tags:
        if tag == 'MD':
            return 'MD_1', 1
    return 'MD_1', 0

In [13]:
def _sen_len(tokens):
    yield 's_len', len(tokens)

In [14]:
def _avg_word_len(tokens):
    c_len = 0
    for token in tokens:
        c_len += len(token)
    yield 'avg_word_len', round(c_len/len(tokens))

In [15]:
def _punc_data(s):
    punc_list = list(filter(lambda c: c in s, string.punctuation))
    punc_len = len(punc_list)
    yield 'punc_len', punc_len
    single_punc = []
    i = 0
    while i < punc_len:
        single_punc.append(punc_list[i])
        last = punc_list[i]
        i += 1
        while(i < punc_len and punc_list[i] == last):
            i += 1
            
    yield ''.join(single_punc), 1

In [16]:
def initialize_new_features():
    """Return an empty, self-numbering feature vocabulary.

    The result is a ``defaultdict`` whose default factory is its own
    ``__len__``: looking up an unseen key stores and returns the number of
    keys held so far, so keys get consecutive indices 0, 1, 2, ...
    """
    vocabulary = defaultdict(None)
    vocabulary.default_factory = vocabulary.__len__
    return vocabulary

In [17]:
def _feature_dict(features_, sentence, pos_tags, fixed_features):
    """Map one sentence to a sparse ``{feature index: value}`` dictionary.

    Parameters
    ----------
    features_ : mapping of feature name -> index
        The vocabulary. When ``fixed_features`` is false it is indexed
        directly for every feature name, so a vocabulary created by
        ``initialize_new_features`` grows as new names are seen.
    sentence : str
        Raw sentence; it is tokenized here with ``nltk.word_tokenize``.
    pos_tags : iterable of (word, tag)
        POS tags for the sentence, supplied by the caller.
    fixed_features : bool
        When true, names missing from ``features_`` are ignored and the
        vocabulary is left untouched.

    Returns
    -------
    dict
        Empty when the sentence has two tokens or fewer. Otherwise it holds
        the valued features (modal flag, sentence length, average word
        length, punctuation features) plus occurrence counts for adverbs,
        verbs, tokens, n-grams and token couples.
    """
    def feature_index(name):
        # A membership test never triggers a defaultdict's default_factory,
        # so unknown names can be rejected without copying or growing the
        # vocabulary (the old code copied the whole mapping per sentence).
        if fixed_features and name not in features_:
            return None
        return features_[name]

    feature_dict = {}
    tokens = nltk.word_tokenize(sentence)
    if len(tokens) <= 2:
        return feature_dict

    # Features that carry an explicit value; the value overwrites the slot.
    valued_features = itertools.chain(
        [_modal_aux(pos_tags)],
        _sen_len(tokens),
        _avg_word_len(tokens),
        _punc_data(sentence),
    )
    for name, value in valued_features:
        if name == '':
            # _punc_data yields an empty name when there is no punctuation.
            continue
        index = feature_index(name)
        if index is not None:
            feature_dict[index] = value

    # Features that are counted: one increment per occurrence.
    counted_features = itertools.chain(
        _adverbs(pos_tags),
        _verbs(pos_tags),
        tokens,
        _ngrams(tokens),
        _couples(tokens),
    )
    for name in counted_features:
        index = feature_index(name)
        if index is not None:
            feature_dict[index] = feature_dict.get(index, 0) + 1

    return feature_dict

In [18]:
def get_features(features_, feature_dict):
    """Expand a sparse ``{index: value}`` dict into a dense ``(1, n)`` row.

    ``n`` is ``len(features_)``; positions absent from ``feature_dict`` are 0.
    """
    dense = np.zeros(len(features_))
    for index, value in feature_dict.items():
        dense[index] = value

    return dense.reshape(1, -1)

In [19]:
def save_features(features_):
    """Write the vocabulary to ``features_argument_classifier.txt``.

    One ``<name>:-<index>`` line is written per feature, in the mapping's
    iteration order. The file is opened in a ``with`` block so it is closed
    (and its buffer flushed) even if writing an entry raises.
    """
    with open('features_argument_classifier.txt', 'w') as handle:
        for key in features_:
            handle.write(key + ":-" + str(features_[key]) + "\n")

In [20]:
def load_features(file='features_argument_classifier.txt'):
    """Read a vocabulary written as ``<name>:-<index>`` lines.

    Each line is split at its LAST ``":-"``, so feature names that themselves
    contain ``":-"`` are kept. Splitting at every occurrence dropped them,
    after which the ``len``-based default factory could hand out indices that
    were already in use. Lines without the separator are skipped.

    Returns
    -------
    defaultdict
        Maps name -> int index; its default factory is its own ``__len__``,
        matching ``initialize_new_features``.

    Raises
    ------
    ValueError
        If the text after the separator is not an integer.
    """
    features_ = defaultdict()
    with open(file, 'r') as handle:
        for line in handle.read().splitlines():
            key, separator, index = line.rpartition(":-")
            if separator:
                features_[key] = int(index)

    features_.default_factory = features_.__len__
    return features_

In [21]:
def fit_transform(dataFrame, pos_dataFrame, features_=None, fixed_features=False):
    """Vectorize every sentence of ``dataFrame`` into a sparse feature matrix.

    Parameters
    ----------
    dataFrame, pos_dataFrame : pandas.DataFrame
        Frames with one column per entry of the module-level ``headings``;
        each cell holds a list of sentences / of POS-tag lists. Both frames
        must be aligned row by row and sentence by sentence.
    features_ : mapping of feature name -> index, optional
        Vocabulary to use. When omitted, a fresh one is created for this call.
        (The old default was built once at definition time and shared by
        every call, so vocabularies leaked from one fit into the next.)
    fixed_features : bool
        Passed to ``_feature_dict``; when true the vocabulary is not extended.

    Returns
    -------
    (X, targets, features_)
        ``X`` is a CSR matrix with one row per sentence that produced at
        least one feature, ``targets`` holds the position in ``headings`` of
        the column each row came from, and ``features_`` is the vocabulary.
    """
    if features_ is None:
        features_ = initialize_new_features()

    j_indices = []
    indptr = _make_int_array()
    values = []
    indptr.append(0)
    targets = []
    for label, heading in enumerate(headings):
        # Walk both frames positionally so that row labels never matter.
        for content, pos_content in zip(dataFrame[heading], pos_dataFrame[heading]):
            for sentence, pos_tags in zip(content, pos_content):
                feature_dict = _feature_dict(features_, sentence, pos_tags, fixed_features)
                # Sentences without features (e.g. too short) get no row.
                if len(feature_dict) != 0:
                    targets.append(label)
                    j_indices.extend(feature_dict.keys())
                    values.extend(feature_dict.values())
                    indptr.append(len(j_indices))

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    targets = np.asarray(targets, dtype=np.intc)
    X = csr_matrix((values, j_indices, indptr), shape = (len(indptr) - 1, len(features_)), dtype=np.int64)
    return X, targets, features_

In [22]:
# Vectorize the whole corpus; features_ is the vocabulary built along the way.
X, targets, features_ = fit_transform(df, pos_df)

In [23]:
# Fit a multinomial naive Bayes model on the full feature matrix.
classifier_mul = MultinomialNB()
classifier_mul.fit(X, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Fit a logistic-regression model (saved under the 'maxent' type below).
classifier_maxent = LogisticRegression()
classifier_maxent.fit(X, targets)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Persist the vocabulary, then read it back from the same default file.
save_features(features_)
features_try = load_features()

### Save classifier state

In [35]:
def save_classifier(classifier, file='arguent_classifier_multinomialNB.pickle', type_classifier='multinomialNB'):
    """Pickle ``classifier`` to disk.

    Parameters
    ----------
    classifier : object
        Any picklable object.
    file : str
        Destination path; used only when ``type_classifier`` is
        ``'multinomialNB'``.
    type_classifier : {'multinomialNB', 'maxent'}
        ``'maxent'`` always writes to ``'arguent_classifier_maxent.pickle'``.
        NOTE(review): ``file`` is ignored in that case — kept as is for
        compatibility with ``load_classifier``.

    Raises
    ------
    ValueError
        If ``type_classifier`` is not one of the supported values
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if type_classifier == 'multinomialNB':
        path = file
    elif type_classifier == 'maxent':
        path = 'arguent_classifier_maxent.pickle'
    else:
        raise ValueError('unknown type_classifier: {!r}'.format(type_classifier))

    # The with-block closes the file even if pickling fails.
    with open(path, 'wb') as handle:
        pickle.dump(classifier, handle)

In [36]:
def load_classifier(file='arguent_classifier_multinomialNB.pickle', type_classifier='multinomialNB'):
    """Unpickle and return a classifier saved by ``save_classifier``.

    Parameters
    ----------
    file : str
        Source path; used only when ``type_classifier`` is
        ``'multinomialNB'``.
    type_classifier : {'multinomialNB', 'maxent'}
        ``'maxent'`` always reads ``'arguent_classifier_maxent.pickle'``.

    Raises
    ------
    ValueError
        If ``type_classifier`` is not one of the supported values
        (previously this surfaced as an ``UnboundLocalError``).
    """
    if type_classifier == 'multinomialNB':
        path = file
    elif type_classifier == 'maxent':
        path = 'arguent_classifier_maxent.pickle'
    else:
        raise ValueError('unknown type_classifier: {!r}'.format(type_classifier))

    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook wrote itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

In [45]:
# j_indices = []
# values = []
# indptr = _make_int_array()
# indptr.append(0)

# feature_dict = _feature_dict('Indonesia cannot afford to become a haven for an Islamic radicalism which would wreck the economy and tear the archipelago apart.')
# j_indices = np.asarray(j_indices, dtype=np.intc)
# indptr = np.frombuffer(indptr, dtype=np.intc)
# targets = np.asarray(targets, dtype=np.intc)

# Classify one hand-picked sentence using the vocabulary built above.
sentence = 'We now know that life-prohibiting universes are vastly more probable than life- permitting universes like ours.'
features = get_features(features_, _feature_dict(features_, sentence, get_pos_tags(sentence), fixed_features=True))
# `classifier` was never defined at notebook level, so this cell failed on a
# fresh kernel. NOTE(review): classifier_mul is assumed here; switch to
# classifier_maxent if that was the model originally meant.
print(classifier_mul.predict(features))

[1]


In [37]:
# Persist both fitted models; each type is written to its own default file.
save_classifier(classifier_mul)
save_classifier(classifier_maxent, type_classifier='maxent')

In [30]:
def kFoldTest(data, pos_data, classifierT = 'multinomialNB'):
    """Run 4-fold cross-validation over documents and print averaged scores.

    Parameters
    ----------
    data, pos_data : pandas.DataFrame
        Sentence frame and matching POS-tag frame (rows are documents).
    classifierT : {'multinomialNB', 'maxent'}
        Which model to train in every fold.

    Prints the train/test matrix shapes of each fold, then the mean F1,
    precision and recall over the folds and the summed confusion matrix.

    Raises
    ------
    ValueError
        If ``classifierT`` is not a supported classifier name.
    """
    classifier_factories = {
        'multinomialNB': MultinomialNB,
        'maxent': LogisticRegression,
    }
    if classifierT not in classifier_factories:
        raise ValueError('unknown classifierT: {!r}'.format(classifierT))

    kFold = KFold(n = len(data), n_folds = 4)
    scores = []
    p_scores = []
    r_scores = []
    confusionMatrix = np.zeros((2, 2), dtype=int)
    for train_indices, test_indices in kFold:
        train_data = data.iloc[train_indices]
        test_data = data.iloc[test_indices]
        train_pos_data = pos_data.iloc[train_indices]
        test_pos_data = pos_data.iloc[test_indices]

        classifier = classifier_factories[classifierT]()

        # Every fold gets its own vocabulary so that features seen in other
        # folds (or in earlier fits) cannot leak into this one.
        train_X, train_y, features_ = fit_transform(
            train_data, train_pos_data, features_=initialize_new_features())
        classifier.fit(train_X, train_y)

        print(train_X.shape)

        # fit_transform returns three values; the vocabulary is unchanged
        # here because fixed_features=True.
        test_X, test_y, _ = fit_transform(
            test_data, test_pos_data, features_=features_, fixed_features=True)

        print(test_X.shape)

        predictions = classifier.predict(test_X)

        # labels=[0, 1] keeps the matrix 2x2 even if a fold lacks a class.
        confusionMatrix += confusion_matrix(test_y, predictions, labels=[0, 1])
        score = f1_score(test_y, predictions, average='binary')
        p_score = precision_score(test_y, predictions, average='binary')
        r_score = recall_score(test_y, predictions, average='binary')

        p_scores.append(p_score)
        r_scores.append(r_score)
        scores.append(score)

    print('Score:', sum(scores)/len(scores))
    print('Precision Score:', sum(p_scores)/len(p_scores))
    print('Recall Score:', sum(r_scores)/len(r_scores))
    print('Confusion matrix:')
    print(confusionMatrix)

In [60]:
# Cross-validate the multinomial naive Bayes model.
kFoldTest(df, pos_df, 'multinomialNB')

(4978, 583323)
(1565, 583323)
(4861, 580952)
(1682, 580952)
(4868, 563002)
(1675, 563002)
(4922, 573279)
(1621, 573279)
Score: 0.73806027188
Precision Score: 0.767051730881
Recall Score: 0.71167835531
Confusion matrix:
[[3484  543]
 [ 727 1789]]


In [70]:
# Cross-validate the logistic-regression model. pos_df is a required argument
# of kFoldTest and was missing from this call.
kFoldTest(df, pos_df, classifierT='maxent')

(4978, 583322)
(1565, 583322)
(4861, 580951)
(1682, 580951)
(4868, 563002)
(1675, 563002)
(4922, 573278)
(1621, 573278)
Score: 0.825608135503
Precision Score: 0.827634422406
Recall Score: 0.823856576752
Confusion matrix:
[[3596  431]
 [ 444 2072]]


### Using this classifier on another dataset

In [50]:
# sources = ['data/schemes/txt']
def classify_data(sources, features_, classifer):
    """Classify every sentence of every file found under ``sources``.

    Parameters
    ----------
    sources : iterable of str
        Directories that are walked recursively.
    features_ : mapping of feature name -> index
        Vocabulary used to vectorize sentences; it is not extended.
    classifer : fitted classifier
        Object with a ``predict`` method. (The parameter name is misspelled
        but kept so keyword callers do not break.)

    Returns
    -------
    pandas.DataFrame
        One row per file, indexed by ``source + "/" + file_name``; each cell
        is a ``[sentence, prediction]`` pair.
    """
    # The body used to read an undefined/global `classifier` instead of the
    # argument; bind the argument to that name explicitly.
    classifier = classifer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    index = []
    data = []
    for source in sources:
        for root, dir_names, file_names in os.walk(source):
            for file_name in file_names:
                path = os.path.join(root, file_name)
                # Default encoding first, windows-1252 if the bytes do not
                # decode; the with-blocks make sure the files are closed.
                try:
                    with open(path) as handle:
                        text = handle.read()
                except UnicodeDecodeError:
                    with open(path, encoding='windows-1252') as handle:
                        text = handle.read()
                content = tokenizer.tokenize(text)

                lines = []
                for line in content:
                    feature_map = _feature_dict(features_, line, get_pos_tags(line), fixed_features=True)
                    result = classifier.predict(get_features(features_, feature_map))
                    lines.append([line, result])

                index.append(source + "/" + file_name)
                data.append(lines)

    df = pandas.DataFrame(index=index, data=data)
    return df

In [53]:
# Here each source is a single directory of text files (no JSON annotations).
sources = ['data/schemes/txt']
# Uses the vocabulary and the multinomialNB model saved to disk earlier.
classify_data(sources, load_features(), load_classifier())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
data/schemes/txt/nodeset1724.txt,"[Claire Fox:\tI understand that., [1]]",[I suppose my concern is just this: I\nwant th...,"[Thank you.” \n\nAnd so, if you\nwant the mora...","[There’s no discipline\nthere., [1]]","[In some ways you need that discipline, don’t ...",[Nick Dearden:\tIn some ways I agree with you....,"[If you want the economy\nto run smoothly, you...","[So, for example in\nSouth Korea, in terms of ...","[On the other hand,\nI think what people don’t...",[But\nit certainly isn’t the debts of the peop...,...,"[That’s my point., [1]]","[And then finally,\nif I could just ask you, t...",[The rich are the bad guys.”\n\nIt is the case...,[Isn’t\nthe problem that the only thing the ba...,"[What they should have been doing, what they\n...",[They should actually be taking more risk and ...,"[Nick Dearden:\tFor productive investments., [1]]","[I agree with you., [1]]",,
data/schemes/txt/nodeset1627.txt,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nClifford Lo...,"[Aren’t the people you\nrepresent, nowadays ca...",[And isn’t the reason Dante confined them to t...,"[John Lamiday: So, lending money at interest i...","[Well, then that takes away most of the abilit...",,,,,,...,,,,,,,,,,
data/schemes/txt/nodeset1629.txt,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nClifford\nL...,[Only a desperate person would agree to\npay t...,"[So this activity is, in fact, taking a very u...","[John\nLamiday: Oh, I don’t think\nthat’s true...","[You don’t pay an interest rate., [0]]",[The interest rate varies by\nthe amount of th...,"[So if you borrow some money from me, and you ...",[But the amount it costs you is exactly the sa...,,,...,,,,,,,,,,
data/schemes/txt/nodeset1716.txt,"[Well, at the moment the government has decide...",[Because the economy is in such a terrible sta...,"[Now, fair enough,\nin a crisis., [1]]",[But the problem is that’s been the policy for...,,,,,,,...,,,,,,,,,,
data/schemes/txt/nodeset1745.txt,"[Claire Fox:\tYes, but I'm suggesting that tha...",[All I'm saying is that I think\nthe people wh...,"[When you think\nabout it now, I actually find...",[And people sort of\nassume like everything ca...,"[“Well, do we have to have\ncuts?, [1]]","[Do we have to have this?, [0]]",[Do we have to have that?” And you do\nactuall...,,,,...,,,,,,,,,,
data/schemes/txt/nodeset1711.txt,"[I dare say they do., [0]]","[But if you were a Martian, and you descended ...",[Have you\nlooked at what interest rates are a...,[Nick Dearden:\tBut how can you say that when ...,"[And indeed the debts that are owed now, by us...",[There is absolute impunity for lenders\nat th...,"[Michael Portillo:\tWell, you say the banks ar...","[I mean everyone who\nputs money into a bank, ...",[I mean you just\nheard Simon Rose say that th...,[So if you're going to raise your banner for\n...,...,,,,,,,,,,
data/schemes/txt/nodeset1741.txt,"[Matthew Taylor:\tBut, hold on., [1]]",[Isn’t it simply the case that buying\na cup o...,[Therefore isn’t it a\nreasonable thing for th...,[Jamie Whyte:\tIt’s only since the state inter...,"[In the old\ndays of early capitalism, bankers...",[And\nthey went to great lengths to advertise ...,"[This stopped not\nbecause of the big bang, wh...",[It stopped because of government guarantees t...,,,...,,,,,,,,,,
data/schemes/txt/nodeset5975.txt,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<span class...,"[<span class=""highlighted"" id=""node16"">That se...","[<span class=""highlighted"" id=""node25"">It\nrai...","[<span class=""highlighted"" id=""node40"">And thi...",[</span>RC] [because of the next sentence]<spa...,"[<span class=""highlighted"" id=""node67"">I think...","[<span class=""highlighted"" id=""node92"">It’s\nb...",,,,...,,,,,,,,,,
data/schemes/txt/nodeset4706.txt,[http://www.theonering.net/torwp/2015/05/16/98...,,,,,,,,,,...,,,,,,,,,,
data/schemes/txt/nodeset1635.txt,[\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMatthew\nTa...,"[Michael\nBuerk: Let him finish Matthew,\nplea...","[John\nLamiday: Well, I can tell you\nwhy: bec...","[That led to new regulations in\n2004, new reg...",[So these aspects have been looked at in\nhuge...,"[So what have you missed?\n\n, [1]]",,,,,...,,,,,,,,,,


In [3]:
# NOTE(review): earlier cells load 'tokenizers/punkt/english.pickle' and call
# nltk.pos_tag, so the NLTK data presumably has to be downloaded before they
# run on a fresh environment — consider moving this cell to the top.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True