In [1]:
import spacy
import corenlp
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize,wordpunct_tokenize,word_tokenize,RegexpTokenizer,TweetTokenizer
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os,math,glob,re

from collections import Counter
import re as regex
import contractions
import copy

from itertools import chain

nlp = spacy.load('en_core_web_sm')
nlp.max_length=9999999



### Pre-Processing

In [2]:
def contract(text):
    return contractions.fix(text)

def regTokenize(text):
    tok=RegexpTokenizer('[A-Za-z0-9]*[.]?\w+')
    return tok.tokenize(text) 

def lowercase(text):
    return text.lower()

def lemma(words):
    for i in range(0,len(words)):
        words[i]=WordNetLemmatizer().lemmatize(words[i])
    return words

def stemming(words):
    porter_stemmer=PorterStemmer()
    for i in range(0,len(words)):
        words[i]=porter_stemmer.stem(words[i])
    return words

def tweet(words):
    tok=TweetTokenizer()
    return tok.tokenize(words)

def comma(text):
    text = "".join(c for c in text if c not in ('!','.',':',',','"','?','(',')'))
    return text

def getBasicNorm(text):
    for i in range(0,len(text)):
        text[i]=contract(text[i])
        text[i]=lowercase(text[i])
    return text

### Feature Extraction functions

In [3]:
def getWholeString(data):
    string=''
    for i in data:
        string+=i[0]+' '
    return string

def getNGrams(corpusSentence):
    
    corpusSentence=contract(corpusSentence)
    corpusSentence=lowercase(corpusSentence)
    tokenized=regTokenize(corpusSentence)
    
    uni=list(ngrams(tokenized,1))
    bigram=list(ngrams(tokenized,2))
    trigram=list(ngrams(tokenized,3))

    print('uni:',len(uni),' bi:',len(bigram),'tri:',len(trigram))
    grams=[uni,bigram,trigram]
    
    return grams


def postagging(corpusSentence):
    
    corpusSentence=contract(corpusSentence)
    corpusSentence=lowercase(corpusSentence)
#     tokenized=regTokenize(corpusSentence)
    
    doc = nlp(corpusSentence)
#     for i in range(0,5):
#         print(doc[i].pos_,doc[i].lemma_)
    
    return doc

def posPattern(corpusSentence):
    doc = nlp(corpusSentence)
    return doc


# def namedEntities(corpusSentence):
#     corpusSentence=contract(corpusSentence)
#     corpusSentence=lowercase(corpusSentence)
# #     tokenized=regTokenize(corpusSentence)
    
#     doc = nlp(corpusSentence)
# #     for i in range(0,5):
# #         print(doc[i].pos_,doc[i].lemma_)
#     return doc


# -----------------------------------------POS Pattern-----------------------------------------
def getPosPattern(data_sentences):
    sentences_pos=[]
    for i in data_sentences:
        sent=[]
        s=postagging(i)
        for j in s:
            sent.append(j.pos_)
        sentences_pos.append(sent)
    return sentences_pos

def posGrams(sentence):
    
    uni=list(ngrams(sentence,1))
    bigram=list(ngrams(sentence,2))
    trigram=list(ngrams(sentence,3))
    
#     print('uni:',len(uni),' bi:',len(bigram),'tri:',len(trigram))
    grams=[uni,bigram,trigram]
    
    return grams
    
def get34grams(sentences_pos):
    
    three_four_grams=[]
    
    for sent_pos in sentences_pos:
        temp=posGrams(sent_pos)
        temp = list(chain(*temp))
#         print(temp)
        three_four_grams.append(temp)
    
    return three_four_grams

### Dataset Loading

In [4]:
dataset=joblib.load('dataset_with_labels.sav')

print(len(dataset),' sentences')

# print(np.array(dataset)[:,0])
X=np.array(dataset)[:,0] # sentences
Y=np.array(dataset)[:,1] # labels
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

print(len(X_train),' ',len(X_test),' ',len(y_train),' ',len(y_test))

11112  sentences
8889   2223   8889   2223


In [10]:
dataset_fact=[]
dataset_nonfact=[]

for i in dataset:
    if(i[1]==1):
        dataset_fact.append(i)
    else:
        dataset_nonfact.append(i)

print(len(dataset_fact),' ' ,len(dataset_nonfact))


4989   6123


In [26]:
X_facts=np.array(dataset_fact)[:,0]
X_nonfacts=np.array(dataset_nonfact)[:,0]

In [7]:
# for i in dataset_fact:
#     print(i)

In [9]:
# for i in dataset_nonfact:
#     print(i)

### getting wholeSentences for wholedataset, facts, nonfacts as one sentence

In [19]:
sent_as_string=getWholeString(dataset)

facts_as_string=getWholeString(dataset_fact)
nonfacts_as_string=getWholeString(dataset_nonfact)

print(len(sent_as_string) ,' ', len(facts_as_string),' ',len(nonfacts_as_string))

1635553   663169   972384


### getting 'nGrams' for 'facts and nonfacts sentences' and converting to 'set'

In [21]:
facts_n_grams=getNGrams(facts_as_string)
print(len(facts_n_grams))

facts_wholeGrams = list(chain(*facts_n_grams))
print(len(facts_wholeGrams))

facts_wholeGrams_list_set=list(set(facts_wholeGrams))
print(len(facts_wholeGrams_list_set))

# --------------------------------------------------------------------

nonfacts_n_grams=getNGrams(nonfacts_as_string)
print(len(nonfacts_n_grams))

nonfacts_wholeGrams = list(chain(*nonfacts_n_grams))
print(len(nonfacts_wholeGrams))

nonfacts_wholeGrams_list_set=list(set(nonfacts_wholeGrams))
print(len(nonfacts_wholeGrams_list_set))

uni: 108851  bi: 108850 tri: 108849
3
326550
170942
uni: 158694  bi: 158693 tri: 158692
3
476079
225880


In [35]:
grams_toVector=['facts_unigram','facts_bigram','facts_trigram','nonfacts_unigram','nonfacts_bigram','nonfacts_trigram']

### getting pos tags for 'facts and nonfacts sentences' and convert into 'set'

In [22]:
pos=postagging(sent_as_string)

print(len(pos))
pos_tag=[]

for i in pos:
    pos_tag.append(i.pos_)
pos_tag_list_set=list(set(pos_tag))
print(len(pos_tag_list_set))

309474
16


### getting 'named entities' for 'facts and nonfacts sentences' and convert into 'set'

In [23]:
# for i in pos.ents:
#     print(i.text, i.start_char, i.end_char, i.label_)

named_entity=[]
for i in pos.ents:
    named_entity.append(i.label_)

named_entity_list_set=set(named_entity)
print(len(named_entity),'-->', len(named_entity_list_set))

15024 --> 17


### getting 'POS Pattern' and getting '3,4-grams' for each sentence and convert into 'set'

In [27]:
wholeX=copy.deepcopy(X)
print(len(wholeX))

wholeX=getBasicNorm(wholeX)
# ----------------------------------------------------


wholeXFacts=copy.deepcopy(X_facts)
print(len(wholeXFacts))

wholeXFacts=getBasicNorm(wholeXFacts)

# ----------------------------------------------------
wholeXNonFacts=copy.deepcopy(X_nonfacts)
print(len(wholeXNonFacts))

wholeXNonFacts=getBasicNorm(wholeXNonFacts)


11112
4989
6123


In [28]:
# posPattern=getPosPattern(wholeX)
# ----------------------------------------------------

facts_posPattern=getPosPattern(wholeXFacts)
# ----------------------------------------------------

nonfacts_posPattern=getPosPattern(wholeXNonFacts)


In [29]:
# for i in posPattern:
#     print(i)
#     break
# print(len(posPattern))

# ----------------------------------------------------

for i in facts_posPattern:
    print(i)
    break
print(len(facts_posPattern))
# ----------------------------------------------------

for i in nonfacts_posPattern:
    print(i)
    break
print(len(nonfacts_posPattern))

['ADP', 'NUM', 'CCONJ', 'NUM', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'VERB', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADV', 'PUNCT', 'ADP', 'ADV', 'NUM', 'PART', 'NUM', 'NOUN', 'ADP', 'NOUN', 'VERB', 'PUNCT', 'VERB', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'PUNCT']
4989
['ADJ', 'VERB', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'VERB', 'VERB', 'ADJ', 'ADV', 'PUNCT']
6123


In [31]:
# pos_pattern_3_4=get34grams(posPattern)
# print(len(pos_pattern_3_4))

# pos_pattern_3_4_list=list(chain(*pos_pattern_3_4))
# print(len(pos_pattern_3_4_list))

# pos_pattern_3_4_list_set=list(set(pos_pattern_3_4_list))
# print(len(pos_pattern_3_4_list_set))

# ----------------------------------------------------


facts_pos_pattern_3_4=get34grams(facts_posPattern)
print(len(facts_pos_pattern_3_4))

facts_pos_pattern_3_4_list=list(chain(*facts_pos_pattern_3_4))
print(len(facts_pos_pattern_3_4_list))

facts_pos_pattern_3_4_list_set=list(set(facts_pos_pattern_3_4_list))
print(len(facts_pos_pattern_3_4_list_set),'\n')

# ----------------------------------------------------

nonfacts_pos_pattern_3_4=get34grams(nonfacts_posPattern)
print(len(nonfacts_pos_pattern_3_4))

nonfacts_pos_pattern_3_4_list=list(chain(*nonfacts_pos_pattern_3_4))
print(len(nonfacts_pos_pattern_3_4_list))

nonfacts_pos_pattern_3_4_list_set=list(set(nonfacts_pos_pattern_3_4_list))
print(len(nonfacts_pos_pattern_3_4_list_set))

4989
362975
2051 

6123
532125
2103


In [36]:
pos_toVector=['facts_posPattern_3gram','facts_posPattern_4gram','nonfacts_posPattern_3gram','nonfacts_posPattern_4gram',]

### Sentiment Extractor

In [39]:
sentiment_scores=['_Sentiment_Feature_']


### TPattern

In [51]:
Tcases=[]
for i in range(1,6):
    cases='_case_'+str(i)+'_'
    Tcases.append(cases)
print(Tcases)

['_case_1_', '_case_2_', '_case_3_', '_case_4_', '_case_5_']


### ASPattern

### Adding into 'Vector'

In [52]:
vector=[]

# Ngrams features
# for i in wholeGrams_list_set:
#     vector.append(i)
# print('wholeGrams_list_set:',len(vector))

for i in grams_toVector:
    vector.append(i)
print('grams_toVector:',len(vector))

# ----------------------------------------------------

# pos Tagging features
for i in pos_tag_list_set:
    vector.append(i)
print('pos_tag_list_set:',len(vector))

# ----------------------------------------------------

# Named Entities features
for i in named_entity_list_set:
    vector.append(i)
print('named_entity_list_set:',len(vector))

# ----------------------------------------------------

# posPattern features
# for i in pos_pattern_3_4_list_set:
#     vector.append(i)
# print('pos_pattern_3_4_list_set:',len(vector))

for i in pos_toVector:
    vector.append(i)
print('pos_toVector:',len(vector))

# ----------------------------------------------------

# Tpattern cases features
for i in Tcases:
    vector.append(i)
print('Tpattern:',len(vector))

# ----------------------------------------------------

# Sentiment Scores features
for i in sentiment_scores:
    vector.append(i)
print('sentiment_scores:',len(vector))

grams_toVector: 6
pos_tag_list_set: 22
named_entity_list_set: 39
pos_toVector: 43
Tpattern: 48
sentiment_scores: 49


In [53]:
print(vector)

['facts_unigram', 'facts_bigram', 'facts_trigram', 'nonfacts_unigram', 'nonfacts_bigram', 'nonfacts_trigram', 'ADV', 'CCONJ', 'VERB', 'NOUN', 'ADJ', 'X', 'ADP', 'NUM', 'PUNCT', 'PROPN', 'INTJ', 'DET', 'PART', 'PRON', 'SYM', 'SPACE', 'PERCENT', 'ORG', 'NORP', 'LAW', 'QUANTITY', 'DATE', 'MONEY', 'FAC', 'EVENT', 'TIME', 'PRODUCT', 'LANGUAGE', 'LOC', 'CARDINAL', 'ORDINAL', 'GPE', 'PERSON', 'facts_posPattern_3gram', 'facts_posPattern_4gram', 'nonfacts_posPattern_3gram', 'nonfacts_posPattern_4gram', '_case_1_', '_case_2_', '_case_3_', '_case_4_', '_case_5_', '_Sentiment_Feature_']


In [54]:
vector_file='vector_dimensions_reduced.sav'
joblib.dump(vector,vector_file)

['vector_dimensions_reduced.sav']

In [55]:
vector_load=joblib.load('vector_dimensions_reduced.sav')

In [56]:
len(vector_load)

49

In [57]:
for i in vector_load:
    print(i)

facts_unigram
facts_bigram
facts_trigram
nonfacts_unigram
nonfacts_bigram
nonfacts_trigram
ADV
CCONJ
VERB
NOUN
ADJ
X
ADP
NUM
PUNCT
PROPN
INTJ
DET
PART
PRON
SYM
SPACE
PERCENT
ORG
NORP
LAW
QUANTITY
DATE
MONEY
FAC
EVENT
TIME
PRODUCT
LANGUAGE
LOC
CARDINAL
ORDINAL
GPE
PERSON
facts_posPattern_3gram
facts_posPattern_4gram
nonfacts_posPattern_3gram
nonfacts_posPattern_4gram
_case_1_
_case_2_
_case_3_
_case_4_
_case_5_
_Sentiment_Feature_


## Feature Extraction

### Ngrams

### pos Tagging

### Named Entities

### pos Pattern

### TPattern

### Sentiment Scores

### ASpattern