In [1]:
import spacy
import corenlp
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize,wordpunct_tokenize,word_tokenize,RegexpTokenizer,TweetTokenizer
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os,math,glob,re

from collections import Counter
import re as regex
import contractions
import copy

from itertools import chain

# Load spaCy's small English pipeline; postagging() and posPattern() run their text through it.
nlp = spacy.load('en_core_web_sm')
# Raise the maximum accepted text length so the whole corpus, concatenated
# into a single string, can be passed to nlp() in one call.
nlp.max_length=9999999



### Pre-Processing

In [2]:
def contract(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def regTokenize(text):
    """Split ``text`` into tokens with NLTK's ``RegexpTokenizer``.

    A token is an optional run of ASCII letters/digits, an optional '.',
    and then one or more word characters.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    The token list produced by ``RegexpTokenizer.tokenize``.
    """
    # Raw string: in a normal literal '\w' is an invalid escape sequence,
    # which Python reports with a warning. The pattern itself is unchanged.
    tok=RegexpTokenizer(r'[A-Za-z0-9]*[.]?\w+')
    return tok.tokenize(text)

def lowercase(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

def lemma(words):
    """Lemmatize every entry of ``words`` in place with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    words : mutable sequence of str
        Tokens to lemmatize; each slot is overwritten with its lemma.

    Returns
    -------
    The same ``words`` object, after the in-place update.
    """
    # Build the lemmatizer once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    for idx, word in enumerate(words):
        words[idx] = lemmatizer.lemmatize(word)
    return words

def stemming(words):
    """Replace every entry of ``words`` in place by its Porter stem and return ``words``."""
    stemmer = PorterStemmer()
    for position in range(len(words)):
        stemmed = stemmer.stem(words[position])
        words[position] = stemmed
    return words

def tweet(words):
    """Tokenize ``words`` with a freshly created NLTK ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(words)

def comma(text):
    """Return ``text`` with the characters ``! . : , " ? ( )`` removed."""
    dropped = ('!','.',':',',','"','?','(',')')
    kept = filter(lambda ch: ch not in dropped, text)
    return "".join(kept)

def getBasicNorm(text):
    """Expand contractions in, and lower-case, every sentence of ``text``.

    Parameters
    ----------
    text : list or numpy.ndarray of str
        The sentences to normalise.

    Returns
    -------
    The normalised sequence. A list (or an object-dtype array) is updated in
    place and returned. A NumPy array with a fixed-width unicode dtype is
    first converted to an object-dtype copy, so in that case the input array
    itself is left untouched and the copy is returned.
    """
    # Expanding a contraction makes the sentence longer. Assigning a longer
    # string into a fixed-width '<U..' slot silently cuts it off, so switch
    # to object dtype before writing results back.
    if isinstance(text, np.ndarray) and text.dtype.kind == 'U':
        text = text.astype(object)
    for idx in range(len(text)):
        text[idx] = lowercase(contract(text[idx]))
    return text

### Feature Extraction functions

In [3]:
def getWholeString(data):
    """Concatenate the first element of every row of ``data`` into one string.

    Parameters
    ----------
    data : iterable of indexable rows
        Each row's element 0 must be a string (e.g. ``[sentence, label]``).

    Returns
    -------
    str
        Every ``row[0]`` followed by a single space, joined in order.
        An empty ``data`` gives ``''``.
    """
    # str.join builds the result in one pass instead of re-copying the
    # growing string on every iteration.
    return ''.join(row[0] + ' ' for row in data)

def getNGrams(corpusSentence):
    """Build the uni-, bi- and trigrams of a text.

    The text is contraction-expanded, lower-cased and tokenized with
    ``regTokenize`` first. The three list sizes are printed.

    Returns
    -------
    list
        ``[unigrams, bigrams, trigrams]``, each a list of token tuples.
    """
    tokens = regTokenize(lowercase(contract(corpusSentence)))

    uni, bigram, trigram = (list(ngrams(tokens, size)) for size in (1, 2, 3))

    print('uni:',len(uni),' bi:',len(bigram),'tri:',len(trigram))
    return [uni, bigram, trigram]


def postagging(corpusSentence):
    """Run the spaCy pipeline ``nlp`` over a normalised copy of the text.

    The text is contraction-expanded and lower-cased before it is parsed.
    The returned spaCy document is what callers read ``pos_`` and ``ents`` from.
    """
    normalised = lowercase(contract(corpusSentence))
    return nlp(normalised)

def posPattern(corpusSentence):
    """Run the spaCy pipeline ``nlp`` over ``corpusSentence`` as given (no normalisation)."""
    return nlp(corpusSentence)


# def namedEntities(corpusSentence):
#     corpusSentence=contract(corpusSentence)
#     corpusSentence=lowercase(corpusSentence)
# #     tokenized=regTokenize(corpusSentence)
    
#     doc = nlp(corpusSentence)
# #     for i in range(0,5):
# #         print(doc[i].pos_,doc[i].lemma_)
#     return doc


# -----------------------------------------POS Pattern-----------------------------------------
def getPosPattern(data_sentences):
    """Return, for each sentence, the list of ``pos_`` tags of its tokens.

    Each sentence is parsed with ``postagging``; the result keeps the order
    of ``data_sentences``.
    """
    return [
        [token.pos_ for token in postagging(sentence)]
        for sentence in data_sentences
    ]

def posGrams(sentence):
    """Return ``[trigrams, fourgrams]`` of ``sentence``, each as a list of tuples."""
    return [list(ngrams(sentence, size)) for size in (3, 4)]
    
def get34grams(sentences_pos):
    """Return one flat list per sentence: its 3-grams followed by its 4-grams.

    ``sentences_pos`` is an iterable of per-sentence sequences (here, POS tags);
    each one is expanded with ``posGrams``.
    """
    return [
        list(chain.from_iterable(posGrams(sent_pos)))
        for sent_pos in sentences_pos
    ]

### Dataset Loading

In [4]:
# NOTE: joblib.load unpickles the file, so only load a dataset file you trust.
dataset=joblib.load('dataset_with_labels.sav')

print(len(dataset),' sentences')

# Convert the rows to an array once and slice both columns from it.
dataset_array=np.array(dataset)
X=dataset_array[:,0] # sentences
Y=dataset_array[:,1] # labels

# Fixed random_state keeps the 80/20 split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=20)

print(len(X_train),' ',len(X_test),' ',len(y_train),' ',len(y_test))

11112  sentences
8889   2223   8889   2223


In [9]:
# Rows of the raw dataset whose label equals 1 are facts; all others are non-facts.
dataset_fact=[row for row in dataset if row[1]==1]
dataset_nonfact=[row for row in dataset if not row[1]==1]

print(len(dataset_fact),' ' ,len(dataset_nonfact))

# Same split for the training part. y_train comes from the array built from
# the dataset, where the labels are compared as the string '1'.
# Every entry is a [sentence, label] pair.
X_train_facts=[[sentence,1] for sentence,label in zip(X_train,y_train) if label=='1']
X_train_nonfacts=[[sentence,0] for sentence,label in zip(X_train,y_train) if not label=='1']

print(len(X_train_facts),' ',len(X_train_nonfacts))
        

4989   6123
3967   4922


In [10]:
# Column 0 of every [sentence, label] row, i.e. only the sentence text.
X_facts=np.array(dataset_fact)[:,0]
X_nonfacts=np.array(dataset_nonfact)[:,0]


### Concatenating the sentences of the whole dataset, the facts, and the non-facts into one string each

In [11]:
# Whole dataset as one string
sent_as_string=getWholeString(dataset)

# Facts / non-facts of the whole dataset
facts_as_string=getWholeString(dataset_fact)
nonfacts_as_string=getWholeString(dataset_nonfact)

# Facts / non-facts of the training split only
train_facts_as_string=getWholeString(X_train_facts)
train_nonfacts_as_string=getWholeString(X_train_nonfacts)

# Character counts of the five strings
print(
    len(sent_as_string) ,' ',
    len(facts_as_string),' ',
    len(nonfacts_as_string),' ',
    len(train_facts_as_string),' ',
    len(train_nonfacts_as_string),
)


1635553   663169   972384   528863   783790


### Getting the n-grams of the fact and non-fact sentences and de-duplicating them via 'set'

In [15]:
# For each corpus string: build [unigrams, bigrams, trigrams], then
# de-duplicate every level through a set and print the three sizes.
facts_n_grams = getNGrams(facts_as_string)
facts_unigram, facts_bigram, facts_trigram = (list(set(grams)) for grams in facts_n_grams)
print(len(facts_unigram), ' ', len(facts_bigram), ' ', len(facts_trigram),'\n===================')

nonfacts_n_grams = getNGrams(nonfacts_as_string)
nonfacts_unigram, nonfacts_bigram, nonfacts_trigram = (list(set(grams)) for grams in nonfacts_n_grams)
print(len(nonfacts_unigram), ' ', len(nonfacts_bigram), ' ', len(nonfacts_trigram),'\n===================')
# --------------------------------------------------------------------
# Same again, restricted to the training split.

train_facts_n_grams = getNGrams(train_facts_as_string)
train_facts_unigram, train_facts_bigram, train_facts_trigram = (list(set(grams)) for grams in train_facts_n_grams)
print(len(train_facts_unigram), ' ', len(train_facts_bigram), ' ', len(train_facts_trigram),'\n===================')

train_nonfacts_n_grams = getNGrams(train_nonfacts_as_string)
train_nonfacts_unigram, train_nonfacts_bigram, train_nonfacts_trigram = (list(set(grams)) for grams in train_nonfacts_n_grams)
print(len(train_nonfacts_unigram), ' ', len(train_nonfacts_bigram), ' ', len(train_nonfacts_trigram),'\n===================')

uni: 108851  bi: 108850 tri: 108849
11292   63621   96029 
uni: 158694  bi: 158693 tri: 158692
12411   80501   132968 
uni: 86806  bi: 86805 tri: 86804
10164   53161   78045 
uni: 127875  bi: 127874 tri: 127873
11310   68124   109043 


In [17]:
# Flatten the three n-gram levels of each corpus into one list, then
# de-duplicate it and print how many distinct n-grams remain.
facts_wholeGrams = list(chain.from_iterable(facts_n_grams))
facts_wholeGrams_list_set = list(set(facts_wholeGrams))
print(len(facts_wholeGrams_list_set))

nonfacts_wholeGrams = list(chain.from_iterable(nonfacts_n_grams))
nonfacts_wholeGrams_list_set = list(set(nonfacts_wholeGrams))
print(len(nonfacts_wholeGrams_list_set))

# --------------------------------------------------------------------
# Training split only

train_facts_wholeGrams = list(chain.from_iterable(train_facts_n_grams))
train_facts_wholeGrams_list_set = list(set(train_facts_wholeGrams))
print(len(train_facts_wholeGrams_list_set))

train_nonfacts_wholeGrams = list(chain.from_iterable(train_nonfacts_n_grams))
train_nonfacts_wholeGrams_list_set = list(set(train_nonfacts_wholeGrams))
print(len(train_nonfacts_wholeGrams_list_set))

170942
225880
141370
188477


In [18]:
# Names of the six n-gram dimensions that are later appended to the feature vector.
grams_toVector = [
    'facts_unigram',
    'facts_bigram',
    'facts_trigram',
    'nonfacts_unigram',
    'nonfacts_bigram',
    'nonfacts_trigram',
]

### Getting the POS tags and de-duplicating them via 'set'

In [20]:
# Parse the whole corpus (all sentences concatenated) in a single spaCy call.
pos=postagging(sent_as_string)

print(len(pos))

# One coarse POS tag per token
pos_tag=[token.pos_ for token in pos]

# sorted() instead of list(): the iteration order of a set of strings can
# differ between interpreter runs (string hash randomisation). These tags
# become dimensions of the feature vector, so their order must be stable.
pos_tag_list_set=sorted(set(pos_tag))
print(len(pos_tag_list_set))

309474
16


### Getting the named-entity labels of the whole corpus (facts and non-facts together) and de-duplicating them via 'set'

In [None]:
# Entity label (not the entity text) of every named entity spaCy found in the corpus.
named_entity=[entity.label_ for entity in pos.ents]

# sorted() instead of list(): the iteration order of a set of strings can
# differ between interpreter runs (string hash randomisation). These labels
# become dimensions of the feature vector, so their order must be stable.
named_entity_list_set=sorted(set(named_entity))
print(len(named_entity),'-->', len(named_entity_list_set))

In [None]:
# Show the distinct named-entity labels.
print(named_entity_list_set)

### Getting the POS pattern of each sentence, extracting its 3- and 4-grams, and de-duplicating them via 'set'

In [None]:
# getBasicNorm writes its results back into the sequence it receives, so
# every call gets its own copy and X / X_facts / X_nonfacts stay unchanged.
wholeX=getBasicNorm(copy.deepcopy(X))
# ----------------------------------------------------

wholeXFacts=getBasicNorm(copy.deepcopy(X_facts))

# ----------------------------------------------------
wholeXNonFacts=getBasicNorm(copy.deepcopy(X_nonfacts))

# ----------------------------------------------------

# Training-split facts. This was previously copied from X_facts (all facts),
# which made it a duplicate of wholeXFacts. X_train_facts holds
# [sentence, label] pairs, so take the sentence of each pair; np.array
# builds a fresh array, so no extra copy is needed.
wholeXTrainFacts=getBasicNorm(np.array([pair[0] for pair in X_train_facts]))

In [None]:
# One list of POS tags per normalised fact sentence
facts_posPattern=getPosPattern(wholeXFacts)
# ----------------------------------------------------

# One list of POS tags per normalised non-fact sentence
nonfacts_posPattern=getPosPattern(wholeXNonFacts)

In [None]:
# Preview: print the first pattern (if there is one) and the number of patterns.
for first_pattern in facts_posPattern[:1]:
    print(first_pattern)
print(len(facts_posPattern))
# ----------------------------------------------------

for first_pattern in nonfacts_posPattern[:1]:
    print(first_pattern)
print(len(nonfacts_posPattern))

In [None]:
# Facts: per-sentence 3/4-grams -> one flat list -> distinct 3/4-grams.
# The size is printed after each step.
facts_pos_pattern_3_4 = get34grams(facts_posPattern)
print(len(facts_pos_pattern_3_4))

facts_pos_pattern_3_4_list = list(chain.from_iterable(facts_pos_pattern_3_4))
print(len(facts_pos_pattern_3_4_list))

facts_pos_pattern_3_4_list_set = list(set(facts_pos_pattern_3_4_list))
print(len(facts_pos_pattern_3_4_list_set),'\n')

# ----------------------------------------------------
# Non-facts: same three steps.

nonfacts_pos_pattern_3_4 = get34grams(nonfacts_posPattern)
print(len(nonfacts_pos_pattern_3_4))

nonfacts_pos_pattern_3_4_list = list(chain.from_iterable(nonfacts_pos_pattern_3_4))
print(len(nonfacts_pos_pattern_3_4_list))

nonfacts_pos_pattern_3_4_list_set = list(set(nonfacts_pos_pattern_3_4_list))
print(len(nonfacts_pos_pattern_3_4_list_set))

In [None]:
# Names of the four POS-pattern dimensions that are later appended to the feature vector.
pos_toVector = [
    'facts_posPattern_3gram',
    'facts_posPattern_4gram',
    'nonfacts_posPattern_3gram',
    'nonfacts_posPattern_4gram',
]

### Sentiment Extractor

In [None]:
# Single dimension name for the sentiment feature; appended to the feature vector later.
sentiment_scores=['_Sentiment_Feature_']


### TPattern

In [None]:
# Dimension names '_case_1_' ... '_case_5_' for the five TPattern cases.
Tcases = ['_case_'+str(number)+'_' for number in range(1,6)]
print(Tcases)

### ASPattern

### Adding into 'Vector'

In [None]:
# Collect the feature-dimension names group by group. After each group the
# running total is printed.
vector=[]

# Ngrams features
vector.extend(grams_toVector)
print('grams_toVector:',len(vector))

# ----------------------------------------------------

# pos Tagging features
vector.extend(pos_tag_list_set)
print('pos_tag_list_set:',len(vector))

# ----------------------------------------------------

# Named Entities features
vector.extend(named_entity_list_set)
print('named_entity_list_set:',len(vector))

# ----------------------------------------------------

# posPattern features
vector.extend(pos_toVector)
print('pos_toVector:',len(vector))

# ----------------------------------------------------

# Tpattern cases features
vector.extend(Tcases)
print('Tpattern:',len(vector))

# ----------------------------------------------------

# Sentiment Scores features
vector.extend(sentiment_scores)
print('sentiment_scores:',len(vector))

In [None]:
# Show every dimension name collected in the feature vector.
print(vector)

### Saving (dumping) the results with joblib

In [None]:
# Every output file is written under 'sav/'. Create the directory first so
# the dumps do not fail on a fresh checkout; exist_ok makes re-runs harmless.
os.makedirs('sav', exist_ok=True)

dataset_fact_file='sav/dataset_fact_reduced.sav'
dataset_nonfact_file='sav/dataset_nonfact_reduced.sav'

X_file='sav/X_reduced.sav'#only sentences->not labels
X_facts_file='sav/X_facts_reduced.sav'
X_nonfacts_file='sav/X_nonfacts_reduced.sav'


# Full [sentence, label] rows of the facts / non-facts
joblib.dump(dataset_fact,dataset_fact_file)
joblib.dump(dataset_nonfact,dataset_nonfact_file)
# Sentence-only arrays
joblib.dump(X,X_file)
joblib.dump(X_facts,X_facts_file)
joblib.dump(X_nonfacts,X_nonfacts_file)


In [None]:
# Destination paths for the concatenated corpus strings
sent_as_string_file='sav/sent_as_string_reduced.sav'
facts_as_string_file='sav/facts_as_string_reduced.sav'
nonfacts_as_string_file='sav/nonfacts_as_string_reduced.sav'

# Write the whole-dataset, facts and non-facts strings to their files
joblib.dump(sent_as_string,sent_as_string_file)
joblib.dump(facts_as_string,facts_as_string_file)
joblib.dump(nonfacts_as_string,nonfacts_as_string_file)

In [None]:
# De-duplicated uni-/bi-/trigram lists of the fact sentences
facts_unigram_file='sav/facts_unigram.sav'
facts_bigram_file='sav/facts_bigram.sav'
facts_trigram_file='sav/facts_trigram.sav'
joblib.dump(facts_unigram,facts_unigram_file)
joblib.dump(facts_bigram,facts_bigram_file)
joblib.dump(facts_trigram,facts_trigram_file)

# De-duplicated uni-/bi-/trigram lists of the non-fact sentences
nonfacts_unigram_file='sav/nonfacts_unigram.sav'
nonfacts_bigram_file='sav/nonfacts_bigram.sav'
nonfacts_trigram_file='sav/nonfacts_trigram.sav'
joblib.dump(nonfacts_unigram,nonfacts_unigram_file)
joblib.dump(nonfacts_bigram,nonfacts_bigram_file)
joblib.dump(nonfacts_trigram,nonfacts_trigram_file)

# De-duplicated lists that combine all three n-gram levels
facts_wholeGrams_list_set_file='sav/facts_wholeGrams_list_set_reduced.sav'
nonfacts_wholeGrams_list_set_file='sav/nonfacts_wholeGrams_list_set_reduced.sav'
joblib.dump(facts_wholeGrams_list_set,facts_wholeGrams_list_set_file)
joblib.dump(nonfacts_wholeGrams_list_set,nonfacts_wholeGrams_list_set_file)

In [None]:
# Distinct POS tags of the whole corpus
pos_tag_list_set_file='sav/pos_tag_list_set_reduced.sav'
joblib.dump(pos_tag_list_set,pos_tag_list_set_file)

In [None]:
# Distinct named-entity labels of the whole corpus
named_entity_list_set_file='sav/named_entity_list_set_reduced.sav'
joblib.dump(named_entity_list_set,named_entity_list_set_file)

In [None]:
# Distinct POS-pattern 3/4-grams of the fact and non-fact sentences
facts_pos_pattern_3_4_list_set_file='sav/facts_pos_pattern_3_4_list_set_reduced.sav'
nonfacts_pos_pattern_3_4_list_set_file='sav/nonfacts_pos_pattern_3_4_list_set_reduced.sav'

joblib.dump(facts_pos_pattern_3_4_list_set,facts_pos_pattern_3_4_list_set_file)
joblib.dump(nonfacts_pos_pattern_3_4_list_set,nonfacts_pos_pattern_3_4_list_set_file)

In [None]:
# List of feature-dimension names
vector_file='sav/vector_dimensions_reduced.sav'
joblib.dump(vector,vector_file)

In [None]:
# Read the vector back from the path it was dumped to. The file is written
# under 'sav/', so loading it without that prefix looks in the wrong place.
vector_load=joblib.load('sav/vector_dimensions_reduced.sav')

In [None]:
# Number of feature dimensions in the reloaded vector
len(vector_load)