In [1]:
import spacy
import corenlp
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize,wordpunct_tokenize,word_tokenize,RegexpTokenizer,TweetTokenizer
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os,math,glob,re

from collections import Counter
import re as regex
import contractions
import copy

from itertools import chain

nlp = spacy.load('en_core_web_sm')
nlp.max_length=9999999



### Pre-Processing

In [2]:
def contract(text):
    return contractions.fix(text)

def regTokenize(text):
    tok=RegexpTokenizer('[A-Za-z0-9]*[.]?\w+')
    return tok.tokenize(text) 

def lowercase(text):
    return text.lower()

def lemma(words):
    for i in range(0,len(words)):
        words[i]=WordNetLemmatizer().lemmatize(words[i])
    return words

def stemming(words):
    porter_stemmer=PorterStemmer()
    for i in range(0,len(words)):
        words[i]=porter_stemmer.stem(words[i])
    return words

def tweet(words):
    tok=TweetTokenizer()
    return tok.tokenize(words)

def comma(text):
    text = "".join(c for c in text if c not in ('!','.',':',',','"','?','(',')'))
    return text

def getBasicNorm(text):
    for i in range(0,len(text)):
        text[i]=contract(text[i])
        text[i]=lowercase(text[i])
    return text

### Feature Extraction functions

In [3]:
def getWholeString(data):
    string=''
    for i in data:
        string+=i[0]+' '
    return string

def getNGrams(corpusSentence):
    
    corpusSentence=contract(corpusSentence)
    corpusSentence=lowercase(corpusSentence)
    tokenized=regTokenize(corpusSentence)
    
    uni=list(ngrams(tokenized,1))
    bigram=list(ngrams(tokenized,2))
    trigram=list(ngrams(tokenized,3))

    print('uni:',len(uni),' bi:',len(bigram),'tri:',len(trigram))
    grams=[uni,bigram,trigram]
    
    return grams

def postagging(corpusSentence):
    
    corpusSentence=contract(corpusSentence)
    corpusSentence=lowercase(corpusSentence)
#     tokenized=regTokenize(corpusSentence)
    
    doc = nlp(corpusSentence)
#     for i in range(0,5):
#         print(doc[i].pos_,doc[i].lemma_)
    
    return doc

def posPattern(corpusSentence):
    doc = nlp(corpusSentence)
    return doc


# def namedEntities(corpusSentence):
#     corpusSentence=contract(corpusSentence)
#     corpusSentence=lowercase(corpusSentence)
# #     tokenized=regTokenize(corpusSentence)
    
#     doc = nlp(corpusSentence)
# #     for i in range(0,5):
# #         print(doc[i].pos_,doc[i].lemma_)
#     return doc


# -----------------------------------------POS Pattern-----------------------------------------
def getPosPattern(data_sentences):
    sentences_pos=[]
    for i in data_sentences:
        sent=[]
        s=postagging(i)
        for j in s:
            sent.append(j.pos_)
        sentences_pos.append(sent)
    return sentences_pos

def posGrams(sentence):
    
    trigram=list(ngrams(sentence,3))
    fourgram=list(ngrams(sentence,4))
    
#     print('uni:',len(uni),' bi:',len(bigram),'tri:',len(trigram))
    grams=[trigram,fourgram]
    
    return grams
   
def get34grams(sentences_pos):
    
    three_four_grams=[]
    
    for sent_pos in sentences_pos:
        temp=posGrams(sent_pos)
        temp = list(chain(*temp))
#         print(temp)
        three_four_grams.append(temp)
    
    return three_four_grams

### Dataset Loading

In [4]:
dataset=joblib.load('dataset_with_labels.sav')

print(len(dataset),' sentences')

# print(np.array(dataset)[:,0])
X=np.array(dataset)[:,0] # sentences
Y=np.array(dataset)[:,1] # labels
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

print(len(X_train),' ',len(X_test),' ',len(y_train),' ',len(y_test))

11112  sentences
8889   2223   8889   2223


### getting wholeSentences as one sentence

In [5]:
sent_as_string=getWholeString(dataset)

len(sent_as_string) # sentence length

1635553

### getting 'nGrams' and converting to 'set'

In [6]:
n_grams=getNGrams(sent_as_string)
print(len(n_grams))

wholeGrams = list(chain(*n_grams))
print(len(wholeGrams))

wholeGrams_list_set=list(set(wholeGrams))
print(len(wholeGrams_list_set))

uni: 267545  bi: 267544 tri: 267543
3
802632
363985


### getting pos tags and convert into 'set'

In [7]:
pos=postagging(sent_as_string)

print(len(pos))
pos_tag=[]

for i in pos:
    pos_tag.append(i.pos_)
pos_tag_list_set=list(set(pos_tag))
print(len(pos_tag_list_set))

309474
16


### getting 'named entities' and convert into 'set'

In [20]:
# for i in pos.ents:
#     print(i.text, i.start_char, i.end_char, i.label_)

named_entity=[]
for i in pos.ents:
    named_entity.append(i.label_)

named_entity_list_set=list(set(named_entity))
print(len(named_entity),'-->', len(named_entity_list_set))

15024 --> 17


### getting 'POS Pattern' and getting '3,4-grams' for each sentence and convert into 'set'

In [21]:
wholeX=copy.deepcopy(X)
print(len(wholeX))

wholeX=getBasicNorm(wholeX)

# for i in range(0,10):
#     print(wholeX[i])

11112


In [22]:
posPattern=getPosPattern(wholeX)

In [23]:
for i in posPattern:
    print(i)
    break
print(len(posPattern))

['ADP', 'NUM', 'CCONJ', 'NUM', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'VERB', 'VERB', 'ADP', 'ADJ', 'NOUN', 'ADV', 'PUNCT', 'ADP', 'ADV', 'NUM', 'PART', 'NUM', 'NOUN', 'ADP', 'NOUN', 'VERB', 'PUNCT', 'VERB', 'ADP', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN', 'PUNCT']
11112


In [24]:
pos_pattern_3_4=get34grams(posPattern)
print(len(pos_pattern_3_4))

# for i in range(0,2):
#     print(pos_pattern_3_4[i])
#     print('\n\n')

pos_pattern_3_4_list=list(chain(*pos_pattern_3_4))
print(len(pos_pattern_3_4_list))

pos_pattern_3_4_list_set=list(set(pos_pattern_3_4_list))
print(len(pos_pattern_3_4_list_set))

11112
563466
13514


### Sentiment Extractor

In [25]:
sentiment_scores=['very negative','negative','neutral','positive','very positive']

# scores from [0 to 4]
# 'very negative'=0,'negative'=1,'neutral'=2,'positive'=3,'very positive'=4


### TPattern

In [26]:
# case1= ()
# case2= ()
# case3= ()
# case4= ()
# case5= ()

### ASPattern

### Adding into 'Vector'

In [27]:
vector=[]

# Ngrams features
for i in wholeGrams_list_set:
    vector.append(i)
print('wholeGrams_list_set:',len(vector))

# pos Tagging features
for i in pos_tag_list_set:
    vector.append(i)
print('pos_tag_list_set:',len(vector))

# Named Entities features
for i in named_entity_list_set:
    vector.append(i)
print('named_entity_list_set:',len(vector))

# posPattern features
for i in pos_pattern_3_4_list_set:
    vector.append(i)
print('pos_pattern_3_4_list_set:',len(vector))

# Tpattern cases features
for i in range(1,6):
    cases='case'+str(i)
    vector.append(cases)
print('Tpattern:',len(vector))

# Sentiment Scores features
for i in sentiment_scores:
    vector.append(i)
print('sentiment_scores:',len(vector))

wholeGrams_list_set: 363985
pos_tag_list_set: 364001
named_entity_list_set: 364018
pos_pattern_3_4_list_set: 377532
Tpattern: 377537
sentiment_scores: 377542


In [28]:
vector_file='vector_dimensions.sav'
joblib.dump(vector,vector_file)

['vector_dimensions.sav']

In [29]:
vector_load=joblib.load('vector_dimensions.sav')

In [30]:
len(vector_load)

377542

In [31]:
for i in range(0,10):
    print(vector_load[i])

('be', 'held', 'within')
('a', 'praetorian')
('considered', 'prisoners')
('the', 'existing', 'basis')
('cell', 'vehicle', 'the')
('received', 'a', 'lot')
('and', 'olusegun', 'obasanjo')
('speed', 'why', 'do')
('holds', 'true', 'for')
('to', 'fulfill', 'these')


## Feature Extraction

### Ngrams

### pos Tagging

### Named Entities

### pos Pattern

### TPattern

### Sentiment Scores

### ASpattern