In [2]:
# !pip freeze > requirements.txt

In [1]:
from functions import *
import pandas as pd
import datetime
import json
import email
import os
import spacy
import re
import matplotlib.pyplot as plt
import unidecode

In [None]:
""" ============= IMPORTANT =================
    This cell should be run in Python 2.x because it uses
    the emaildata library, which raises errors on Python 3
    ==========================================
"""

# import email
# from emaildata.text import Text
# get_payload_func = lambda file_name: Text.text(email.message_from_file(open(file_name)))

# folder = './final_messages/'

# data = pd.DataFrame(columns = ['payload'])
# for f in os.listdir(folder):
#     file_path = folder+f
#     data.loc[f] = get_payload_func(file_path)



In [3]:
# Load the email payloads from a local pickle file. The commented-out line
# below is the call that writes `data` to that same path.
# SECURITY: unpickling can execute arbitrary code -- only load a trusted file.
# data.to_pickle('./payloads.pkl')
data = pd.read_pickle('./payloads.pkl')

In [9]:
def remove_headers(payload):
    """Strip quoted-mail header lines and blank lines from an email payload.

    The payload is split into lines on runs of CR/LF characters. Each line is
    stripped of surrounding whitespace; empty lines are dropped, and so is any
    line that contains one of the header markers (case-insensitive, matched
    anywhere in the line).

    Args:
        payload: The email body as a single string.

    Returns:
        The surviving lines joined with ' \\n ' (space, newline, space).
    """
    garbage_headers = ('sent:', 'to:', 'from:', 'cc:', '-original message-', 'subject:')

    cleaned_sents = []
    # Split on line breaks only. The previous pattern '[\\n+|\\r+]' was a
    # character class that also treated literal '+' and '|' as separators,
    # which cut ordinary lines such as "a+b" into pieces.
    for line in re.split(r'[\r\n]+', payload):
        line = line.strip()
        if not line:
            continue
        lowered = line.lower()
        if any(marker in lowered for marker in garbage_headers):
            continue
        cleaned_sents.append(line)
    return ' \n '.join(cleaned_sents)

In [11]:
def get_spacy_tokens(sent):
    """Tabulate the parsed tokens of a sentence as a pandas DataFrame.

    Each row describes one token yielded by ``nlp(sent)``: its text, tag,
    part of speech, dependency label, and the text and part of speech of its
    head token. The frame is indexed by token text. ``nlp`` is not defined in
    this cell -- presumably a spaCy pipeline provided elsewhere; verify.

    A blank (whitespace-only) sentence yields an empty frame with the same
    columns and is never passed to ``nlp``.
    """
    column_names = ['text', 'tag', 'pos', 'dep', 'parent', 'parent_pos']

    if not sent.strip():
        return pd.DataFrame(columns=column_names)

    rows = [
        [tok.text, tok.tag_, tok.pos_, tok.dep_, tok.head.text, tok.head.pos_]
        for tok in nlp(sent)
    ]
    token_table = pd.DataFrame(rows, columns=column_names)
    # Index rows by token text so they can be looked up by word.
    token_table.index = token_table.text.values
    return token_table

def is_sentence_meaningful(sentence):
    """Check whether a sentence is meaningful, judged by its noun-verb links.

    A sentence counts as meaningful when at least one token tagged NOUN has a
    head token tagged VERB in the table returned by ``get_spacy_tokens``.

    Returns:
        True for a meaningful sentence, False for a garbage sentence.
    """
    token_table = get_spacy_tokens(sentence)
    noun_rows = token_table[token_table.pos == 'NOUN']
    # Any noun whose parent is a verb is enough to accept the sentence.
    return bool((noun_rows['parent_pos'] == 'VERB').any())



In [13]:
def filter_invalid_sentences(all_sents):
    """Keep only the sentences that ``is_sentence_meaningful`` accepts."""
    return [s for s in all_sents if is_sentence_meaningful(s)]


def split_into_sentences(payload):
    """Tokenize a payload into sentences using ``nlp(payload).sents``.

    Within each sentence, runs of newlines or whitespace are collapsed to a
    single space and the result is stripped.
    """
    sentences = []
    for span in nlp(payload).sents:
        sentences.append(re.sub('\\n+|\\s+', ' ', span.text).strip())
    return sentences


In [14]:
# Clean each payload, split it into sentences, then keep only the sentences
# that pass the meaningfulness filter.
headerless_payloads = data.payload.map(remove_headers)
sentence_lists = headerless_payloads.map(split_into_sentences)
data.loc[:,'payload_sents'] = sentence_lists.map(filter_invalid_sentences)


In [15]:
# NOTE(review): this overwrites the same './payloads.pkl' that `data` was read
# from, now including the added 'payload_sents' column.
data.to_pickle('./payloads.pkl')

# Feature Building

In [16]:
import flair

# Pre-trained sentiment analyzer: load flair's 'en-sentiment' text classifier
# once at cell level so that every prediction call reuses the same model object.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

def get_sentence_sentiment(sent):
    """Run the sentiment classifier on one sentence and return its labels.

    The text is wrapped in a ``flair.data.Sentence``, passed to
    ``flair_sentiment.predict`` (which annotates the sentence in place), and
    the sentence's ``labels`` attribute is returned.
    """
    flair_sentence = flair.data.Sentence(sent)
    flair_sentiment.predict(flair_sentence)
    return flair_sentence.labels

2020-06-10 10:34:00,581 loading file /home/cloud_user/.flair/models/sentiment-en-mix-distillbert.pt


In [17]:
# Keywords that usually appear in compliance-related discussions.
# They are joined with '|' into a regex alternation and searched for inside
# each sentence, so they match as case-sensitive substrings (e.g. 'comply'
# also matches 'complying').
word_list = ['compliance','comply','accordance','breach']

In [18]:
# Compile the keyword alternation once instead of rebuilding it per sentence.
# NOTE(review): matching is case-sensitive, so e.g. a sentence starting with
# 'Compliance' is not selected -- confirm this is intended.
compliance_pattern = re.compile('|'.join(word_list))

# Sentences of each email that mention at least one compliance keyword.
data.loc[:,'compliance_sents'] = data.payload_sents.map(
    lambda sents: [s for s in sents if compliance_pattern.search(s)]
)

# One sentiment value per compliance sentence: the value of the first label
# predicted for THAT sentence. The previous code read x[0][0].value inside the
# comprehension, which copied the first sentence's label onto every sentence
# of the same email.
data.loc[:,'compliance_sentiments'] = data.compliance_sents.map(
    lambda sents: [get_sentence_sentiment(s)[0].value for s in sents]
)

In [19]:
# Partition the emails by whether any compliance sentence was found.
has_compliance_mention = data.compliance_sents.map(len) > 0
compliance_mention_dataset = data[has_compliance_mention]
compliance_not_mention_dataset = data[~has_compliance_mention]

In [20]:
# Persist both subsets (emails with and without compliance sentences) to
# local pickle files.
compliance_mention_dataset.to_pickle('./compliance_mention_dataset.pkl')
compliance_not_mention_dataset.to_pickle('./compliance_not_mention_dataset.pkl')

In [21]:
# Flatten the per-email lists into one list of (sentence, sentiment) pairs.
dataset = [
    pair
    for sents, sentiments in zip(
        compliance_mention_dataset['compliance_sents'],
        compliance_mention_dataset['compliance_sentiments'],
    )
    for pair in zip(sents, sentiments)
]

In [22]:
# Build the labeled frame straight from the list of (sentence, sentiment)
# tuples. This avoids the detour through np.asarray, which relied on a name
# (`np`) that this notebook never imports explicitly and which could not
# produce a two-column frame from an empty list.
dataset = pd.DataFrame(dataset, columns=['sentences','label'])

# Encode the sentiment as an integer: -1 when the label text contains
# 'NEGATIVE', +1 otherwise.
dataset.label = dataset.label.map(lambda x: -1 if 'NEGATIVE' in x else 1)

In [23]:
# Remove non-word characters from the dataset: keep only the \w+ runs of each
# sentence (letters, digits and underscores), joined by single spaces.
# The pattern is a raw string because '\w' in a normal string literal is an
# invalid escape sequence that newer Python versions warn about.
dataset.sentences = dataset.sentences.map(lambda x: ' '.join(re.findall(r'\w+', x)))

In [24]:
# Persist the labeled sentence dataset (columns 'sentences' and 'label') to a
# local pickle file.
dataset.to_pickle('./compliance_labeled_data.pkl')