In [1]:
import os
from sklearn import preprocessing
import sys
import numpy as np
import pandas as pd
import pickle
import re
#from azureml.dataprep import package


sys.path.append(".")
sys.path.append("..")



from nltk import wordpunct_tokenize
from nltk.corpus import stopwords



def removeString(data, regex):
    """Lower-case a string Series and blank out every match of ``regex``.

    Parameters
    ----------
    data : pandas.Series
        Series of strings; it is accessed through the ``.str`` accessor.
    regex : str
        Regular expression to remove. Matching is case-insensitive.

    Returns
    -------
    pandas.Series
        Lower-cased copy of ``data`` in which each match is replaced by a
        single space.
    """
    # The pattern is matched with IGNORECASE instead of being lower-cased:
    # lower-casing the pattern text turns classes such as \W, \S or \D into
    # their opposites. ``regex=True`` is spelled out because pandas >= 2.0
    # treats the pattern as a literal string by default.
    return data.str.lower().str.replace(regex, ' ', flags=re.IGNORECASE, regex=True)



def cleanDataset(dataset, columnsToClean, regexList):
    """Apply ``removeString`` to the given columns for every pattern.

    The frame is modified in place (each listed column is reassigned) and the
    same object is returned. Patterns are applied in list order, column by
    column.
    """
    column_pattern_pairs = (
        (name, pattern) for name in columnsToClean for pattern in regexList
    )
    for name, pattern in column_pattern_pairs:
        dataset[name] = removeString(dataset[name], pattern)
    return dataset

# Languages whose stopword lists are compared in _calc_ratios
# (only these two, not every entry of stopwords.fileids()).
stopwords_fileids_custom = ['english', 'french']


def _calc_ratios(text):
    """Tell whether ``text`` looks more English than French.

    The text is tokenised with ``wordpunct_tokenize`` and lower-cased; for
    each language in ``stopwords_fileids_custom`` the number of distinct
    tokens that are stopwords of that language is counted.

    Returns True only when the English count is strictly greater than the
    French count (a tie yields False).
    """
    distinct_words = {token.lower() for token in wordpunct_tokenize(text)}
    overlap_by_lang = {
        lang: len(distinct_words.intersection(set(stopwords.words(lang))))
        for lang in stopwords_fileids_custom
    }
    return overlap_by_lang['english'] > overlap_by_lang['french']





In [2]:
def get_Init_RegexList():
    """Return the regex patterns used by the first cleaning pass.

    The list order is the order in which callers apply the patterns:
    links, then e-mail addresses, then phone-like digit runs, then any
    remaining single digit.

    Returns
    -------
    list of str
        Four regular-expression strings.
    """
    # Raw strings hand the backslashes to the regex engine untouched; in
    # ordinary string literals sequences such as "\w" or "\]" are invalid
    # escapes and trigger a warning on recent Python versions.
    # Broader candidates (non-letters, line breaks, one/two-letter words,
    # double spaces) are currently disabled for this pass.
    return [
        r'https?:[^\]\n\r]+',            # http / https links
        r'[\w\d\-\_\.]+@[\w\d\-\_\.]+',  # e-mail addresses
        r'[0-9][\-0–90-9 ]+',            # phone-like runs of digits, dashes and spaces
        r'[0-9]',                        # any remaining digit
    ]

In [3]:
def getRegexList():
    """Return the patterns for e-mail headers, links and boilerplate phrases.

    The result is the generic patterns followed by the project-specific
    phrases ("custom stopwords"); callers apply them in list order, so the
    long "Note: Please check ..." sentence is listed before the bare "Note".

    Returns
    -------
    list of str
        Regular-expression strings (the plain phrases contain no regex
        metacharacters and therefore match literally).
    """
    # Raw strings are used wherever a pattern contains backslashes, so that
    # Python does not see invalid escape sequences such as "\[" or "\w".
    regexList = [
        'From:',                           # from line
        'Sent:',                           # sent line
        'Received:',                       # received line
        'To:', 'À:',                       # to line (English / French)
        'CC:', 'Cc:',                      # cc line
        # Inline image reference. The match stops at the first ']' so that
        # text between it and a later ']' on the same line is kept.
        r'\[cid:([^\]]*)\]',
        r'https?:[^\]\n\r]+',              # http / https links
        'Subject:', 'Objet:', 'Object:',   # subject line
        ' [a-zA-Z] ',                      # isolated single letters
        "  ",                              # double spaces
        # A text that consists of nothing but one e-mail address
        r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$',
        # E-mail address written with spaces around the '@'
        r'[\w\d\-\_\.]+ @ [\w\d\-\_\.]+',
    ]

    # Customized part: boilerplate phrases of the ticketing tool
    cust_stopwords = [
        'Note: Please check the attachments tab for complete email including print screens or attachments',
        'Importance', 'urgent', 'Tel', 'Fax',
        'Descriptif de', 'incident :', 'Depuis quand :', 'Information requises :',
        'Localisation :', 'Actions réalisées:',
        'imPulse', 'Ticket', 'Description', 'Note',
    ]

    return regexList + cust_stopwords

In [4]:
##### function-like substitute for Type Alias Python Problem!!!
def cleanText(text, regexList):
    """Lower-case ``text`` and delete every match of each pattern in turn.

    Parameters
    ----------
    text : str or missing value
        Text to clean. A missing value (None, NaN, pandas NA) yields an
        empty string; any other non-string value is converted with ``str``.
    regexList : iterable of str
        Regular expressions, applied in order. Matching is case-insensitive.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    if not isinstance(text, str):
        # Empty CSV fields arrive as NaN, which has no ``.lower()``.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    for regex in regexList:
        # The text is already lower-case, so a pattern written with capitals
        # (e.g. 'From:') could never match without IGNORECASE.
        text = re.sub(regex, '', text, flags=re.IGNORECASE)
    return text

def cleanDataset2(dataset, columnsToClean, regexList):
    """Run ``cleanText`` over every cell of the listed columns.

    The frame is modified in place (each listed column is reassigned) and the
    same object is returned.
    """
    def _scrub(cell):
        # Element-wise cleaner bound to the pattern list of this call
        return cleanText(cell, regexList)

    for name in columnsToClean:
        dataset[name] = dataset[name].apply(_scrub)
    return dataset

In [5]:
# ---- Dictionary of text formats ----
# Every pattern is a header keyword, then the text up to the first ':',
# then the text up to the next ':' (or '$').

# Email-like format
regex_Email_like = r"(su|o)?b?jec?t(.*?):(.*?(:|$))"
# Impulse-like format
regex_Impulse_like = r"description =========================================(.*?):(.*?(:|$))"
# Incident-like format
regex_Incident_like = r"descriptif de l'incident(.*?):(.*?(:|$))"
# Description-like format
regex_Description_like = r"descript(.*?):(.*?(:|$))"

# Format name -> regex. The insertion order is the order in which the
# formats are tried, so the generic 'Description_like' entry stays last.
Dict_Format = {
    'Email_like': regex_Email_like,
    'Impulse_like': regex_Impulse_like,
    'Incident_like': regex_Incident_like,
    'Description_like': regex_Description_like,
}

# The same regexes as a plain list, in dictionary order
Array_Format = [Dict_Format[name] for name in Dict_Format]


In [6]:
# calculate format for each text field
def format_type(text, formats=None):
    """Return the name of the first text format whose regex matches ``text``.

    Parameters
    ----------
    text : str
        Text to classify. Anything that is not a string (e.g. NaN from an
        empty CSV field) is reported as ``'free'``.
    formats : dict, optional
        Mapping of format name to regex, tried in insertion order. Defaults
        to the module-level ``Dict_Format``.

    Returns
    -------
    str
        The matching format name, or ``'free'`` when nothing matches.
    """
    if not isinstance(text, str):
        # re.search would raise TypeError on non-string input
        return 'free'
    if formats is None:
        formats = Dict_Format

    for name, pattern in formats.items():
        if re.search(pattern, text):
            return name
    return 'free'  # free format by default

In [7]:
# Load the de-duplicated ticket export (semicolon-separated, ISO-8859-1 encoded)
filepath = r"PGS-IN_non_duplicated.csv"
dfTickets = pd.read_csv(
    filepath,
    sep=";",
    encoding="iso-8859-1",
)

In [8]:
# Columns previewed in this notebook: the incident identifier and the
# incident description (whose header starts with a space).
columnsFilter = [
    'ID Incident',
    ' Incident Description',
]



In [45]:
# Print the size of the loaded ticket table (the label says "dfIncidents", the frame is dfTickets)
print('dfIncidents shape:',dfTickets.shape)
# First five rows of the filtered columns; not rendered, because this is not the cell's last statement
dfTickets[columnsFilter].head(n=5)
# Dummy assignment as the final statement, so the cell shows no output value
s=1

dfIncidents shape: (20663, 22)


## Remove text with regex

In [44]:
# ---- Cleaning, step 1: apply the initial pattern list to the description column ----
columnsToClean = [' Incident Description']
# cleanDataset2 edits the frame it receives and returns it, so the copy made
# here is the object that ends up in dfTickets_clean_1; dfTickets is untouched.
dfTickets_clean_1 = cleanDataset2(dfTickets.copy(), columnsToClean, get_Init_RegexList())
s = 1

In [42]:
# Starting position of the row cursor used by the inspection cells below
i=50


In [43]:
# Move the cursor to the next row; every re-run of this cell advances it again
i=i+1
# Original (uncleaned) description stored under key i, kept for comparison with the cleaned versions
text=dfTickets[' Incident Description'][i]
# Dummy assignment as the final statement, so the cell shows no output value
s=1


In [None]:
# Description under key i after the first cleaning step
dfTickets_clean_1[' Incident Description'][i]

In [None]:
# Description under key i after the second cleaning step.
# NOTE(review): dfTickets_clean_2 is only assigned in a later cell of this notebook, so on a fresh kernel this cell fails unless that cell has run first.
dfTickets_clean_2[' Incident Description'][i]

In [None]:
# Description under key i after the third cleaning step.
# NOTE(review): dfTickets_clean_3 is only assigned in a later cell of this notebook, so on a fresh kernel this cell fails unless that cell has run first.
dfTickets_clean_3[' Incident Description'][i]

In [None]:
def get_Custom_RegexList():
    """Return the lower-case patterns that strip ticket headers and boilerplate.

    Callers apply the patterns in list order. The order matters: the header
    patterns that contain '=' or ':' come before the single-character
    patterns that delete those characters, and '-' is deleted before the
    'pouvez *vous voir' pattern is tried.

    Returns
    -------
    list of str
        Fifteen regular-expression strings.
    """
    return [
        '(su|o)?b?jec?t(.*?):',                                        # subject / objet header
        'description =========================================(.*?)',  # impulse ticket description banner
        'impulse', 'ticket',                                           # tool keywords
        "descriptif de l'incident(.*?):",                              # "descriptif de l'incident" header
        'depuis quand ',                                               # "depuis quand" label
        # "further information from ====" banner, then leftover separators.
        # The star is a raw string: "\*" is an invalid escape in a normal literal.
        'further *information *from', '=', ':', r'\*', '-',
        'end *user.*',                                                 # "end user" and the rest of the line
        'information *requises.*',                                     # "information requises" and the rest of the line
        'pouvez *vous voir', 'il vous plait',                          # polite request phrases
    ]

In [None]:
# Format name detected for the sample description held in `text`
format_type(text)

In [32]:
# Extract a specific text portion according to regex pattern
def extract_regex(text,Dict):
    """Return the part of ``text`` matched by the first matching format regex.

    Parameters
    ----------
    text : str
        Text to search. A non-string value (e.g. NaN) is returned unchanged.
    Dict : dict
        Mapping of format name to regex, tried in insertion order.

    Returns
    -------
    The whole match (``group(0)``) of the first regex in ``Dict`` that
    matches, or ``text`` itself when none matches ("free" format).
    """
    if not isinstance(text, str):
        # re.search would raise TypeError on non-string input
        return text

    # Detection and extraction use the same mapping and a single search per
    # pattern, so the result is always consistent with the ``Dict`` argument.
    for pattern in Dict.values():
        match = re.search(pattern, text)
        if match:
            return match.group(0)
    return text


In [38]:
# Extract the format-specific portion of the sample text; not rendered, because this is not the cell's last statement
extract_regex(text,Dict_Format)
# Dummy assignment as the final statement, so the cell shows no output value
s=1

In [39]:
# ---- Cleaning, step 2: keep only the portion matched by the detected text format ----
dfTickets_clean_2 = dfTickets_clean_1.copy()
dfTickets_clean_2[' Incident Description'] = dfTickets_clean_2[' Incident Description'].apply(
    extract_regex, args=(Dict_Format,)
)
s = 1

In [40]:
# ---- Cleaning, step 3: apply the custom pattern list to the extracted text ----
# cleanDataset2 edits the frame it receives and returns it, so the copy made
# here is the object that ends up in dfTickets_clean_3; dfTickets_clean_2 is untouched.
dfTickets_clean_3 = cleanDataset2(dfTickets_clean_2.copy(), columnsToClean, get_Custom_RegexList())
s = 1