In [4]:
import os
from sklearn import preprocessing
import sys
import numpy as np
import pandas as pd
import pickle
#from azureml.dataprep import package


sys.path.append(".")
sys.path.append("..")



from nltk import wordpunct_tokenize
from nltk.corpus import stopwords



def removeString(data, regex):
    return data.str.lower().str.replace(regex.lower(), ' ')


def cleanDataset(dataset, columnsToClean, regexList):
    for column in columnsToClean:
        for regex in regexList:
            dataset[column] = removeString(dataset[column], regex)
    return dataset

stopwords_fileids_custom=['english','french']

def _calc_ratios(text):
    ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
#    for lang in stopwords.fileids():
    for lang in stopwords_fileids_custom:
        stopwords_set = set(stopwords.words(lang))
        words_set = set(words)
        common_words = words_set.intersection(stopwords_set)
        ratios[lang] = len(common_words)
    
    is_english=ratios['english']>ratios['french']

    return is_english





In [5]:
def getRegexList():
    regexList = []
    regexList += ['From:']  # from line
    # regexList += ['RITM[0-9]*'] # request id
    # regexList += ['INC[0-9]*'] # incident id
    # regexList += ['TKT[0-9]*'] # ticket id
    regexList += ['Sent:']  # sent to line
    regexList += ['Received:']  # received data line
    regexList += ['To:','À:']  # to line
    regexList += ['CC:','Cc:']  # cc line
    #regexList += ['The information(.*)infection']  # footer
    #regexList += ['Endava Limited is a company(.*)or omissions']  # footer
    #regexList += ['The information in this email is confidential and may be legally(.*)interference if you are not the intended recipient']  # footer
    regexList += ['\[cid:(.*)]']  # images cid
    regexList += ['https?:[^\]\n\r]+']  # https & http
    regexList += ['Subject:','Objet:','Object:']
    # regexList += ['[\w\d\-\_\.]+@[\w\d\-\_\.]+']  # emails
    # regexList += ['[0-9][\-0–90-9 ]+']  # phones
    # regexList += ['[0-9]']  # numbers
    # regexList += ['[^a-zA-z 0-9]+']  # anything that is not a letter
    # regexList += ['[\r\n]']  # \r\n
    regexList += [' [a-zA-Z] ']  # single letters
    # regexList += [' [a-zA-Z][a-zA-Z] ']  # two-letter words
    regexList += ["  "]  # double spaces

    regexList += ['^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$']
    regexList += ['[\w\d\-\_\.]+ @ [\w\d\-\_\.]+']
    #regexList += ['[^a-zA-Z]']
    # Customized Part
    cust_stopwords=[]
    cust_stopwords+=['Note: Please check the attachments tab for complete email including print screens or attachments']
    cust_stopwords+=['Importance','urgent','Tel','Fax']
    cust_stopwords+=['Descriptif de','incident :','Depuis quand :','Information requises :']
    cust_stopwords+=['Localisation :','Actions réalisées:']
    cust_stopwords+=['imPulse','Ticket','Description','Note']
    
    regexList=regexList+cust_stopwords
    return regexList

In [6]:
####################
# Load dataset from file
filepath=r"C:\Mission_SG\Fork\Text-Classification-Project-master\PGS-IN_test.csv"
dfIncidents=pd.read_csv(filepath,sep=";",encoding="iso-8859-1")
#####################


In [7]:
# Keep only meaningful fields: Title, Incident Description, Solution Journal Events and Id_Incident(Not meaningful)
# Reorder columns
columnsFilter = ['ID','Title','SUMMARY','Solution']

dfIncidents = dfIncidents[columnsFilter]

In [8]:
print('dfIncidents shape:',dfIncidents.shape)
dfIncidents.head(n=5)

dfIncidents shape: (20009, 4)


Unnamed: 0,ID,Title,SUMMARY,Solution
0,P7IN-0600504,RE: urgent P4 escalated - acces application co...,*********** *** Note: Please check the attachm...,Out of scope
1,P7IN-0875012,{ A22/345 / Reparamétrage / Laptop / 1 / } / ...,----------------------------------------------...,{ A22/345 / Reparamétrage / Laptop / 1 / }\tR...
2,P7IN-1005399,[SUMO] Add PolicyHub intranet in all @Access S...,***********imPulse Ticket Description ========...,
3,P7IN-1005399,[SUMO] Add PolicyHub intranet in all @Access S...,***********imPulse Ticket Description ========...,
4,P7IN-1122690,"Malgré changement de poste, GAIA nok\\ { A16FL...",*** Note: Please check the attachments tab fo...,Création d'un nouveau profil sur la machine et...


In [20]:
dfTickets=dfIncidents

In [31]:
# Remove duplicates
columnsToDropDuplicates = ['SUMMARY']
dfTickets = dfTickets.drop_duplicates(columnsToDropDuplicates)
print(dfTickets.shape)

(15844, 4)


# Saving Simplified csv

In [32]:
dfTickets.to_csv("PGS-IN_simplified.csv",sep=';',index=False)

## Remove text with regex

In [21]:
# Select columns for cleaning
columnsToClean = ['SUMMARY', 'Title']



# Create list of regex to remove sensitive data
# Clean dataset and remove sensitive data
cleanDataset(dfTickets, columnsToClean, getRegexList())


Unnamed: 0,ID,IN - Incident Type,IN - Critical User,IN - 001 - Contact Entity,IN - Status,IN - Service,IN - Category,IN - Sub-Category,IN - Characterisation,Title,...,IN - Open Time,IN - Close Date,IN - Assignment Group,IN - EUS - Follow Group,IN - Open By Group,IN - Restored By Group,IN - Previous Assignment Group,IN - Activity Description,IN - Activity Date,IN - Clock Group
0,P7IN-0600504,incident,Silver,CORI/COV/FRA/COS,Closed,EUS_APPLICATIONS,eus_applications,business_applications,create,re: p4 escalated - acces application contact,...,16/05/2017 13:55:20,25/01/2018 15:06:58,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000.REQUEST_L1,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,24/01/2018 16:42:27,GTS_PAR_EUS_35000.SILVER_L2
1,P7IN-0875012,incident,Silver,DFIN/DTO/TRF,Closed,EUS_COMPUTERS,eus_computers,peripheral_devices,fix,{ a22/345 / reparamétrage / laptop / 1 / } / [...,...,19/07/2017 14:40:53,29/03/2018 19:41:16,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_OSS.DEF,GTS_PAR_EUS_35000.SILVER_L2,29/03/2018 19:40:53,GTS_PAR_EUS_35000.SILVER_L2
2,P7IN-1005399,request,Silver,RESG/GTS/DIR/EXP/USA,Assigned,EUS_COMPUTERS,eus_computers,virtual_desktops,,[sumo] add policyhub intranet in all @access s...,...,24/08/2017 01:33:53,01/01/1900 00:00:00,GTS_PAR_EUS_IPM.ISM.OPE,,GTS_PAR_EUS_35000_L1,,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,23/03/2018 09:23:23,GTS_PAR_EUS_35000.SILVER_L2
3,P7IN-1005399,request,Silver,RESG/GTS/DIR/EXP/USA,Assigned,EUS_COMPUTERS,eus_computers,virtual_desktops,,[sumo] add policyhub intranet in all @access s...,...,24/08/2017 01:33:53,01/01/1900 00:00:00,GTS_PAR_EUS_IPM.ISM.OPE,,GTS_PAR_EUS_35000_L1,,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,23/03/2018 16:32:01,GTS_PAR_EUS_35000.SILVER_L2
4,P7IN-1122690,incident,Silver,OPER/CMA/RVS,Closed,EUS_APPLICATIONS,eus_applications,business_applications,fix,"malgré changement de poste, gaia nok\\ { a16fl...",...,22/09/2017 16:16:21,09/05/2018 17:20:05,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,22/01/2018 17:58:20,GTS_PAR_EUS_35000.SILVER_L2
5,P7IN-1130192,incident,Silver,ASSU/SGK,Closed,IT_TELEPHONY,it_telephony,ip_telephony,inc-minor,[sogecap]-incident de fonctionnement click dial,...,25/09/2017 11:49:09,20/04/2018 16:02:39,GTS_FR_EUS_PRX.OSS.ORLEAN_L3,,GTS_FR_EUS_PRX.SVP_L1,GTS_FR_EUS_PRX.OSS.ORLEAN_L3,GTS_PAR_EUS_35000.TOIP_L1,GTS_PAR_EUS_35000.SILVER_L2,17/01/2018 17:52:31,GTS_PAR_EUS_35000.SILVER_L2
6,P7IN-1139110,request,Gold,GLFI/COO/ISP,Closed,EUS_APPLICATIONS,eus_applications,business_applications,,[mkdadmin] deletion licence of capital in ligence,...,26/09/2017 17:56:38,18/01/2018 08:35:47,GTS_PAR_EUS_35000.GOLD_L2,GTS_PAR_EUS_35000.GOLD_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.GOLD_L2,gts_par_mkt_mkd_mda,GTS_PAR_EUS_35000.GOLD_L2,18/01/2018 08:34:57,GTS_PAR_EUS_35000.SILVER_L2
7,P7IN-1152638,incident,Silver,MCIB/MNA/STG,Closed,EUS_APPLICATIONS,itsm_interface,itsm_impulse,,[#][oscar] demande d'accès,...,29/09/2017 14:50:46,22/12/2017 13:57:46,GTS_PAR_EUS_35000.GOLD_L2,GTS_PAR_EUS_35000.GOLD_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.GOLD_L2,ITEC_SGS_SBO.TIPS_SUPPORT_L1,GTS_PAR_EUS_35000.GOLD_L2,22/12/2017 13:57:20,GTS_PAR_EUS_35000.SILVER_L2
8,P7IN-1153634,request,Silver,CORI/CCG/CRE/FIG,Closed,EUS_APPLICATIONS,eus_applications,business_applications,fix,[market data] guibert,...,29/09/2017 17:54:04,12/12/2017 08:33:13,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,gts_par_mkt_mkd_mda,GTS_PAR_EUS_35000.SILVER_L2,12/12/2017 08:30:47,GTS_PAR_EUS_35000.SILVER_L2
9,P7IN-1163776,incident,Silver,OPER/QTY,Closed,EUS_COMPUTERS,eus_computers,desktops,fix,[freeze clavier,...,03/10/2017 10:15:17,05/12/2017 09:58:30,GTS_PAR_EUS_35000.SILVER_L2,,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,GTS_PAR_EUS_35000_L1,GTS_PAR_EUS_35000.SILVER_L2,04/12/2017 16:07:22,GTS_PAR_EUS_35000.SILVER_L2


## Save dataset and removed words to csv

In [12]:
dfTickets_clean=dfTickets

In [13]:
# Save cleaned dataset back to csv without indexes
dfTickets.to_csv('tickets_preprocessed.csv', index=False, index_label=False, sep=";",encoding="iso-8859-1")

PermissionError: [Errno 13] Permission denied: 'tickets_preprocessed.csv'

In [34]:
import re

In [9]:
i=55

In [63]:
i=i+1
text=dfIncidents['SUMMARY'][i]
text


'*********** *** Note: Please check the attachments tab for complete email including print screens or attachments ***     From: JABIN Germain ItecDccCsc  Sent: Thursday, November 09, 2017 3:17 PM To: Call-Center 35000 Subject: Installation Visual Studio       Bonjour,     Pourriez-vous installer Microsoft Visual Studio 2013 sur la VDI « FR09544735W» ?  ·         CA = 29589  ·         Code TBS = S39636 Cdt     **********************'

In [68]:
regex = r"Subject?:.*(:?\w)"
matches = re.finditer(regex, text, re.MULTILINE)
text_parse=re.findall(regex, text, re.MULTILINE)[0][0]
print(text_parse)
regex_stop=r'Tel.?:'
re.sub(regex_stop,'',text_parse)

t


't'

In [100]:
regex = r"b?jec?t?:.*(:?\w)"
m=re.search(regex, text)
text_parse=m.group(0)
print(text_parse)
regex_stop=r'Tel.?:'
re.sub(regex_stop,'',text_parse)

bject: Installation Visual Studio       Bonjour,     Pourriez-vous installer Microsoft Visual Studio 2013 sur la VDI « FR09544735W» ?  ·         CA = 29589  ·         Code TBS = S39636 Cdt


'bject: Installation Visual Studio       Bonjour,     Pourriez-vous installer Microsoft Visual Studio 2013 sur la VDI « FR09544735W» ?  ·         CA = 29589  ·         Code TBS = S39636 Cdt'

In [103]:
re.finditer(regex, text)

<callable_iterator at 0x1f3e3e02cf8>

In [111]:
m=re.search(regex, 'salut')
bool(m)
m.group(0)

AttributeError: 'NoneType' object has no attribute 'group'

In [91]:
text_parse.lower().replace(r'c?dt', ' ')

'subject: installation visual studio       bonjour,     pourriez-vous installer microsoft visual studio 2013 sur la vdi « fr09544735w» ?  ·         ca = 29589  ·         code tbs = s39636 cdt'

In [None]:
m = re.search('(?<=abc)def', 'abcdef')
m.group(0)

In [80]:
[matches].start(1)

AttributeError: 'list' object has no attribute 'start'

In [69]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re

regex = r"Subject?:.*(:?\w)"

test_str = "*********** *** Note: Please check the attachments tab for complete email including print screens or attachments ***     From: JABIN Germain ItecDccCsc  Sent: Thursday, November 09, 2017 3:17 PM To: Call-Center 35000 Subject: Installation Visual Studio       Bonjour,     Pourriez-vous installer Microsoft Visual Studio 2013 sur la VDI « FR09544735W» ?  ·         CA = 29589  ·         Code TBS = S39636 Cdt  **********************"

matches = re.finditer(regex, test_str, re.MULTILINE)

for matchNum, match in enumerate(matches):
    matchNum = matchNum + 1
    
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1
        
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.


Match 1 was found at 217-407: Subject: Installation Visual Studio       Bonjour,     Pourriez-vous installer Microsoft Visual Studio 2013 sur la VDI « FR09544735W» ?  ·         CA = 29589  ·         Code TBS = S39636 Cdt
Group 1 found at 406-407: t


In [31]:
dfTickets_clean['SUMMARY'][0]

'                   please check the attachments tab for complete email including print screens or attachments        guibert remi coricovfracos  tuesday  may               am  call center        re    p  escalated   acces application contact    high   bonjour   pouvez vous svp proc der aux m j pr alables informatiques ci dessous     si besoin merci de vous rapprocher de marie josee rei pour les pr cisions techniques   merci svp de m informer de l avanc e de ce   p       escalad   la demande est  e  merci  bien cordialement  r mi       remi guibert      coverage and investment banking   americas   asian groups                                 remi guibert sgcib com         hotline manageriale bddfcooqsahlm  tuesday    may             guibert remi coricovfracos  hotline manageriale bddfcooqsahlm  re      feedback    contact application access p        bddf coo   hotline manageriale     assistance manageriale       c    publication interne          bonjour monsieur guibert   ci dessous le

In [92]:
df=dfIncidents['SUMMARY']

In [93]:
df.head()

0    *********** *** Note: Please check the attachm...
1    ----------------------------------------------...
4     *** Note: Please check the attachments tab fo...
Name: SUMMARY, dtype: object

In [96]:
def lowing(x):
    return x.lower()
  

In [97]:
df1=df.apply(lowing)

In [98]:
df1.head()

0    *********** *** note: please check the attachm...
1    ----------------------------------------------...
4     *** note: please check the attachments tab fo...
Name: SUMMARY, dtype: object

In [115]:
def filtering(text):
    # Note Class
    regex_Note_Class = r"b?jec?t?:.*(:?\w)"
    m=re.search(regex_Note_Class, text)
    if bool(m):
        text_parse_Note_Class=m.group(0)
        regex_stop_Note_Class=r'Tel.?:'
        return re.sub(regex_stop_Note_Class,'',text_parse_Note_Class)

In [118]:
df1.apply(filtering)[4]

'bjet: re: p7it-0337755 submitted - fw: incidents pc : appsondemand et acrobat reader bonjour karl, sauf erreur, je suis tjrs en attente d\x92une réponse de votre part. est-ce que vous avez pu avancer sur le sujet? merci d\x92avance pour votre aide sur le sujet.  bien cordialement / best regards, ludivine dubois oper/rvs   from: dubois ludivine operrvs  sent: monday 18 september 2017 11:07 to: querville karl (ext) resggtseususugbi subject: re: p7it-0337755 submitted - fw: incidents pc : appsondemand et acrobat reader bonjour karl, je vous confirme que mon application gaia ne fonctione toujours pas. est-ce que vous pourriez revoir? merci d\x92avance pour votre aide.  bien cordialement / best regards, ludivine dubois oper/rvs   from: querville karl (ext) resggtseususugbi  sent: thursday 11 may 2017 14:23 to: dubois ludivine operrvs subject: re: p7it-0337755 submitted - fw: incidents pc : appsondemand et acrobat reader bonjour mme perrou, j\x92attends un retour de la part des gestionnaire