# Loading required packages

In [1]:
import pandas as pd
import re

import matplotlib.pyplot as plt
%matplotlib inline

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize

#from langdetect import detect

# Cleaning routines

In [2]:
# removing a regex-formatted string from data text
def removeString(data, regex):
    return data.str.lower().str.replace(regex.lower(), ' ')

# DataFrame version of removeString applied on columns columnsToClean of the dataset
#: !!!! does'nt function for the current Python version!!!!!!!!!
def cleanDataset(dataset, columnsToClean, regexList):
    for column in columnsToClean:
        for regex in regexList:
            dataset[column] = removeString(dataset[column], regex)
    return dataset

##### function-like substitute for Type Alias Python Problem!!!
def cleanText(text, regexList):
    text=text.lower()
    for regex in regexList:
        text=re.sub(regex,'', text)
    return text

# Substituting cleanDataset function
def cleanDataset2(dataset, columnsToClean, regexList):
    for column in columnsToClean:
        dataset[column] = dataset[column].apply(lambda x: cleanText(x, regexList)) 
    return dataset


# Regex patterns for special strings to remove: phone number, email, http links
def get_spec_RegexList():
    regexList = []
    regexList += ['https?:[^\]\n\r]+']  # https & http
    regexList += ['[\w\d\-\_\.]+@[\w\d\-\_\.]+']  # emails
    regexList += ['[0-9][\-0–90-9 ]+']  # phones
    regexList += ['^(?:(?:\+|00)33[\s.-]{0,3}(?:\(0\)[\s.-]{0,3})?|0)[1-9](?:(?:[\s.-]?\d{2}){4}|\d{2}(?:[\s.-]?\d{3}){2})$'] # french phones
    regexList += ['[0-9]']  # numbers
    
    
    
    #regexList += ['[^a-zA-z 0-9]+']  # anything that is not a letter
    # regexList += ['[\r\n]']  # \r\n
    # regexList += [' [a-zA-Z] ']  # single letters
    # regexList += [' [a-zA-Z][a-zA-Z] ']  # two-letter words
    #regexList += ["  "]  # double spaces

    #regexList += ['^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$']
    #regexList += ['[\w\d\-\_\.]+ @ [\w\d\-\_\.]+']
    #regexList += ['[^a-zA-Z]']
    return regexList

# Loading Raw Data

In [3]:
#### Load dataset from non duplicated file
#filepath="PGS-IN.csv"
dfTickets=pd.read_csv(filepath,sep=";",encoding="iso-8859-1")
print('DataFrame shape:',dfTickets.shape)
#####################

NameError: name 'filepath' is not defined

# Processing of 'Incident description' formats

In [None]:
####################
#  ############### Dictionary of text Formats ###################

# Email_like format
#regex_Email_like = r"\* note: please check the attachments tab for complete "
regex_Email_like =r'(objet :|subject:)'
# Impulse_like format
regex_Impulse_like =r"ticket description =*"
# Incident_Like format
regex_Incident_like =r"descriptif de l\'incident :"
# Detail_Incident_Like_Format
regex_Detail_Incident_Like=r"détail de l\'incident ou de la demande:"
# Description_Like format
#regex_Description_like =r"descript(.*?):(.*?(:|$))"
#regex_Description_like =r"sdescript(.*?):(.*?(:|$))"
# Save to format dictionary
#Dict_Format={'Email_like':regex_Email_like,'Impulse_like':regex_Impulse_like,'Incident_like':regex_Incident_like,'Description_like':regex_Description_like}
Dict_Format={'Email_like':regex_Email_like,'Impulse_like':regex_Impulse_like,'Incident_like':regex_Incident_like,'Detail_Incident_Like':regex_Detail_Incident_Like}



# Array of different text format regex
Array_Format=list(Dict_Format.values())


In [None]:
# calculate format (recorded in Dict_Format) for each text field
def format_type(text,Dict_Format):
    text_format='free' # return free format by default
    for (k,v) in Dict_Format.items():
        if bool(re.search(v, text)):
            text_format=k
            break
    return text_format

In [None]:
#### Adding new field for text format
dfTickets_format=dfTickets.copy()
dfTickets_format['Incident_Description_format']=dfTickets_format['IN - Incident Description'].str.lower().apply(lambda x: format_type(x,Dict_Format))

In [None]:
dfTickets_format['IN - Incident Description'].head()

In [None]:
dfTickets_format[['Incident_Description_format','IN - Incident Description']]

In [None]:
##### Histogram of processed formats  ###################
format_count=dfTickets_format.groupby('Incident_Description_format')['ID Incident'].count()
print(format_count)
format_count.plot(kind='bar',figsize=(6,4))

# Parsing of 'Incident description'  field

In [None]:
######### Dictionary of main key regex (significative field for each format) for parsing############

Dict_regex_main_parse={'Email_like':r'(objet :|subject:)','Impulse_like':r'ticket description =*','Incident_like':r'descriptif de l\'incident :','Detail_Incident_Like':r'détail de l\'incident ou de la demande:'}


######### Dictionary of Format Parsing ############
regex_main_parse={'Email_like':r'(objet :|subject:)','Impulse_like':r'ticket description =*','Incident_like':r'descriptif de l\'incident :'}
# pat for Incident_like format
pat_Incident=r'(descriptif de l\'incident :|actions réalisées:|depuis quand :|message d\'erreur|information requises :|nom du matérie l:|localisation :|téléphone \(complet\) :|os :)'
# pat for Impulse_like format
pat_Impulse=r'(ticket description =*|further information from impulse=* end user:|caller:|contact type:|impact:|urgency:|impulse owning group:|impulse category level 1:|impulse category level 2:|impulse category level 3:| created by:|created_at :|impacted_environment :|end user\'s location :)'
# pat for E-mail_like format
pat_Email=r'(objet :|subject:|note:|de :|from:|envoyé :|sent:|à :|to:|cc :|phone: &#43;|fax : &#43;|email:|message d\'origine-----|tél.       &#43;|e-mail :)'
# pat for Detail_Incident_Like format
pat_Detail_Incident=r'(détail de l\'incident ou de la demande:|ma du poste :|manipulations déjà effectuées:)'

Dict_format_parsing={'Email_like':pat_Email,'Impulse_like':pat_Impulse,'Incident_like':pat_Incident,'Detail_Incident_Like':pat_Detail_Incident}

In [None]:
# Extract the main key to be sporsed
def extract_key(regex,dict_parse):
        return re.search(regex, "".join(list(dict_parse.keys()))).group(0)

In [None]:
# Finally parse the portion of text (corresponding to the extracted main key)
def parse_text(text,Dict_Format,Dict_format_parsing,Dict_regex_main_parse):
#    print(text)
    format_text=format_type(text,Dict_Format)
    if format_text!='free':
        Dict_parse=dict(zip(*[iter(re.split(Dict_format_parsing[format_text], text, re.MULTILINE)[1:])]*2))
        return Dict_parse[extract_key(Dict_regex_main_parse[format_text], Dict_parse)]
    else:
        return text # do nothing for free-formatted text

In [None]:
i=1000

In [None]:
i=i+1
text=dfTickets_format['IN - Incident Description'][i].lower()
text

In [None]:
parse_text(text,Dict_Format,Dict_format_parsing,Dict_regex_main_parse)

In [None]:
Dict_format_parsing['Detail_Incident_Like']
#dict(zip(*[iter(re.split(Dict_format_parsing['Detail_Incident_Like'], text, re.MULTILINE)[1:])]*2))

In [None]:
Dict_parse=dict(zip(*[iter(re.split(Dict_format_parsing[format_text], text, re.MULTILINE)[1:])]*2))
Dict_parse

In [None]:
# Adding new field 'Parsed_Incident_Description'
dfTickets_parse=dfTickets_format.copy()
dfTickets_parse['Parsed_Incident_Description']=dfTickets_parse['IN - Incident Description'].str.lower().apply(lambda x: parse_text(x,Dict_Format,Dict_format_parsing,Dict_regex_main_parse) )

In [None]:
dfTickets_parse[['IN - Title','IN - Incident Description','Parsed_Incident_Description','Incident_Description_format']].sample(30)

# First round cleaning of text fields

In [None]:
############## First Step of Cleaning ##############
# Cleaning text fields of DF: dfTickets_parse, the fields are mentioned in columnsToClean list  
columnsToClean=['Parsed_Incident_Description','IN - Title']
dfTickets_parse_clean_1=dfTickets_parse.copy()
cleanDataset2(dfTickets_parse_clean_1, columnsToClean,  get_spec_RegexList())
# Display cleaned fields
dfTickets_parse_clean_1[['IN - Title','IN - Incident Description','Parsed_Incident_Description','Incident_Description_format']].sample(30)


In [None]:
dfTickets_parse_clean_1['Parsed_Incident_Description'][5221]

# Detecting language of Parsed Incident Description field

In [None]:
# function for detecting english or french text based on frequency of common stop words for each language. May be not efficient for short texts!!

def detect_english_french(text):
    stopwords_fileids_custom=['english','french']
    ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
#    for lang in stopwords.fileids():
    for lang in stopwords_fileids_custom:
        stopwords_set = set(stopwords.words(lang))
        words_set = set(words)
        common_words = words_set.intersection(stopwords_set)
        ratios[lang] = len(common_words)
    
    if ratios['english']>ratios['french']:
        lang_text='english'
    else:
        lang_text='french'       

    return lang_text

# driver to mix detect function from langdetect package and  detect_english_french function
def detect_lang(text):
    print('--------\n')
    print(text)
    
    if len(text)>0 and len(text)<6:
        result=detect(text)
    else:
        result=detect_english_french(text)
    return result

In [None]:
# Adding new field 'Detected_language' to cleaned DataFrame
#dfTickets_parse_clean_1['Detected_language']=dfTickets_parse_clean_1['Parsed_Incident_Description'].apply(detect_lang)
dfTickets_parse_clean_1['Detected_language']=dfTickets_parse_clean_1['Parsed_Incident_Description'].apply(detect_english_french)

In [None]:
# Display new field
dfTickets_parse_clean_1[['IN - Title','Parsed_Incident_Description','Detected_language']].sample(30)

In [None]:
##### Histogram of detected languages among english or french  ###################
format_count=dfTickets_parse_clean_1.groupby('Detected_language')['ID Incident'].count()
print(format_count)
format_count.plot(kind='bar',figsize=(6,4))

In [None]:
pwd

In [None]:
dfTickets_parse_clean_1['Parsed_Incident_Description'][8673]

In [None]:
text='bonjour les amis'

In [None]:
detect_english_french(text)

In [None]:
detect(text)