### Read the dataset

import field

In [1]:
import pandas as pd
import numpy as np

#### Dataset ("Appel Contactel") redifine

In [2]:
#load the excel sheet
df = pd.read_excel("../data/data.xlsx")

#create new dataframe and select the need columns from the excel
data = pd.DataFrame()
data["ticket-label"] = ""
data[["ticket-content", "ticket-owner"]] = df[["Description", "Traitement - Équipe"]]
data["ticket-label"].fillna("Not_defined", inplace=True)
print(data.head(10))

  ticket-label                                     ticket-content  \
0  Not_defined  Elle n arrive pas à allumer son ordinateur à d...   
1  Not_defined  APPELANT\n    Client : SEFRECO\n    Nom : BEYE...   
2  Not_defined  APPELANT\n     Client : ASSOCIATION MEDICALE I...   
3  Not_defined  Bonjour,\n          \n            Appel concer...   
4  Not_defined  Bonjour,\n    \n   Appel concernant un problèm...   
5  Not_defined  Bonjour,\n      \n   Mr MIGEOT a appelé cet ap...   
6  Not_defined  APPELANT\n    Client : SEFRECO\n        \n    ...   
7  Not_defined  APPELANT\n     Client : LA COMPAGNIE DU SAV \n...   
8  Not_defined  APPELANT\n       Client : SEFRECO\n       Nom ...   
9  Not_defined   \n   \n APPELANT\n   \n Client : SEFRECO\n   ...   

   ticket-owner  
0     1-SUPPORT  
1  2-SUPPORT N2  
2     1-SUPPORT  
3     1-SUPPORT  
4  2-SUPPORT N2  
5  2-SUPPORT N2  
6  2-SUPPORT N2  
7  2-SUPPORT N2  
8     1-SUPPORT  
9  2-SUPPORT N2  


#### Clean the dataset["ticket-content"]

In [3]:
import spacy
import re
import string
import inflect
from collections import Counter
import unicodedata

In [4]:
nlp = spacy.load("fr_core_news_md")
stopwords = nlp.Defaults.stop_words
print(stopwords, len(stopwords))

{'miennes', 'assez', 'avons', 'hi', 'faisaient', 'suivants', "n'", 'différente', 'lesquels', 'voici', 'ha', 'meme', 'précisement', 'hors', 'premièrement', 'laisser', 'soi-même', 'm’', 'façon', 'suffit', 'vais', 'dix-sept', 'delà', 'devers', 'touchant', 'fais', 'notre', 'specifique', 'plusieurs', 'devra', 'juste', 'avais', 'mienne', 'treize', 'parler', 'lorsque', 'font', 'revoici', 'suivre', 'aura', 'spécifique', 'quant-à-soi', 'quinze', 'divers', 'antérieures', 'aupres', 'dont', 'moindres', 'à', 'longtemps', 'seize', 'derrière', 'peut', 'â', 'rend', 'nul', 'quand', 'ça', 'directe', 'avaient', "c'", 'o', 'dejà', 'eux', 'plutôt', 'ceux', 'deuxième', 'chacun', 'bat', 'différents', 'ouverts', 'effet', 'étaient', 'seule', 'sienne', 'toi-même', 'sous', 'tend', 'moi-même', 'personne', 'quatre-vingt', 'cinquième', 'quant', 'peu', 'ont', 'deux', 'tout', 'étant', 'antérieur', 'vas', 'desquelles', 'nouveau', 'entre', 'un', 'aie', 'suit', 'tien', 'sur', 'seraient', 't’', 'onze', 'celle', 'quoi', '

In [125]:
word = "que"
if word in stopwords:
    print(word)


que


##### Text preprocessiong functions

In [133]:
#lowercase function:
def lowercase(text):
    return(text.lower())

#remove number:
def remove_number(text):
    result = re.sub(r"\d+", "", text)
    return result

#convert number to text:
def number_to_text(text):
    p = inflect.engine()
    tempr = text.split()
    new_str = []
    for word in tempr:
        if word.isdigit():
            new_str.append(p.number_to_words(word))
        else:
            new_str.append(word)
    return " ".join(new_str)

#remove punctuation:
def remove_punctuation(text):
    result = text.translate(str.maketrans("","", string.punctuation))
    return result

#remove whitespace:
def remove_whitespace(text):
    return " ".join(text.split())

#remove stopword:
def remove_stopword(text):
    doc = nlp(text)
    new_str = []
    for token in doc:
        if token.text not in stopwords:
            new_str.append(token.text)
    return " ".join(new_str)

#lemmatization
def lemmatization(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        tokens.append(token.lemma_)
    return tokens

#Pos
def part_of_speech(text):
    doc = nlp(text)
    pos = []
    for token in doc:
        pos.append(token.pos_)
    return pos

#NER
def name_entities_reco(text):
    doc = nlp(text)
    ents = [ent.label_ for ent in list(doc.ents)]
    return ents

#get the vocabulary and remove some common values
def get_vocabs(text):
    vocabs = list(set(lemmatization(remove_whitespace(remove_punctuation(text)))))
    commons = {'problème', "support", "équipe",  "appeler", "urgent", "urgente"
               "appel", "technique", "cordialement", "bonjour", "appelant", "heure",
               "client", "message", "ticket", "demande", "cours", "oui", "non", 
               }
    for common in commons:
        if common in vocabs:
            vocabs.remove(common)
    return vocabs

#remove client details:
def remove_client_detail(text):
    result = re.sub(r"client\W:.+", "", text, re.IGNORECASE)
    result = re.sub(r"nom\W.+", "", result, re.IGNORECASE)
    result = re.sub(r"(tel|tél).\W+\d+.+", "", result, re.IGNORECASE)
    result = re.sub(r"contrat.\W+.\d.+", "", result, re.IGNORECASE)
    result = re.sub(r"n.+.ticket\W+.\d.+", "", result, re.IGNORECASE)
    return result
    
#remove accent:
def strip_accent(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != "Mn")

#preprocessing
def text_preprocessing(text):
    text = remove_stopword(remove_client_detail(text.lower()))
    text = remove_whitespace(remove_commons(remove_punctuation(text)))
    text = lemmatization(remove_number(text))
    return " ".join(text)

#remove commons words
def remove_commons(text):
    temp = list(set(lemmatization(text)))
    new_str = []
    commons = {'problème', "support", "équipe",  "appeler", "urgent", "urgente",
               "appel", "technique", "cordialement", "bonjour", "appelant", "heure",
               "client", "message", "ticket", "demande", "cours", "oui", "non", 
               }
    for txt in temp:
        if txt not in commons:
            new_str.append(txt)
    return " ".join(new_str)

In [120]:
#def get the similarity table
def cosinesimilarity(a,b):
    return np.dot(a,b)/np.sqrt(a.dot(a)*b.dot(b))

def find_similarity(reference, texts):
    doc = nlp(reference)
    doc_vector = doc.vector
    doc_similarity = []
    text_docs = [nlp(text) for text in texts]
    text_vectors = [txt.vector for txt in text_docs]
    vec_mean = np.array(text_vectors).mean(axis=0)
    centered = doc_vector - vec_mean
    for idx, text_vec in enumerate(text_vectors):
        doc_similarity.append((idx, 
                               cosinesimilarity(text_vec - vec_mean, 
                                                centered)))
    return doc_similarity

In [134]:
text = text_preprocessing(data["ticket-content"].iloc[1])
texts = [text_preprocessing(txt) for txt in data["ticket-content"].values]
similarity = find_similarity(text, texts)


In [135]:
similarity.sort(key=lambda x:x[1], reverse=True)
print(similarity[:48])

[(1, 1.0), (232, 0.60959584), (230, 0.43202293), (131, 0.39362422), (10, 0.3887379), (194, 0.34976435), (315, 0.32132047), (130, 0.3180608), (450, 0.30532607), (283, 0.2889058), (359, 0.23192109), (233, 0.22994743), (227, 0.22655982), (471, 0.22062579), (32, 0.21623531), (203, 0.21421933), (58, 0.21346518), (306, 0.2128298), (184, 0.21206528), (216, 0.21129969), (257, 0.20647795), (456, 0.19879891), (278, 0.19789284), (314, 0.19627705), (22, 0.19415954), (190, 0.19398019), (316, 0.19355226), (238, 0.18668239), (111, 0.18056732), (443, 0.17792359), (31, 0.1773072), (223, 0.17639096), (341, 0.17261513), (208, 0.17060411), (23, 0.16964887), (358, 0.16795674), (329, 0.16382751), (89, 0.16347761), (90, 0.16115086), (209, 0.1583166), (141, 0.15646741), (77, 0.15602441), (195, 0.15345939), (69, 0.15287603), (338, 0.14733559), (4, 0.14677386), (243, 0.14541799), (274, 0.14446346)]


In [136]:
print(texts[1])
print(texts[96])

forme connexion plat
dysfonctionne vpn Monsieur connexion


In [137]:
text_232 = data["ticket-content"].iloc[96]
print(text_232)

Bonjour.
             
             Nom : khairallah Roy Tél. : 06.73.53.74.51
    Contrat :     24h/24 - 7j/7  N° ticket : 14361682
     Message     : Ticket en cours: Non Demande
    urgente: Non      Monsieur appelle car il a un problème
    de connexion avec sa     message Son VPN dysfonctionne    
         Cordialement,L'équipe support..


In [218]:
text = data["ticket-content"].iloc[1]
vocabs = get_vocabs(remove_number(remove_whitespace(remove_punctuation(remove_stopword(remove_client_detail(text.lower()))))))
print(vocabs)

['plat', 'forme', 'connexion']


In [126]:
texts = data["ticket-content"].values
vocabs = []
for text in texts:
    vocabs.extend(get_vocabs(text_preprocessing(text)))
print(Counter(vocabs).most_common())

[('transmettre', 391), ('gestioncontactelfr', 390), ('descriptif', 390), ('gestion', 390), ('origin', 389), ('body', 387), ('cababcabcdgroupehisifr', 386), ('e-mail', 384), ('tentative', 278), ('cest', 248), ('site', 121), ('présent', 119), ('base', 111), ('tentativ', 108), ('donner', 101), ('souhaite', 79), ('mail', 52), ('connecter', 47), ('efi', 44), ('arrive', 42), ('n', 42), ('Monsieur', 42), ('electricfil', 41), ('rappeler', 38), ('e', 38), ('d', 36), ('j', 36), ('passer', 36), ('bloquer', 36), ('compte', 34), ('connexion', 33), ('mot', 33), ('serveur', 32), ('joindre', 32), ('impossible', 31), ('accès', 31), ('ordinateur', 28), ('contrat', 28), ('fonctionne', 27), ('besoin', 27), ('advisory', 27), ('dc', 27), ('sidaction', 27), ('logiciel', 25), ('être', 24), ('faire', 23), ('pc', 22), ('boite', 22), ('sefreco', 22), ('sujet', 21), ('envoyer', 20), ('recevoir', 20), ('cerba', 20), ('donnée', 20), ('fenwick', 20), ('cabinet', 18), ('vpn', 18), ('mettre', 18), ('suite', 18), ('ema

In [221]:
print(len(vocabs))

1206


In [234]:
commons = {'problème', "support", "équipe",  "appeler", "urgent",
               "appel", "technique", "cordialement", "bonjour"}
for word_common in list(commons):
    ms = nlp.vocab.vectors.most_similar(np.asarray([nlp.vocab.vectors[nlp.vocab.strings[word_common]]]), n=3)
    words = [nlp.vocab.strings[w] for w in ms[0][0]]
    print(words)

['karin', 'onjour', 'Alaykoum']
['Kappeler', 'sou\xadve\xadnir', 'Rulli']
['èquipe', 'URAP', 'Deogratias']
['capage', 'Socles', 'dosable']
['urge', 'urgentissime', 'injonctif']
['Lappel', 'l"\'Appel', 'dappels']
['prblème', 'sous-problèmes', 'Probléme']
['technique|', 'nanotechnologique', 'Psychotechnique']
['.Cordialement', 'particuli?rement', 'talentueusement']
