In [27]:
import pandas as pd
df = pd.read_excel(r"C:\Users\GIS\Desktop\wps_download\combined_emails_with_natural_pii_.xlsx") 
emails = df['email'].tolist()
labels = df['type'].tolist()

In [2]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def preprocess_email(email):
    email = re.sub(r'\W', ' ', email)  
    email = email.lower()               
    email = ' '.join(word for word in email.split() if word not in stopwords.words('english')) 
    return email


df['cleaned_email'] = df['email'].apply(preprocess_email)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GIS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def mask_pii(text):
    
    patterns = {
        r'[A-Z][a-z]+\s[A-Z][a-z]+': '[full_name]',   
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}': '[email]',  
        r'\+?\d{1,3}[-. ]?\d{1,4}[-. ]?\d{3}[-. ]?\d{4}': '[phone_number]',  
        r'(\d{4}\s?\d{4}\s?\d{4})':'[aadhar_num]',
        r'\d{2}/\d{2}/\d{4}': '[dob]',  
        r'\d{4}-\d{4}-\d{4}-\d{4}': '[credit_debit_no]',  
        r'\b\d{3}\b': '[cvv_no]',  
        r'\d{2}/\d{2}': '[expiry_no]' 
    }
    
    for pattern, mask in patterns.items():
        text = re.sub(pattern, mask, text)  
        
    return text


df['masked_email'] = df['cleaned_email'].apply(mask_pii)

In [4]:
df.head()

Unnamed: 0,email,type,cleaned_email,masked_email
0,Subject: Unvorhergesehener Absturz der Datenan...,Incident,subject unvorhergesehener absturz der datenana...,subject unvorhergesehener absturz der datenana...
1,Subject: Customer Support Inquiry\n\nSeeking i...,Request,subject customer support inquiry seeking infor...,subject customer support inquiry seeking infor...
2,Subject: Data Analytics for Investment\n\nI am...,Request,subject data analytics investment contacting r...,subject data analytics investment contacting r...
3,Subject: Krankenhaus-Dienstleistung-Problem\n\...,Incident,subject krankenhaus dienstleistung problem ein...,subject krankenhaus dienstleistung problem ein...
4,"Subject: Security\n\nDear Customer Support, I ...",Request,subject security dear customer support reachin...,subject security dear customer support reachin...


In [11]:
def process_email(email_content):
    masked_content = mask_pii(email_content)
    classification = model_ngram.predict([masked_content])[0]
    return {
        "masked_email": masked_content,
        "classification": classification
    }


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


X_train, X_test, y_train, y_test = train_test_split(df['masked_email'], labels, test_size=0.2)


model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score

predictions = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.7425


In [7]:
accuracy_score(y_train,model.predict(X_train))

0.818125

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
model_ngram = make_pipeline(CountVectorizer(ngram_range=(1, 2)), MultinomialNB())
model_ngram.fit(X_train, y_train)

# Make predictions
y_pred_ngram = model_ngram.predict(X_test)

# Evaluate
accuracy_ngram = accuracy_score(y_test, y_pred_ngram)
print(f"Accuracy of CountVectorizer (with n-grams) + MultinomialNB model: {accuracy_ngram:.4f}")

print("\nClassification Report (N-grams):")
print(classification_report(y_test, y_pred_ngram))

Accuracy of CountVectorizer (with n-grams) + MultinomialNB model: 0.7465

Classification Report (N-grams):
              precision    recall  f1-score   support

      Change       0.98      0.58      0.73       510
    Incident       0.66      0.96      0.78      1891
     Problem       0.69      0.20      0.31      1016
     Request       0.88      0.92      0.90      1383

    accuracy                           0.75      4800
   macro avg       0.80      0.67      0.68      4800
weighted avg       0.76      0.75      0.71      4800



In [9]:
best_model=model_ngram

In [12]:
process_email("Subject: Hilfe bei Verbindungsproblemen mit dem Drucker benÃ¶tigtKunde <name> berichtet, dass der Epson EcoTank ET-4760 sich nicht mit dem Netzwerk verbinden lÃ¤sst You can reach me at liuwei@business.cn.. Bitte um Anleitung fÃ¼r drahtlose Setup-Diagnosen, um das Problem schnell zu lÃ¶sen. Bitte helfen Sie bald. My name is David Kim.")

{'masked_email': 'Subject: Hilfe bei Verbindungsproblemen mit dem Drucker benÃ¶tigtKunde <name> berichtet, dass der [full_name]Tank ET-4760 sich nicht mit dem Netzwerk verbinden lÃ¤sst You can reach me at [email].. Bitte um Anleitung fÃ¼r drahtlose Setup-Diagnosen, um das Problem schnell zu lÃ¶sen. Bitte helfen Sie bald. My name is [full_name].',
 'classification': 'Incident'}

In [13]:
process_email("Subject: Urgente: Resolver el problema de latencia del Servicio de GestiÃ³n de AWEstimado cliente de Servicios de TI,Estamos experimentando un incidente grave con el Servicio de GestiÃ³n de AWS, especÃ­ficamente relacionado con la escalabilidad de nuestra infraestructura, lo cual estÃ¡ causando una alta latencia en nuestras aplicaciones desplegadas. Esto impacta severamente el rendimiento y requiere atenciÃ³n y resoluciÃ³n inmediata My name is Carlos Mendoza.. Por favor, prioricen este problema y proporcionen una actualizaciÃ³n sobre el progreso de la resoluciÃ³n lo antes posible.Gracias por su rÃ¡pida respuesta.Atentamente,<name><tel_num<acc_num> You can reach me at janesmith@company.com.")

{'masked_email': 'Subject: Urgente: Resolver el problema de latencia del Servicio de GestiÃ³n de AWEstimado cliente de Servicios de TI,Estamos experimentando un incidente grave con el Servicio de GestiÃ³n de AWS, especÃ\xadficamente relacionado con la escalabilidad de nuestra infraestructura, lo cual estÃ¡ causando una alta latencia en nuestras aplicaciones desplegadas. Esto impacta severamente el rendimiento y requiere atenciÃ³n y resoluciÃ³n inmediata My name is [full_name].. Por favor, prioricen este problema y proporcionen una actualizaciÃ³n sobre el progreso de la resoluciÃ³n lo antes posible.Gracias por su rÃ¡pida respuesta.Atentamente,<name><tel_num<acc_num> You can reach me at [email].',
 'classification': 'Incident'}

In [14]:
process_email("Subject: Request for Assistance with Data IntegrationWe are encountering data integration problems while optimizing investments using analytics tools. These issues might be due to unexpected software compatibility problems between various applications. Despite updating the software and checking for configuration errors, the problem continues My name is Sophia Rossi.. We would greatly appreciate your guidance in resolving this matter to ensure a smooth data flow and accurate analysis You can reach me at omar.hassan@secure.net.. Please inform us of the next steps to address this issue.")

{'masked_email': 'Subject: Request for Assistance with [full_name]We are encountering data integration problems while optimizing investments using analytics tools. These issues might be due to unexpected software compatibility problems between various applications. Despite updating the software and checking for configuration errors, the problem continues My name is [full_name].. We would greatly appreciate your guidance in resolving this matter to ensure a smooth data flow and accurate analysis You can reach me at [email].. Please inform us of the next steps to address this issue.',
 'classification': 'Incident'}

In [16]:
predictions = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, predictions))

Accuracy:  0.7425


In [15]:
y_pred_ngram = model_ngram.predict(X_test)
print('accuracy:',accuracy_score(y_test,y_pred_ngram))

accuracy: 0.7464583333333333


In [17]:
best_model=model_ngram

In [18]:
def process_email(email_content):
    masked_content = mask_pii(email_content)
    classification = model_ngram.predict([masked_content])[0]
    return {
        "masked_email": masked_content,
        "classification": classification
    }



In [19]:
process_email("Subject: Notwendige Technische UnterstÃ¼tzungSehr geehrte Kundensupport, wir begegnen momentan unregelmÃ¤ÃŸigen Ladezeiten bei unserer Projektmanagement-Plattform. MÃ¶glicherweise ist dies auf erhÃ¶hte Nutzungsdaten zurÃ¼ckzufÃ¼hren My name is Elena Ivanova.. Wir haben bereits den Cache geleert, die Serverleistung Ã¼berprÃ¼ft und die Anwendungsdiener neu gestartet. Trotz unserer Anstrengungen besteht das Problem weiterhin. Wir schÃ¤tzen Ihre UnterstÃ¼tzung, um bald eine LÃ¶sung anzubieten You can reach me at sophia.rossi@service.it.. Bitte geben Sie uns, falls nÃ¶tig, weitere Informationen zur VerfÃ¼gung, um das Problem zu lÃ¶sen.")

{'masked_email': 'Subject: [full_name] UnterstÃ¼tzungSehr geehrte Kundensupport, wir begegnen momentan unregelmÃ¤ÃŸigen Ladezeiten bei unserer Projektmanagement-Plattform. MÃ¶glicherweise ist dies auf erhÃ¶hte Nutzungsdaten zurÃ¼ckzufÃ¼hren My name is [full_name].. Wir haben bereits den Cache geleert, die Serverleistung Ã¼berprÃ¼ft und die Anwendungsdiener neu gestartet. Trotz unserer Anstrengungen besteht das Problem weiterhin. Wir schÃ¤tzen [full_name]Ã¼tzung, um bald eine LÃ¶sung anzubieten You can reach me at [email].. Bitte geben Sie uns, falls nÃ¶tig, weitere Informationen zur VerfÃ¼gung, um das Problem zu lÃ¶sen.',
 'classification': 'Problem'}

In [20]:
process_email("Subject: Incident of Unauthorised Access to Hospital SystemThere was an unauthorized access attempt on the hospital system, which could have exposed patient information due to potential vulnerabilities in the outdated security protocols and software You can reach me at sophia.rossi@service.it.. Measures such as immediate system scans and password changes have been put in place to address the issue. The matter is being investigated, and appropriate steps are being taken to prevent future occurrences My name is Liu Wei.. Ensuring the security of patient data is our top priority, and we will provide updates as more information becomes available.")

{'masked_email': 'Subject: Incident of [full_name] to [full_name]There was an unauthorized access attempt on the hospital system, which could have exposed patient information due to potential vulnerabilities in the outdated security protocols and software You can reach me at [email].. Measures such as immediate system scans and password changes have been put in place to address the issue. The matter is being investigated, and appropriate steps are being taken to prevent future occurrences My name is [full_name].. Ensuring the security of patient data is our top priority, and we will provide updates as more information becomes available.',
 'classification': 'Incident'}

In [21]:
process_email("Subject: Concern Regarding BillingI am compiling a report on a billing discrepancy involving various tech products. The issue might have arisen due to recent subscription renewals You can reach me at fatima.farsi@help.com.. Despite my attempts to verify the transactions and contacting support, the matter remains unresolved. I would greatly appreciate if you could look into this and provide a resolution as soon as possible My name is Jane Smith.. Please let me know if there is any additional information you need to facilitate the process."
)

{'masked_email': 'Subject: [full_name] BillingI am compiling a report on a billing discrepancy involving various tech products. The issue might have arisen due to recent subscription renewals You can reach me at [email].. Despite my attempts to verify the transactions and contacting support, the matter remains unresolved. I would greatly appreciate if you could look into this and provide a resolution as soon as possible My name is [full_name].. Please let me know if there is any additional information you need to facilitate the process.',
 'classification': 'Incident'}

In [22]:
process_email("Subject: Problem mit der HP DeskJet 3755 WLAN-VerbindungSehr geehrter Tech Online Store Support, mein HP DeskJet 3755 verbindet sich nicht drahtlos mit meinem Laptop. KÃ¶nnten Sie mir bitte helfen, dieses Problem zu lÃ¶sen? Vielen Dank You can reach me at maria.gonzalez@shop.es.. Mit freundlichen GrÃ¼ÃŸen, <name>. My name is Sophia Rossi."
)

{'masked_email': 'Subject: Problem mit der HP DeskJet 3755 WLAN-VerbindungSehr geehrter [full_name] [full_name], mein HP DeskJet 3755 verbindet sich nicht drahtlos mit meinem Laptop. KÃ¶nnten Sie mir bitte helfen, dieses Problem zu lÃ¶sen? [full_name] You can reach me at [email].. Mit freundlichen GrÃ¼ÃŸen, <name>. My name is [full_name].',
 'classification': 'Problem'}

In [23]:
process_email("Subject: Immediate Assistance Needed: Troubled Cisco RouterDear Customer Support TeamOur Cisco Router ISR4331 is encountering a sudden malfunction, resulting in significant network disruptions throughout our system. This matter is urgent and requires your prompt technical support My name is Sophia Rossi.. We believe that updating the routerâ€™s firmware could bring it back to its usual operating condition. Please accelerate the troubleshooting process to fix these disruptions and reinstate our services at your earliest opportunity You can reach me at johndoe@email.com.. Continuous, uninterrupted service is crucial for us, and we greatly value your quick help.Best regards,<name<tel_num>")

{'masked_email': 'Subject: [full_name] Needed: [full_name] Router[full_name] [full_name][full_name] Router ISR4331 is encountering a sudden malfunction, resulting in significant network disruptions throughout our system. This matter is urgent and requires your prompt technical support My name is [full_name].. We believe that updating the routerâ€™s firmware could bring it back to its usual operating condition. Please accelerate the troubleshooting process to fix these disruptions and reinstate our services at your earliest opportunity You can reach me at [email].. Continuous, uninterrupted service is crucial for us, and we greatly value your quick help.Best regards,<name<tel_num>',
 'classification': 'Incident'}

In [24]:
process_email("Subject: To Whom It May Concern, I am contacting you to report a possible data breach, which may be related to outdated software You can reach me at david.kim@corp.kr.. Despite my attempts to update various programs, including Snagit, Ansible, and Firebase, the problem continues. I am worried about the security of my data and would appreciate your help in addressing this issue promptly. Could you advise on the next steps or suggest a solution to prevent future breaches? I value your swift response to this matter and eagerly await your guidance. Please inform me if you require any further information to assist me. I can be reached at your convenience to discuss this further. Thank you for your support in resolving this issue. Please provide any necessary reference number <ref_num> for my records, and I look forward to your prompt follow-up. My name is Fatima Al-Farsi.")

{'masked_email': 'Subject: [full_name] [full_name] Concern, I am contacting you to report a possible data breach, which may be related to outdated software You can reach me at [email].. Despite my attempts to update various programs, including Snagit, Ansible, and Firebase, the problem continues. I am worried about the security of my data and would appreciate your help in addressing this issue promptly. Could you advise on the next steps or suggest a solution to prevent future breaches? I value your swift response to this matter and eagerly await your guidance. Please inform me if you require any further information to assist me. I can be reached at your convenience to discuss this further. Thank you for your support in resolving this issue. Please provide any necessary reference number <ref_num> for my records, and I look forward to your prompt follow-up. My name is [full_name]-Farsi.',
 'classification': 'Incident'}

In [25]:
process_email("Subject: La aplicaciÃ³n se vuelve no receptiva mientras se exporta.. My contact number is +82-2-3456-7890.")

{'masked_email': 'Subject: La aplicaciÃ³n se vuelve no receptiva mientras se exporta.. My contact number is +82-[phone_number].',
 'classification': 'Incident'}

In [26]:
process_email("Subject: Rechnungsproblem bei der VerlÃ¤ngerung von Google WorkspaceSehr geehrter Kundenservice,ich schreibe, um eine Rechnungsproblematik bezÃ¼glich meines Google Workspace Business Standard Abonnements anzufechten. Die VerlÃ¤ngerungsgebÃ¼hren stimmen nicht mit dem vereinbarten Servicezeitraum Ã¼berein You can reach me at elena.ivanova@support.org.. Bitte Ã¼berprÃ¼fen Sie mein Konto (<acc_num>) und erklÃ¤ren Sie mir die fehlerhaften GebÃ¼hren fÃ¼r die kÃ¼rzliche VerlÃ¤ngerung. Ihre zeitnahe Aufmerksamkeit fÃ¼r dieses Anliegen wÃ¤re sehr geschÃ¤tzt.Vielen Dank.Mit freundlichen GrÃ¼ÃŸen<name><tel_num> My name is Sophia Rossi.")

{'masked_email': 'Subject: Rechnungsproblem bei der VerlÃ¤ngerung von [full_name]Sehr geehrter Kundenservice,ich schreibe, um eine Rechnungsproblematik bezÃ¼glich meines [full_name] [full_name] Abonnements anzufechten. [full_name]Ã¤ngerungsgebÃ¼hren stimmen nicht mit dem vereinbarten Servicezeitraum Ã¼berein You can reach me at [email].. Bitte Ã¼berprÃ¼fen Sie mein Konto (<acc_num>) und erklÃ¤ren Sie mir die fehlerhaften GebÃ¼hren fÃ¼r die kÃ¼rzliche VerlÃ¤ngerung. Ihre zeitnahe Aufmerksamkeit fÃ¼r dieses Anliegen wÃ¤re sehr geschÃ¤tzt.[full_name].Mit freundlichen GrÃ¼ÃŸen<name><tel_num> My name is [full_name].',
 'classification': 'Problem'}

In [28]:
process_email("Subject: Seeking detailed information on the features of the SaaS project management platform, particularly task management, team collaboration, and reporting capabilities My name is Liu Wei.. Also, interested in knowing the integration options with various tools and services You can reach me at fatima.farsi@help.com.. This information will assist in determining if the platform is a good fit for our needs.")

{'masked_email': 'Subject: Seeking detailed information on the features of the SaaS project management platform, particularly task management, team collaboration, and reporting capabilities My name is [full_name].. Also, interested in knowing the integration options with various tools and services You can reach me at [email].. This information will assist in determining if the platform is a good fit for our needs.',
 'classification': 'Request'}