In [1]:
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas and deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
# Build the set of English words that GetEnglishWords uses as its vocabulary.
# On a fresh environment the NLTK "words" corpus is not installed and the
# lookup raises LookupError, so download it once and retry.
try:
    words = set(nltk.corpus.words.words())
except LookupError:
    nltk.download('words', quiet=True)
    words = set(nltk.corpus.words.words())

In [2]:
# Load the e-mail export from the mounted drive folder.
# The file is decoded as ISO-8859-1 and parsed with pandas' Python engine.
data = pd.read_csv(
    'drive/MyDrive/dataset.csv',
    encoding="ISO-8859-1",
    engine='python',
)

In [3]:
# check the shape of the data: (number of rows, number of columns)
data.shape

(154169, 5)

In [4]:
# preview the first five rows to see what each column holds
data.head()

Unnamed: 0,Subject,TextBody,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,???: My flight Nov 11/17 AA125 DFW-HKG,"Hi,\n\nI haven't received your reply yet, can ...",,,
1,Re: ??: My flight Nov 11/17 AA125 DFW-HKG [...,Pls cancel it for me and I will use it later. ...,,,
2,"RE: Tuesday, 05Feb, 2019: Ticketed itinerary f...",I could not book my hotel because you all did ...,,,
3,"RE: Tuesday, 05Feb, 2019: Ticketed itinerary f...","Hi Renee,\nI ended up booking the hotel on Exp...",,,
4,"RE: Tuesday, 05Feb, 2019: Ticketed itinerary f...",Wow! Thanks for the great customer service an...,,,


In [5]:
# count the missing (NaN) values in each column
data.isna().sum()

Subject        79101
TextBody       79132
Unnamed: 2    154168
Unnamed: 3    154168
Unnamed: 4    154168
dtype: int64

In [6]:
# keep an independent copy of the frame as loaded, before any rows or columns are dropped
data1 = data.copy()

In [7]:
# keep only the two text columns and drop every row where either of them is missing
data = data[['Subject' , "TextBody"]].dropna(axis = 0)

In [8]:
# confirm that no missing values remain in the two kept columns
data.isna().sum()

Subject     0
TextBody    0
dtype: int64

In [9]:
# show the raw body of the first e-mail to see what has to be cleaned
data.iloc[0]['TextBody']

"Hi,\n\nI haven't received your reply yet, can you check and let me know? The case # is 00870466.\n\nThank you!\n\n________________________________\nYihua Huang (Summer), Sales Manager\nDUNAN PRECISION INC.\nCell (USA): +1-214-909-7983\nCell (China): +86-138-2557-4218\nEmail (USA): summer.huang@dunanusa.com\nEmail (China): summer@dunan.cn\nURL (USA): www.dunanusa.com\nURL (China): www.dunan.net\nAddress: 12840 Hillcrest Road #E230, Dallas, TX 75230\n\n\x86\x8f?\x84¯?\x84§§\x8b¬? Summer Huang \x82¯?\x91§\x9b\x86\x8d?<mailto:summer.huang@dunanUSA.com>\n\x86\x8f?\x82?\x81\x91??\x82??\x8b¬? 2018-10-26 09:57\n\x91??\x84¯?\x84§§\x8b¬? travelsupport<mailto:travelsupport@ganttravel.com>\n\x84?¯\x82\x9b?\x8b¬? My flight Nov 11/17 AA125 DFW-HKG\nHi,\n\nCan you check the remaining value of the ticket if I cancel my flight on Nov 11/17 AA125 DFW-HKG (Confirmation: DPYFKF)?\n\nThank you!\n\n________________________________\nYihua Huang (Summer), Sales Manager\nDUNAN PRECISION INC.\nCell (USA): +1-2

In [10]:
# this function tries to extract email from a text
def getEmail(text):
  """
  Extract every e-mail address found in ``text``.

  Parameters
  ----------
  text : str or pandas.DataFrame
      The text to scan. For backward compatibility a DataFrame with a
      ``TextBody`` column is also accepted; in that case only the
      ``TextBody`` value of its first row is scanned.

  Returns
  -------
  list of list of str
      A one-element list wrapping the list of matches, in order of
      appearance. Duplicates are kept; further cleaning is expected to
      remove them.
  """
  pattern = r'[\w\.]+@[\w\.]+\.[\w]+'
  # Scan the argument itself instead of reaching for the global `data`.
  if not isinstance(text, str):
    text = text.iloc[0]['TextBody']
  return [re.findall(pattern, text)]

# test on the body of the first e-mail (a single string)
getEmail(data.iloc[0]['TextBody'])

[['summer.huang@dunanusa.com',
  'summer@dunan.cn',
  'summer.huang@dunanUSA.com',
  'travelsupport@ganttravel.com',
  'summer.huang@dunanusa.com',
  'summer@dunan.cn']]

In [11]:
# Take a small sample of the data to use while developing the cleaning steps.
# df = data.sample(30 , random_state=42)
# .copy() makes the sample an independent frame, so adding a column to it
# later does not act on a slice of `data` (pandas' SettingWithCopyWarning).
df = data.head(30).copy()

In [12]:
# shape of the frame after rows with a missing Subject or TextBody were dropped
data.shape

(74938, 2)

In [13]:
# pattern to match phone numbers: optional leading digits, then three digit
# groups that are each preceded by exactly one whitespace character
phone_pattern = r'\d*\s\d+\s\d+\s\d+'
# pattern to match any email in a text: word characters/dots, "@", then a dotted domain
email_pattern = r'[\w\.]+@[\w\.]+\.[\w]+'
# pattern for matching urls: starts at "http://" / "https://", a "www." style prefix,
# or a bare "domain.tld/", and then consumes the rest of the address (parentheses allowed)
# NOTE(review): the nested quantifiers may backtrack heavily on long inputs - confirm on the full dataset
url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

import string
# the characters listed in string.printable; anything outside this set is dropped
writable = set(string.printable)

# Every helper below takes the text being processed
# and returns the processed text.
def removeNonAsciiChars(text):
  """
  Keep only the characters that appear in string.printable and drop the rest,
  e.g. the stray 0x86 / 0x8d code points left in the e-mail bodies.
  """
  return ''.join(ch for ch in text if ch in writable)



def GetEnglishWords(text):
  """
  Keep the tokens whose lower-cased form is in the NLTK word set `words`,
  plus every token that is not purely alphabetic, and join them with spaces.
  """
  kept_tokens = []
  for token in nltk.wordpunct_tokenize(text):
    if not token.isalpha() or token.lower() in words:
      kept_tokens.append(token)
  return " ".join(kept_tokens)

def remove_keywords(text):
  """Return the text with the e-mail keywords "Subject" and "Re:" removed."""
  # The removals run strictly in this order; a later pass can catch a keyword
  # that only forms after an earlier removal joined two fragments.
  for keyword in ("Subject", "Subject", "Re:", "Subject"):
    text = text.replace(keyword, "")
  return text


# this is the main function that does cleaning
def text_cleaning(text):
    """
    Trim an e-mail body down to the message itself and normalise whitespace.

    Steps, in order:
      1. Replace every "#" with "Number ".
      2. Cut the text at the first e-mail address (quoted replies and
         signatures usually start there).
      3. Cut the text at the first "Sent from", if present.
      4. Remove any remaining e-mail addresses and URLs.
      5. Collapse every run of whitespace into a single space.
      6. Remove the 32-underscore separator line.

    Relies on the module-level `email_pattern` and `url_pattern`.
    Returns the cleaned string.
    """
    # substitute # with Number
    text = text.replace("#", "Number ")

    # slice the text at the point where the first e-mail address starts
    first_email = re.search(email_pattern, text)
    if first_email:
        text = text.partition(first_email.group(0))[0]

    # slice the text at "Sent from"; partition returns the whole text when it is absent
    text = text.partition("Sent from")[0]

    # remove emails from the text
    text = re.sub(email_pattern, "", text)
    # remove any urls found in the text
    text = re.sub(url_pattern, "", text)
    # collapse whitespace runs; raw string so "\s" is a regex class, not an invalid string escape
    text = re.sub(r"\s+", " ", text)
    # remove the underscore separator line
    text = text.replace('________________________________', "")

    return text

# remove phone number
def removePhone(text):
  """Delete every substring of the text that matches the module-level phone_pattern."""
  phone_regex = re.compile(phone_pattern)
  return phone_regex.sub("", text)


# remove a predefined set of punctuation characters
def getRidPunct(text):
  """
  Remove a predefined set of punctuation characters from the text.

  Only the characters listed in `punctuations` are stripped; anything not
  listed there (for example "!" and "=") is kept because it can carry
  meaning in an e-mail. Returns the filtered string.
  """
  # raw string so the backslash is a literal character instead of an invalid "\," escape
  punctuations = r'''()-[]{};:'"\,<>./?@#$%§+^&*_~'''
  # join once instead of growing a string one character at a time
  return "".join(char for char in text if char not in punctuations)

# get the most  important text
def partText(text):
  """
  Cut the text where a signature-like run of words begins.

  The regex looks for whitespace followed by five capitalised words and then
  a token that starts with a digit. When it matches, the result is the text
  before the first match, a space, and the second word of the matched run.
  When nothing matches, the text is returned unchanged.
  """
  signature_regex = r"\s+[A-Z]\w+\s* [A-Z]\w+\s* [A-Z]\w+\s* [A-Z]\w+\s* [A-Z]\w+\s* [\d]\w+\s*"
  hits = re.findall(signature_regex, text)
  if not hits:
    return text
  before_signature = text.partition(hits[0])[0]
  second_word = "".join(hits).split()[1]
  return before_signature + " " + second_word


def correctWords(text):
  """
  Correct the spelling of the English words in the text using TextBlob.

  Returns the corrected text as a plain str, like the other cleaning helpers.
  """
  import textblob
  # TextBlob.correct() returns a TextBlob object; convert it so callers
  # (and the 'cleaned' column) hold ordinary strings.
  return str(textblob.TextBlob(text).correct())


In [14]:
# # 
def mainCaller(text):
  """
  Entry point of the cleaning pipeline.

  Runs the text through every cleaning helper in a fixed order and returns
  whatever the last step (correctWords) returns.
  Use it on a Series with Series.apply(mainCaller), or call it directly on a
  single string: mainCaller(text=some_string).
  """
  # Order matters: each step receives the output of the previous one.
  pipeline = (
      removeNonAsciiChars,
      remove_keywords,
      text_cleaning,
      getRidPunct,
      removePhone,
      partText,
      GetEnglishWords,
      correctWords,
  )
  for step in pipeline:
    text = step(text)
  return text

In [15]:
# run the full cleaning pipeline on the 30-row development sample
# and store the result in a new 'cleaned' column
df['cleaned'] = df['TextBody'].apply(mainCaller)

In [16]:
# print the first five cleaned messages, each followed by blank lines
for cleaned_text in df['cleaned'].head():
  print(cleaned_text, end="\n\n\n\n")

I I haven received your reply yet can you check and let me know The case Number is 00870466 Thank you ! Summer Manager PRECISION



cancel it for me and I will use it later Thanks ! Outlook for Travel Support



I could not book my hotel because you all did not have it Western Premier I stayed there 2 ago your its not listed Please call me to discuss Rank Rank



I I ended up booking the hotel on I stay at this hotel all the selected it on my previous trip in Concur I spent over 30 trying to find it on your system last night before giving up Please escapade the have someone tell me what Key or I should use in the future so as not to have this issue I in All as the area tried all for West Western Used key West Western Premier in the hotel actual address And still nothing came up for Premier on Rank



Now ! Thanks for the great customer service and more importantly now I know its not me going insane ! ! I use this hotel somehow got it on concur for my previous again for the quick reacti

In [None]:
# create a new column to store the cleaned text for the whole dataset
# NOTE(review): this cell has no execution count, so it may never have run to
# completion - confirm before relying on data['cleaned']
data['cleaned'] = data['TextBody'].apply(mainCaller)

In [17]:
# NOTE -- This cell was used for testing only.
# The mainCaller function above now applies these steps automatically.


# df['cleaned'] = ""
# for row in range(df.shape[0]):
#   # text = ""
#   checker =  df.iloc[row]['TextBody']
#   text = checker
#   # # find this words 
#   # if "Thanks" in checker:
#   #   text = checker.partition('Thanks')[0]+" Thanks"

#   # if "From" in checker:
#   #   text = checker.partition('From')[0]
#   # # if "Sincerely" in checker
#   # else:
#   #   text = checker.partition('\n\n\n')[0]

#   text = removeNonAsciiChars(text)
#   text = remove_keywords(text)
#   text = text_cleaning(text)
#   text = getRidPunct(text)
#   text = removePhone(text)
#   text = partText(text)

#   df.loc[row]['cleaned'] = text