<a href="https://colab.research.google.com/github/TyBuie/DatasetCustomerSupportTweets/blob/main/Data_Cleaning_For_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Known Issues
### ---------------------------------------------------------------------------------------------------------------------------------------------------------
### 1. The translator API allows only 50 requests per day, so only 50 sentences can be translated. A paid version removes this limit.
### 2. emoji.demojize takes a long time to run on the full dataset; I could only run it on 1 million records.
### ----------------------------------------------------------------------------------------------------------------------------------------------------------

### Create Anaconda environment and install the below packages using conda/pip

In [None]:
#pip install emoji
#pip install ftfy
#pip install nltk
#pip install spacy

In [None]:
import os

# Python utilities
!import ftfy
import pandas as pd
import numpy as np
import re
import parser
import time
from datetime import datetime
import pickle
!import emoji

### Text processing

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from google_trans_new import google_translator
from nltk.stem import WordNetLemmatizer

In [None]:
### Handy Functions

### Ensure that the text encoding is utf
def encode_data(text):
    """Return ``text`` after passing it through ``ftfy.fix_encoding``.

    Args:
        text: The string whose encoding problems should be repaired.

    Returns:
        The value returned by ``ftfy.fix_encoding(text)``.
    """
    # Imported here because the notebook's import cell runs ``!import ftfy``,
    # which is a shell command and never binds ``ftfy`` in the Python
    # namespace; without this the function fails with NameError.
    import ftfy

    return ftfy.fix_encoding(text)
### Convert text to lowercase
def to_lower(text):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered
### Add items to stopwords
def add_to_stopwords(set_stopwords):
    """Flag every entry of ``set_stopwords`` as a stop word on ``nlp.vocab``.

    Args:
        set_stopwords: Iterable of words to mark.
    """
    for word in set_stopwords:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

# Translator
def translate(text):
    """Translate ``text`` with the shared ``translator`` unless it is English.

    Args:
        text: The string to inspect and, if needed, translate.

    Returns:
        ``text`` unchanged when the detected language code is ``'en'``,
        otherwise the result of ``translator.translate(text)``.
    """
    time.sleep(2)  # wait two seconds before every detection request
    words = text.split()
    # NOTE(review): the detector receives the list of words, not the raw
    # string -- confirm this is what the translator client expects.
    detected = translator.detect(words)
    if detected[0] == 'en':
        return text
    return translator.translate(text)

### Variables & Data structures

In [None]:
# Directory that is made the working directory below.
notebook_location = "C:\\Users\\yamin\\Desktop\\Fiverr\\" ## Add the location of the ipynb file
# Input file name; it starts with a backslash because it is appended
# directly to the working directory string when building ``path``.
filename = "\\customer_support.csv" # provide input file name here, I used a subset of data
os.chdir(notebook_location) #changes your current working directory to where your notebook is
# Full path of the input CSV: current working directory + file name.
path = os.getcwd() + filename

In [None]:
# Translator client; translate() calls its detect() and translate() methods.
translator = google_translator()
# Loaded en_core_web_sm pipeline; add_to_stopwords() edits its vocab.
nlp = en_core_web_sm.load()
# "_" followed by the date part of the current timestamp.
date_ = "_" + str(datetime.now()).split()[0]

In [None]:
### Used for contractions expansion , Abbreviation expansion ---- You can add your own

contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", 
    "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
    "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
    "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have", "i've": "i have", "isn't": "is not",
 "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
 "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
 "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
 "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
 "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have",
 "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would",
 "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", 
 "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have",
 "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
 "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will",
 "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
 "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
 "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
 "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
 "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
"you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have","i'm":"i am","i m\s":"i am ","im":"i am","i've":"i have","i'd":"i would","i'll":"i will","can't":"cannot","couldn't":"could not",
"that'll":"that will","how's":"how is","hasn't":"has not","haven't":"have not","e-mail":"email","e mail":"email","dm":"direct message","dms":"direct message"}

### The regex cleaner class holds the patterns for everything that needs to be removed from the text, together with the functions that are run to clean the text. You can add or remove patterns and functions as needed.

In [None]:
class TextRegexCleaner:
    """Clean free text by expanding contractions and applying regex filters.

    ``process`` is the entry point. It expands the contractions listed in the
    module-level ``contraction_dict``, replaces every match of the patterns in
    ``regex_data_cleaner`` with a single space, and then runs the remaining
    character-level clean-up helpers.
    """

    # Patterns are applied in list order and each match becomes one space.
    # Patterns that need punctuation or digits to recognise their target
    # (links, e-mails, HTML entities, IP addresses, mail headers) come before
    # the generic patterns that strip punctuation and digits.
    regex_data_cleaner = [
        r'https?://\S+|www\.\S+',  # links
        r'\S+@\S+',  # e-mail addresses
        r'&\w*',  # HTML special entities (e.g. &amp;)
        r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',  # IP addresses
        r'From:(.*)\r\n',
        r'Subject:',
        r'Sent:(.*)\r\n',
        r'Received:(.*)\r\n',
        r'Received From:(.*)\r\n',
        r'To:(.*)\r\n',
        r'CC:(.*)\r\n',
        r'IC:(.*)\r\n',
        r'BCC:(.*)\r\n',
        r'IiNnCc[0-9]*',
        r'ticket[_]*[\s]*[0-9]*',
        r'\[cid:(.*)]',
        r'[0-9][\-0–90-9 ]+',  # phones
        r'[0-9]',  # numbers
        r'[^a-zA-Z 0-9]+',  # anything that is not a letter, digit or space
        r'[\r\n]',  # line breaks
        r' [a-zA-Z] ',  # single letters surrounded by spaces
        r'  ',  # double spaces
        # NOTE: "@" has already been stripped by the time the next two
        # patterns run, so they cannot match at this position; they are kept
        # so the list still contains every original pattern.
        r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$',
        r'[\w\d\-\_\.]+ @ [\w\d\-\_\.]+',
        r'[^a-zA-Z]',  # every remaining non-letter
        r'\s+',  # to remove extra whitespaces
    ]

    ## Convert to Lower
    def __lower(self, text):
        """Return ``text`` in lowercase."""
        return text.lower()

    def __is_date(self, word):
        """Return True when ``parser.parse(word)`` succeeds, else False."""
        # NOTE(review): ``parser`` is whatever the import cell bound. The
        # standard-library ``parser`` module has no ``parse`` function, so
        # presumably ``dateutil.parser`` was intended -- confirm. As written,
        # any failure (including a missing ``parse``) yields False.
        try:
            parser.parse(word)
        except Exception:
            return False
        return True

    ## Unnecessary unicode characters that are not required and might be gibberish
    def __remove_unwanted_unicodes(self, text):
        """Drop date-like words and characters above U+FFFF from ``text``."""
        kept_words = [w for w in text.split() if not self.__is_date(w)]
        rejoined = ' '.join(kept_words)
        return ''.join(ch for ch in rejoined if ch <= '\uFFFF')

    def __remove_unreadable_characters(self, text):
        """Replace everything except ASCII letters and digits with spaces.

        Runs of whitespace are collapsed to one space and the result is
        stripped.
        """
        unreadable_character_regex = "[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]"
        spaced = re.sub(unreadable_character_regex, " ", text)
        return ' '.join(spaced.split()).strip()

    ## Expand contractions like I'm to I am, using the contractions dictionary
    def __expand_contractions(self, text, key):
        """Replace whole-word occurrences of ``key`` with its expansion.

        The key is matched literally and only when it is not glued to a word
        character on either side, so short keys such as "im" or "dm" no
        longer rewrite the inside of words like "time" or "admin".
        """
        replacement = contraction_dict[key]
        pattern = r'(?<!\w)' + re.escape(key) + r'(?!\w)'
        # A function replacement keeps the expansion text literal.
        return re.sub(pattern, lambda _match: replacement, text)

    ## Remove all characters as defined in the regex_data_cleaner
    def __remove_text(self, text, regex):
        """Replace every match of ``regex`` in ``text`` with one space."""
        return re.compile(regex).sub(' ', text)

    ## Remove all digits
    def __remove_digits(self, text):
        """Return ``text`` without any digit characters."""
        return ''.join(ch for ch in text if not ch.isdigit())

    ## Removes single-character words (e.g. "hi I am a good s girl" loses I, a, s)
    def __remove_single_character_word(self, text):
        """Return ``text`` keeping only words longer than one character."""
        return ' '.join(word for word in text.split() if len(word) > 1)

    ## Removes unnecessary spaces
    def __remove_spaces(self, text):
        """Strip leading and trailing whitespace."""
        return text.strip()

    ## Orchestrator that calls the above methods
    def process(self, text=None):
        """Run the full cleaning pipeline on ``text``.

        Args:
            text: The string to clean. ``None`` is returned unchanged.

        Returns:
            The cleaned string, or ``None`` when ``text`` is ``None``.
        """
        if text is None:
            return text

        processed = text
        # Dictionary order is kept on purpose: an early expansion such as
        # "he'd" -> "he would" can expose a later key such as "would've".
        for key in contraction_dict:
            processed = self.__expand_contractions(processed, key)
        for regex in self.regex_data_cleaner:
            processed = self.__remove_text(processed, regex)
        processed = self.__remove_unwanted_unicodes(processed)
        processed = self.__remove_unreadable_characters(processed)
        processed = self.__remove_digits(processed)
        processed = self.__remove_spaces(processed)

        return processed

### The class below removes stopwords (common words such as the, a, an, any) from the text and converts each remaining word to its base form.

In [None]:
class Stopword_Lemmatizer:
    """Remove stop words from text and lemmatize the words that remain."""

    def __tokenize(self, text):
        """Split ``text`` on runs of non-word characters, dropping empties."""
        return [token for token in re.split(r'\W+', text) if token]

    def __lemmatize(self, text, lemmatizer):
        """Return ``text`` with each token replaced by ``lemmatizer.lemmatize(token)``."""
        return ' '.join(
            lemmatizer.lemmatize(token) for token in self.__tokenize(text)
        )

    def __remove_stopwords(self, text, stopwords):
        """Return ``text`` without the tokens contained in ``stopwords``."""
        return ' '.join(
            token for token in self.__tokenize(text) if token not in stopwords
        )

    def process(self, text, stopwords, lemmatizer):
        """Remove stop words from ``text`` and lemmatize the rest.

        Args:
            text: The string to process. ``None`` is returned unchanged.
            stopwords: Container of words to drop (membership is tested
                with ``in``).
            lemmatizer: Object exposing ``lemmatize(token)``.

        Returns:
            The remaining lemmatized tokens joined by single spaces, or
            ``None`` when ``text`` is ``None``.
        """
        if text is None:
            return text
        without_stopwords = self.__remove_stopwords(text, stopwords)
        return self.__lemmatize(without_stopwords, lemmatizer)
   

### 1. Read the input csv & print its shape & check its columns

In [None]:
df = pd.read_csv(path)
print(df.shape)

In [None]:
# Keep the first 200,000 rows. The explicit copy makes the subset an
# independent frame, so the column assignments in later cells do not write
# into a slice of the frame that was read from disk.
df = df[0:200000].copy()

In [None]:
print(df.columns)

In [None]:
print(df.shape)

In [None]:
now = time.time()
df['text'] = df['text'].apply(encode_data)
print('minutes',(time.time() - now)/60)  # on 200,000 records

### 2. The tweets are in sequence and can be changed to a conversation or these can be handled as individual tweets

In [None]:
### returns userid that is present at the beginning of a tweet
def extract_user(text):
    """Return the lowercase handle a tweet starts with, or ``'xxxxx'``.

    Args:
        text: The tweet text.

    Returns:
        The first whitespace-delimited token of ``text``, reduced to letters,
        digits, ``@`` and ``_`` and lowercased, when ``text`` starts with
        ``@``. Otherwise (including for an empty string) the placeholder
        ``'xxxxx'``.
    """
    if not text.startswith('@'):
        return 'xxxxx'
    handle = text.split()[0]
    # Underscores are kept so the handle still matches the lowercased
    # author_id it is combined with when the conversation key is built.
    return re.sub('[^a-zA-Z0-9@_]', '', handle).lower()

#### these userids needs to be removed from the main text
def remove_user(text):
    """Return ``text`` lowercased, without a leading ``@`` token.

    When ``text`` starts with ``@`` its first whitespace-delimited token is
    dropped and the remaining tokens are joined by single spaces. Text that
    does not start with ``@`` (including an empty string) is only lowercased.
    """
    if text.startswith('@'):
        text = ' '.join(text.split()[1:])
    return text.lower()

## utilities for creating the conversation key

dictionary = {}


def convert_dict(d):
    """Add every item of ``d`` to the shared ``dictionary`` with value ``''``.

    Returns:
        The shared module-level ``dictionary`` (entries accumulate across
        calls).
    """
    dictionary.update((item, '') for item in d)
    return dictionary
def set_conversion(text):
    """Split ``text`` on ``@`` and return the distinct usable pieces.

    Pieces that are blank after stripping, or equal to ``'nan'``, are
    skipped; kept pieces are returned as they appear in ``text``.
    """
    return {
        piece
        for piece in text.split('@')
        if piece.strip() and str(piece) != 'nan'
    }

# Shared lookup: sorted participant tuple -> conversation key.
dict_ = {}


def create_key(col1, col2):
    """Register a conversation key for the participants in ``col1``.

    Args:
        col1: Iterable of participant ids (e.g. the set built by
            ``set_conversion``).
        col2: Value appended to the key after an underscore.

    Returns:
        The shared ``dict_``. A later call with the same participants
        overwrites the earlier entry.
    """
    # Sorting gives equal participant sets the same tuple; the iteration
    # order of a set is not guaranteed to be the same for equal sets.
    members = tuple(sorted(col1))
    dict_[members] = members[0] + '_' + str(col2)
    return dict_


def replace_key(text):
    """Return the conversation key registered for the participants in ``text``.

    The lookup reads the shared ``dict_`` that ``create_key`` fills, so it
    does not depend on how the caller stored ``create_key``'s return values.

    Raises:
        KeyError: If ``create_key`` was never called for these participants.
    """
    return dict_[tuple(sorted(text))]

def split_sort(text):
    """Parse a comma-separated string into floats, sorted in descending order.

    Pieces equal to ``'nan'`` are skipped.
    """
    values = []
    for piece in text.split(','):
        if str(piece) == 'nan':
            continue
        values.append(float(piece))
    values.sort(reverse=True)
    return values

### 3. Remove UserIDs from the main text

In [None]:
df['text_stripped'] = df['text'].apply(remove_user)

### 4. Extracting combination of author_id & user_id  for each tweet so that a conversation key can be created

In [None]:
# Handle the tweet is addressed to, as returned by extract_user().
df['sent_to_user'] = df['text'].apply(extract_user)
# Author ids as lowercase strings so they can be concatenated below.
df['author_id'] = df['author_id'].astype(str)
df['author_id'] = df['author_id'].apply(to_lower)
# author_id immediately followed by the addressed handle.
df['from_user_to'] = df['author_id']+ df['sent_to_user']
# Addressed handle and author id separated by '@'; set_conversion() later
# splits this string on '@'.
df['sent_to_user_concatenated'] = df['sent_to_user'] + '@' + df['author_id']
# Both combined strings joined by '@'.
df['involved_user_ids'] = df['sent_to_user_concatenated'] + '@' + df['from_user_to']

### 5. Using the extracted author_id & user_id from above to add to the stopwords list

In [None]:
# Distinct values of the sent_to_user column.
user_set = set(df['sent_to_user'])
# Distinct lowercase author ids.
author_set = set(df['author_id'])
# Union of both sets.
stopwords_user_set = user_set | author_set
# Marks every entry as a stop word on nlp.vocab.
# NOTE(review): the stopword-removal cell further down builds its word list
# from STOP_WORDS only -- confirm these ids are actually filtered there.
add_to_stopwords(stopwords_user_set)

### 6. Creating a conversation key, that will bind together multiple tweets that belong to the same conversation using the author_id & user_id extracted above

In [None]:
# Replace from_user_to with the set of ids returned by set_conversion().
df['from_user_to'] = df['sent_to_user_concatenated'].apply(set_conversion)
# Call create_key() once per row with that row's id set and tweet_id; the
# per-row return values are collected in key_dict.
key_dict = df.apply(lambda x: create_key(x['from_user_to'], x['tweet_id']), axis=1)
# Look up each row's conversation key through replace_key().
df['conversation_key'] = df['from_user_to'].apply(replace_key)

### Printing the conversation keys

In [None]:
df['conversation_key'].value_counts() ## tweets having the same conversation_key are part of the same conversation

### 7. Translating emojis to text

In [None]:
# Imported here because the notebook's import cell runs ``!import emoji``,
# which is a shell command and never binds ``emoji`` in Python.
import emoji

now = time.time()
# Replace each emoji in the stripped tweet text with its text form.
df['text_stripped'] = df['text_stripped'].apply(emoji.demojize)
print("minutes",(time.time() - now)/60)  ## for 200,000 records

### 8. Translating to english - Some tweets are in a different language, so we can use the translator API to translate. But this API has a limit on the number of requests that we make. It's 50 currently... 

#### 1. Manually translate other language tweets in the csv using Google Translate Web UI
#### 2. Manually assess how many other language tweets are there and then take only those tweets and batchwise translate from multiple user accounts using translator API. 50 per batch
#### 3. Remove all other language tweets (Not recommended)


In [None]:
### This is a crude way to separate English and non English tweets using patterns - you may use it if not able to use the translator

# Every printable, non-space ASCII character counts as "English". The class
# is spelled as one explicit range so that ':', ';', '_' and similar
# characters (URLs, times, demojized emoji names) no longer mark a tweet as
# non-English.
english_pattern = re.compile(r'[\x21-\x7E]')


def detect_language(text, ptn):
    """Classify ``text`` as English (``'en'``) or not (``'ne'``).

    Args:
        text: The string to classify.
        ptn: Compiled pattern matching a single accepted character.

    Returns:
        ``'en'`` when ``text`` contains at least one accepted character and
        nothing but whitespace is left after all accepted characters are
        removed; ``'ne'`` otherwise (including for empty or blank text).
    """
    if not ptn.search(text):
        return 'ne'
    leftover = ptn.sub('', text)
    return 'ne' if leftover.strip() else 'en'

In [None]:
now = time.time()
df['language'] = df['text_stripped'].apply(detect_language,args=(english_pattern,))
print("minutes",(time.time() - now)/60) ## for 200,000 records

In [None]:
### Translation examples

print(df.iloc[234]['text_stripped'])
print("-"*125)
print(translate(df.iloc[234]['text_stripped']))
print("-"*125)
print(df.iloc[1195]['text_stripped'])
print("-"*125)
print(translate(df.iloc[1195]['text_stripped']))

### 9. Grouping together tweets with the same conversation_key. I filter only the English records because I am unable to translate; you can skip the first step (filtering by language == 'en') if you are using the full dataset and are able to translate all the non-English tweets.

In [None]:
# Keep only rows labelled English. The explicit copy makes the filtered
# frame independent, so the 'conversation' column added in a later cell is
# not written into a slice of the unfiltered frame.
df = df[df['language'] == 'en'].copy()

In [None]:
print(df.shape) ### out of 200,000 ;114,392 were identified as English

In [None]:
now = time.time()
df['conversation'] = df.groupby('conversation_key')['text_stripped'].transform(lambda x: ' '.join(x))
print("minutes",(time.time() - now)/60)  ## for 200,000 records

In [None]:
df['conversation'].iloc[0]

In [None]:
df = df.drop_duplicates(subset='conversation',keep='first')

In [None]:
print(df.shape)  ## size reduced to 58,936 from 114,392

### 10. Removing Special Characters,numbers,links,emails,extra whitespaces,trailing and leading spaces

In [None]:
df_process = df.copy()

In [None]:
clean_text = TextRegexCleaner()
now = time.time()
df_process['cleaned_text'] = df_process['conversation'].apply(clean_text.process)
print('minutes',(time.time() - now)/60) ### on 58,936 records

In [None]:
df_process.columns

In [None]:
print(df_process.loc[99]['cleaned_text'])
print(df_process.loc[99]['text_stripped'])

## Stopwords removal and lemmatization, 

### 1. Words like 'a', 'the', 'any' and 'an' are very frequent and common, so they should be removed from our corpus. ### 2. 'using', 'used' and 'use' can all be converted to their base form 'use'; this is called lemmatization.

In [None]:
stopword_lemmatizer = Stopword_Lemmatizer()
stopwords = set(STOP_WORDS)
lemmatizer = WordNetLemmatizer()

In [None]:
now = time.time()
df_process['feature_text'] = df_process['cleaned_text'].apply(stopword_lemmatizer.process,args=(stopwords,lemmatizer,))
print("minutes",(time.time() - now)/60) # on  58,936 records

In [None]:
model_subset = ['tweet_id','conversation_key','feature_text']
df = df_process[model_subset]

In [None]:
df.to_pickle("processed.pkl")

### End of Data Cleaning & Preprocessing

### Suggestions

### 1. Add more stopwords (remove Nouns using Parts Of Speech tagging,Named entity Recognition)
### 2. Try a different approach to creating the conversation key, using the tweet_id,response_tweet_id and in_response_to_tweet_id
### 3. Identify misspelled words and correct them or remove the misspelled words