## Build Cantonese emotion lexicon

The Cantonese sentiment lexicon is built from a Cantonese-English dictionary. The IBM Watson Tone Analyzer is then used to compute tone and sentiment scores for each word. The service can return results for the following tone IDs: anger, fear, joy, and sadness (emotional tones); analytical, confident, and tentative (language tones). The service returns results only for tones whose scores meet a minimum threshold of 0.5.

In [0]:
pip install --upgrade ibm-watson

In [0]:
import numpy as np
import pandas as pd
import jieba
from pprint import pprint
from ibm_watson import ToneAnalyzerV3
import json


In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# The Colab user is authenticated first so that the application-default
# credentials fetched below are available to hand to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is reused by the later download cells in this notebook.
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
# The file fetched here is saved locally as 'cccanto-webdist.txt', which the
# next cell opens as the Cantonese-English dictionary.
file_id = '1Wy75DNIEOx8rpYBVQVU-TP4nDTnlC_16'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('cccanto-webdist.txt')  

In [0]:
# Load the Cantonese-English dictionary: every line of the file becomes a
# list of its "/"-separated fields.
content = []
with open("cccanto-webdist.txt", encoding="utf8") as f:
    content.extend(entry.split("/") for entry in f)

# Remove irrelevant information: the first 13 lines are skipped
# (presumably the file's header/preamble -- verify against the data file).
words_list = content[13:]

def remove_romanization(line):
    """Return the first whitespace-separated token of ``line``.

    For a dictionary headword field this keeps the word itself and drops
    whatever follows it (e.g. its romanization).

    Raises:
        IndexError: if ``line`` is empty or contains only whitespace.
    """
    return line.split()[0]

In [0]:
# Create the IBM Watson Tone Analyzer client (IBM Cloud API) that get_tone()
# uses for every request.
# NOTE(review): the API key below is hard-coded in the notebook, so anyone
# with a copy of this file can use it -- consider rotating it and loading it
# from an environment variable or secret store instead.
# NOTE(review): newer ibm-watson releases appear to expect an authenticator
# object rather than `iam_apikey` -- confirm against the installed SDK
# version, since the install cell upgrades to the latest release.
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    iam_apikey='AVWex58O1RxByFwP6vX1kAbK9tNNFdp_yQMOq7FEpsPo',
    url='https://gateway.watsonplatform.net/tone-analyzer/api'
)


### Build emotion lexicon

In [0]:
def get_tone(lst):
    """Return the document-level tone analysis for a list of texts.

    The strings in ``lst`` are joined with commas into a single text and
    sent to the module-level ``tone_analyzer`` in one request.

    Args:
        lst: list of strings (here, the English definitions of one word).
            An empty list yields an empty text.

    Returns:
        The ``'document_tone'`` entry of the service's response.
    """
    # Join exactly once: joining inside a per-element loop repeats the same
    # work for every element and, for an empty list, never produces a string.
    text = ','.join(lst)
    tone_analysis = tone_analyzer.tone(
        {'text': text},
        content_type='application/json',
    ).get_result()
    return tone_analysis['document_tone']
  
# Lexicon being built: headword -> document tone returned by get_tone().
emo_dict = {}
# Entries that could not be scored, kept as (raw headword field, error) so
# failures stay visible instead of being silently discarded.
failed_entries = []
for lst in words_list:
    # Field 0 is the headword field; every later field longer than two
    # characters is treated as a definition.
    definition_list = [elem for elem in lst[1:] if len(elem) > 2]
    if not definition_list:
        # Nothing to analyse, so do not spend an API call on this entry.
        continue
    try:
        # Remove the romanization of the word to get the dictionary key.
        word_key = remove_romanization(lst[0])
        # The key is only written once a score is available, so a failed
        # lookup never overwrites an earlier score or leaves a None value.
        emo_dict[word_key] = get_tone(definition_list)
    except Exception as err:
        # Best effort: record the failure and keep going. KeyboardInterrupt
        # is deliberately not caught so the long-running loop can be stopped.
        failed_entries.append((lst[0], err))

if failed_entries:
    print("entries that could not be scored:", len(failed_entries))

## Analyse emotional tones in Cantonese posts

In [0]:
# Download files based on their file IDs.
#
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
# The first file is saved as the posts CSV read in the next cell; the second
# is the stop-word list that seg_word() reads.
for file_id, local_name in (
    ('1hBPhrxATm88P7glLNitOvPKdv7K7tNcz', "HKSocialMedia.csv"),
    ('1x_CcPBL8iowGWq7NpG7jrnEXoGKUdMD9', 'stopwords.txt'),
):
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)


In [0]:
# Load the downloaded posts CSV and keep the 'post_text' column, which the
# pre-processing cells below iterate over.
data = pd.read_csv("HKSocialMedia.csv")
posts = data['post_text']
# Print the column as a quick check of what was loaded.
print(posts)

### pre-processing 

In [0]:
# Punctuation marks and stray symbols that are stripped from every post.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '《', '》', '【', '】','▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '「','」','»', '！', '。','，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


def remove_punctuation(x):
    """Return ``str(x)`` with every entry of ``puncts`` removed.

    ``x`` is converted with ``str`` first, so non-string values are accepted
    and cleaned in their string form.
    """
    cleaned = str(x)
    for mark in puncts:
        # Replacing a mark that does not occur leaves the text unchanged.
        cleaned = cleaned.replace(mark, '')
    return cleaned

# Punctuation-free version of every post, in the same order as `posts`.
texts = [remove_punctuation(line) for line in posts]

#remove stopwords
import codecs

# Stop-word sets already read from disk, keyed by file path, so the file is
# parsed once rather than once per segmented sentence.
_STOPWORDS_CACHE = {}


def _load_stopwords(path='stopwords.txt'):
    """Return the set of stripped lines of ``path``, reading it at most once."""
    if path not in _STOPWORDS_CACHE:
        # `with` closes the handle even if reading or decoding fails.
        with codecs.open(path, 'r', 'utf-8') as fr:
            _STOPWORDS_CACHE[path] = {word.strip() for word in fr}
    return _STOPWORDS_CACHE[path]


def seg_word(sentence):
    """Segment ``sentence`` with jieba and drop stop words.

    Args:
        sentence: text to segment.

    Returns:
        List of the tokens produced by ``jieba.cut``, in order, excluding
        every token listed in 'stopwords.txt'.
    """
    stopwords = _load_stopwords()
    return [w for w in jieba.cut(sentence) if w not in stopwords]
  
# Tokenise every cleaned post (stop words removed), keeping post order, and
# preview the first three results.
seg_res = [seg_word(line) for line in texts]
pprint(seg_res[:3])

In [0]:
def _iter_tone_scores(entry):
    """Yield ``(tone, score)`` pairs from one lexicon entry.

    Two layouts are understood: a flat ``{tone: score}`` mapping, and a
    mapping holding a ``'tones'`` list of records with ``'tone_id'`` and
    ``'score'`` keys (the layout the Tone Analyzer's document tone is
    documented to use -- confirm against the API version in use).
    """
    tones = entry.get('tones')
    if isinstance(tones, list):
        for tone in tones:
            yield tone['tone_id'], tone['score']
    else:
        yield from entry.items()


def get_sen_score(lst, lexicon=None):
    """Average the tone scores of the words of one post.

    Args:
        lst: list of tokens making up the post.
        lexicon: mapping of word -> tone entry; defaults to the module-level
            ``emo_dict``. Words that are missing, or whose entry is ``None``,
            are ignored.

    Returns:
        Dict mapping each tone to the sum of its scores divided by the
        number of matched words (a word without a given tone therefore
        contributes 0 to that tone). Empty when no word matched.
    """
    if lexicon is None:
        lexicon = emo_dict
    totals = {}
    matched = 0
    for word in lst:
        entry = lexicon.get(word)
        if entry is None:
            # Unknown word, or a word with no stored score.
            continue
        matched += 1
        for tone, score in _iter_tone_scores(entry):
            totals[tone] = totals.get(tone, 0) + score
    # `totals` is empty whenever `matched` is 0, so no division by zero.
    return {tone: total / matched for tone, total in totals.items()}
  
# Map each original post text to the averaged tone scores of its tokens.
# seg_res[idx] holds the tokens of posts[idx].
posts_sen_score = {
    posts[idx]: get_sen_score(post) for idx, post in enumerate(seg_res)
}

# Preview the first 20 entries.
dict(list(posts_sen_score.items())[:20])