In [1]:
import os
import sys
import json
import requests
import subprocess
import pandas as pd
import numpy as np

# Absolute path of the project's scorer sources, resolved relative to the
# directory the notebook is run from.
module_path = os.path.abspath('../src/scorers')

# Make the directory importable; the membership guard avoids piling up
# duplicate entries when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

import utils # import pre_process_free_text

In [2]:
from importlib import reload  # Python 3.4+ only.

# Re-import the already-loaded local `utils` module so that edits made to its
# source file take effect without restarting the kernel.
utils = reload(utils)

In [3]:
# Minimum number of words/tokens a phrase must contain to be exported; the
# processing cells below skip anything shorter.
MINIMUM_WORD_COUNT = 3

### Sentiment towards products' file conversion
This file has been selected because group conversations are usually geared towards judging existing services and commenting on newly proposed services. Given this, it is clear that such conversations resemble sentiment towards products/brands, which the following dataset captures.

To complement this dataset, we also use the movie sentiment dataset already available, but keep only the very positive/negative comments in order to remove the noise introduced by more neutral comments.

In [4]:
# Reference example of a single event record: an entity id/type, a
# `properties` payload holding the phrase and its sentiment label, and the
# event name.
default_json_struct = dict(
    eventTime="2015-06-08T16:58:14.285+0000",
    entityId=73168,
    entityType="source",
    properties=dict(
        phrase="real transformation",
        sentiment=2,
    ),
    event="phrases",
)

In [5]:
# Download the tweet product/brand sentiment CSV once; later runs reuse the
# local copy instead of hitting the network again.
if not os.path.exists('judge-1377884607_tweet_product_company.csv'):
    # "https://www.crowdflower.com/wp-content/uploads/2016/03/judge-1377884607_tweet_product_company.csv" -- Removed
    r = requests.get("https://query.data.world/s/5gxidpupmkcesf43vkltv4h6s7erlv", timeout=60)
    # Fail loudly on an HTTP error rather than caching an error page as the CSV
    # (the existence check above would then never re-download it).
    r.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open('judge-1377884607_tweet_product_company.csv', 'wb') as csv_file:
        csv_file.write(r.content)

In [6]:
# Load the downloaded CSV into a DataFrame; the file is decoded as latin-1.
sent_dataset_1 = pd.read_csv(
    "judge-1377884607_tweet_product_company.csv",
    encoding='latin-1',
)

In [7]:
# Distribution of the raw emotion labels (displayed as the cell output).
sent_dataset_1['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

Filter for rows that have a sentiment recognised at all

In [8]:
# Keep only rows labelled with a definite positive or negative emotion; the
# explicit copy lets later cells add columns without touching the raw frame.
sent_dataset_1_f = sent_dataset_1.loc[
    sent_dataset_1['is_there_an_emotion_directed_at_a_brand_or_product'].isin(
        ['Positive emotion', 'Negative emotion']
    )
].copy()

In [9]:
# Numeric label: 1 for 'Positive emotion', 0 for every other remaining row.
sent_dataset_1_f['num_sentiment'] = (
    sent_dataset_1_f['is_there_an_emotion_directed_at_a_brand_or_product']
    .eq('Positive emotion')
    .astype('int64')
)

In [10]:
# Sanity check: counts per numeric label after the positive/negative mapping.
sent_dataset_1_f['num_sentiment'].value_counts()

1    2978
0     570
Name: num_sentiment, dtype: int64

In [11]:
# Running entityId for exported events; reset here so re-running this cell
# starts numbering from 0 again.
entity_counter = 0


def process_text(entry, f_json):
    """Clean one tweet and append it to ``f_json`` as a JSON-lines event.

    Parameters
    ----------
    entry : mapping (e.g. a DataFrame row)
        Must provide the keys ``'tweet_text'`` and ``'num_sentiment'``.
    f_json : writable text file object
        Receives one newline-terminated JSON document per accepted tweet.

    Returns
    -------
    str or None
        The cleaned text when an event was written. ``None`` when the text is
        missing, when the cleaned text has fewer than ``MINIMUM_WORD_COUNT``
        tokens, or when processing raised (the exception is printed).

    Side effects
    ------------
    Increments the module-level ``entity_counter`` once per written event.
    """
    global entity_counter

    # pd.isna recognises None and any NaN object; an identity comparison with
    # np.nan only matches that one singleton.
    if pd.isna(entry['tweet_text']):
        return None

    try:
        clean_text_l = utils.pre_process_free_text(entry['tweet_text'], lang_id=1, rm_stopwords=False)

        if len(clean_text_l) < MINIMUM_WORD_COUNT:
            return None

        # Apostrophes are stripped from the phrase before it is exported.
        clean_text = ' '.join(clean_text_l).replace("'", "")

        json_struct = {
            "entityId": entity_counter,
            "entityType": "source",
            "properties": {
                "phrase": clean_text,
                # Coerce to a plain int so numpy integer labels serialise too.
                "sentiment": int(entry['num_sentiment'])
            },
            "event": "phrases"
        }

        f_json.write(json.dumps(json_struct) + "\n")
        # Only consume an id once the record has actually been written.
        entity_counter += 1
        return clean_text
    except Exception as ex:
        # Best-effort: report the problem and skip this row instead of
        # aborting the whole export.
        print(ex)
        return None

# Export every accepted tweet as one JSON event per line. The context manager
# closes (and flushes) the file even if processing raises part-way through.
with open('product_sentiment.json', 'w') as f:
    sent_dataset_1_f['clean_tweet_text'] = sent_dataset_1_f.apply(process_text, axis=1, f_json=f)
print(entity_counter)

3548


Prepare a JSON file in the format that PIO will accept.

In [12]:
def prepare_properties_column(row):
    """Build the ``properties`` payload (phrase and sentiment) for one row.

    ``row`` must provide ``'clean_tweet_text'`` and ``'num_sentiment'``.
    """
    phrase = row['clean_tweet_text']
    sentiment = row['num_sentiment']
    return dict(phrase=phrase, sentiment=sentiment)

# Add the event fields as columns on the working frame.
# NOTE(review): entityId is taken from the DataFrame index here, whereas the
# product_sentiment.json export numbers records with a running counter —
# confirm which id scheme is intended.
sent_dataset_1_f['entityId'] = sent_dataset_1_f.index
sent_dataset_1_f['entityType'] = "source"
# One {"phrase", "sentiment"} dict per row.
sent_dataset_1_f['properties'] = sent_dataset_1_f.apply(prepare_properties_column, axis=1)
sent_dataset_1_f['event'] = "phrases"

# Narrow view containing only the four event fields.
sent_dataset_1_json = sent_dataset_1_f[['entityId', 'entityType', 'properties', 'event']]

In [13]:
# Export only the event columns (sent_dataset_1_json), one JSON object per
# line, matching the per-line event layout used for product_sentiment.json.
# The previous call serialised the whole working frame (all raw columns, in
# column orientation) and left sent_dataset_1_json unused.
# NOTE(review): rows whose clean_tweet_text is None are still exported with a
# null phrase — confirm whether they should be dropped first.
sent_dataset_1_json.to_json("final_sentiment.json", orient="records", lines=True)

### Default sentiment file to 0 - 1 scale
For this part, we will need the file included by default in the text classification template under the name `template/sentimentanalysis.json`. Retrieve this and place it in the current working directory.

In [14]:
# Input: the template's default sentiment file, which must already be present
# in the current working directory.
default_f = open('sentimentanalysis.json', 'r')
# Output: the re-mapped copy; opening with 'w' truncates any previous version.
# Both handles are consumed and closed by the processing cell further down.
new_default = open('clean_default_sentiment_analysis.json', 'w')

In [15]:
# Alternative mapping that would keep all five labels (0-2 -> 0, 3-4 -> 1):
# sentiment_map = {0: 0, 1: 0, 2: 0, 3: 1, 4: 1}
# Active mapping: only labels 0 and 4 are kept (0 -> 0, 4 -> 1); records with
# any other label are dropped by the processing loop.
sentiment_map = {0: 0, 4: 1}

In [16]:
# Vocabulary of every phrase whose label is kept by `sentiment_map`
# (collected before the minimum-length filter is applied).
unique_words = set()

try:
    # Stream the file line by line instead of loading it all with readlines().
    for line in default_f:
        if not line.strip():
            continue  # Tolerate blank lines (e.g. a trailing newline).

        line_json = json.loads(line)
        properties = line_json['properties']
        sent = properties['sentiment']
        if sent not in sentiment_map:
            continue  # Label is not part of the 0 - 1 re-mapping; drop it.

        properties['sentiment'] = sentiment_map[sent]  # Re-map to a 0 - 1 scale.
        properties['phrase'] = properties['phrase'].replace("'", "")  # .lower()

        words = properties['phrase'].split(" ")
        unique_words.update(words)
        if len(words) >= MINIMUM_WORD_COUNT:
            # re-organise entityIds
            line_json.pop('eventTime', None)  # No error if the field is absent.
            line_json['entityId'] = entity_counter  # Continue from where we left off.
            entity_counter += 1

            new_default.write(json.dumps(line_json) + '\n')
finally:
    # Always release both handles, even when a line fails to parse.
    default_f.close()
    new_default.close()

In [17]:
# Number of distinct space-separated tokens seen in the kept default phrases.
len(unique_words)

3445

In [20]:
# Append the re-mapped default dataset to the product dataset so that both
# end up in a single file with consecutive entityIds.
# NOTE(review): the previous command used clean_default_sentiment_analysis.json
# as both source and destination. Appending a file to itself is rejected by
# GNU cat and can grow the file without bound elsewhere. The destination
# chosen here (product_sentiment.json) is an assumption — confirm the intended
# target. Re-running this cell appends the data again.
try:
    output = subprocess.check_output(
        ['bash', '-c',
         "cat 'clean_default_sentiment_analysis.json' >> 'product_sentiment.json'"],
        # Capture stderr as well so the error message below is not empty.
        stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
    raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))