### Sentiment Analysis

#### WEI Chen, Pierre-Yves Casanova

## Twitter2016 Dataset

Import Twitter2016 sentiment analysis datasets, containing training set, validation set, and test set.

The `@` and `#` characters and URL links are removed from the text samples.

The output format will be csv files, each containing 2 columns "label" and "sentence".

Import from the txt file.

In [None]:
import numpy as np
import os

def read_tweets(path):
    """Read a Twitter2016 sentiment file and return cleaned tweets with labels.

    Each line is split on whitespace: the second field is the label
    (``positive``, ``negative`` or ``neutral``) and the fields after it are
    the tweet text. URLs and the ``@`` / ``#`` characters are removed from
    the text.

    Lines with fewer than two fields or with an unrecognised label are
    skipped, so the returned sentences and labels always have the same length
    and stay aligned.

    Args:
        path: Path of the text file to read (decoded as UTF-8, like the other
            readers in this file).

    Returns:
        A tuple ``(data, labels)``: ``data`` is a list of cleaned tweets and
        ``labels`` is a numpy array holding 1 (positive), -1 (negative) or
        0 (neutral), one entry per tweet.
    """
    import re  # local import: the module-level imports do not provide ``re``

    # Compiled once per call instead of once per line.
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE)
    label_values = {'positive': 1, 'negative': -1, 'neutral': 0}

    labels = []
    data = []
    with open(path, encoding='utf8') as fp:
        for line in fp:
            fields = line.split()
            # Skip blank/short lines and unknown labels: appending a sentence
            # without a matching label would shift every later pair.
            if len(fields) < 2 or fields[1] not in label_values:
                continue
            sentence = url_pattern.sub('', ' '.join(fields[2:]))
            sentence = sentence.replace("@", "").replace("#", "")
            data.append(sentence)
            labels.append(label_values[fields[1]])
    return data, np.array(labels)

# Load the train / dev / test splits of the Twitter2016 data, in that order.
data_dir = 'tweets_data/'
_tweet_files = (
    'twitter-2016train-A.txt',
    'twitter-2016dev-A.txt',
    'twitter-2016test-A.txt',
)
(trX, trY), (vaX, vaY), (teX, teY) = [
    read_tweets(os.path.join(data_dir, file_name)) for file_name in _tweet_files
]

# Report how many samples each split contains.
print(len(trX), len(vaX), len(teX))

Save the ternary (three-class) form to csv

In [None]:
import csv
# Write every validation sample as a "label,sentence" row under a header line.
with open('tweets2016_dev.csv', "w", newline='') as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(["label","sentence"])
    writer.writerows([label, sentence] for label, sentence in zip(vaY, vaX))

Save the binary form by eliminating all neutral samples

In [None]:
import csv
# Write the validation samples whose label is not neutral (0), leaving only
# the positive / negative classes.
with open('tweets2016_binary_dev.csv', "w", newline='') as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(["label","sentence"])
    # Filter on the validation label being written; the previous code tested
    # the test-set labels (teY), which belong to a different split.
    for label, sentence in zip(vaY, vaX):
        if label != 0:
            writer.writerow([label, sentence])

## OpinMind Dataset

Import from txt file

In [None]:
import numpy as np
import os

def read_opinmind(path):
    """Read an OpinMind text file and return sentences with their labels.

    Each non-blank line is split on whitespace: the first field is taken as
    the label and the remaining fields are re-joined with single spaces to
    form the sentence. Label ``'1'`` is mapped to 1 and ``'0'`` to -1; any
    other first field is stored unchanged as a string.

    A byte-order mark at the start of the file is dropped while decoding, so
    it no longer has to be special-cased for one particular label. Blank
    lines are skipped instead of raising ``IndexError``.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``(data, labels)``: ``data`` is a list of sentences and
        ``labels`` is a numpy array with one entry per sentence.
    """
    labels = []
    data = []
    # 'utf-8-sig' decodes UTF-8 and strips a leading BOM if one is present.
    with open(path, encoding='utf-8-sig') as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                continue
            label = fields[0]
            data.append(' '.join(fields[1:]))
            if label == '1':
                labels.append(1)
            elif label == '0':
                labels.append(-1)
            else:
                # Not a recognised label: keep the raw first field.
                labels.append(label)
    return data, np.array(labels)
            

# Load the OpinMind training and test files.
data_dir = 'opinmind_data/'
train_path = os.path.join(data_dir, 'trainingdata.txt')
test_path = os.path.join(data_dir, 'testdata.txt')
trX, trY = read_opinmind(train_path)
teX, teY = read_opinmind(test_path)

# Print the number of sentences and the label-array shape of each split.
for sentences, labels in ((trX, trY), (teX, teY)):
    print(len(sentences), labels.shape)

De-duplicate the training set

In [None]:
# np.unique returns the distinct sentences together with the index of the
# first occurrence of each one; those indices select the matching labels.
unique_sentences, unique_indices = np.unique(trX, return_index=True)
trY = trY[unique_indices]
trX = unique_sentences.tolist()

# Print the number of sentences and the label-array shape of each split.
for sentences, labels in ((trX, trY), (teX, teY)):
    print(len(sentences), labels.shape)

## CNN corpus

Load the file names respectively for training, validation and test sets

In [None]:
import pandas as pd
# Read the story-id tables of the training, validation and test sets.
train_id, dev_id, test_id = [
    pd.read_csv(csv_path)
    for csv_path in (
        './cnn/train_story_ids.csv',
        './cnn/dev_story_ids.csv',
        './cnn/test_story_ids.csv',
    )
]

Read CNN news articles, remove the document header and footer.

In [None]:
def read_cnn(path):
    """Read one CNN story file and return its non-empty text lines.

    Every line is stripped of surrounding whitespace first; lines that are
    then empty or equal to the ``@highlight`` marker are dropped. Stripping
    before filtering means a final ``@highlight`` without a trailing newline
    and whitespace-only lines are dropped as well.

    Args:
        path: Path of the UTF-8 story file to read.

    Returns:
        A list with the remaining stripped lines, in file order.
    """
    data = []
    with open(path, encoding='utf8') as fp:
        for raw_line in fp:
            sentence = raw_line.strip()
            if sentence and sentence != '@highlight':
                data.append(sentence)
    return data

# Collect the lines of every training story into one flat list. extend()
# appends in place; `trX = trX + ...` copied the whole list for every story.
trX = []
for path in train_id['story_id']:
    trX.extend(read_cnn(path))

# Same for the validation stories.
veX = []
for path in dev_id['story_id']:
    veX.extend(read_cnn(path))

Save training and validation corpus in two separate txt files

In [None]:
# trX / veX are lists of sentences, and file.write() only accepts a string,
# so each sentence is written on its own line. The `with` blocks close the
# files even if writing fails.
with open("cnn_training.txt", "w", encoding="utf-8") as text_file:
    text_file.writelines(sentence + "\n" for sentence in trX)

with open("cnn_validation.txt", "w", encoding="utf-8") as text_file:
    text_file.writelines(sentence + "\n" for sentence in veX)

## Bloomberg corpus

Integrate separate news article files into a single txt file, remove the document header and footer.

In [None]:
import json
import os
import pickle

# Walk the Bloomberg news directory and write the body of every article as
# one line of a single output file.
file_num = 0      # number of files visited
ignore_num = 0    # number of files skipped because no body could be cut out

# The articles are read as UTF-8, so the output is written as UTF-8 too;
# with the platform default encoding a write could fail on characters the
# default codec cannot represent.
with open('G:\\2017-2018 courses\\PRIM2017\\sentiment-discovery-master\\data\\Bloomberg2.txt', 'w', encoding='utf8') as f:
    for root, dirs, files in os.walk('G:\\2017-2018 courses\\PRIM2017\\financial-news-dataset-master\\20061020_20131126_bloomberg_news'):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf8') as lines:
                # Flatten the article: each stripped line followed by a space.
                article = ''.join(line.strip() + ' ' for line in lines)
            try:
                # Keep the text after the first ".html" and before the
                # reporter-contact footer.
                content = article.split(".html")[1].split("To contact the reporter on this story:")[0].strip()
            except IndexError:
                # No ".html" marker in this file: nothing to extract.
                ignore_num += 1
                if ignore_num%100 == 0:
                    print("* ignored file: ", ignore_num)
            else:
                f.write(content)
                f.write("\n")
            file_num += 1
            if file_num%1000 == 0:
                print("* number of file: ", file_num)
print("* number of file: ", file_num)
print("* ignored file: ", ignore_num)

## MPQA2.0 Dataset

Load the documents, find the target sentence with contextual polarity tag

In [None]:
import pandas as pd
import os
import numpy as np
import nltk

# Load the pickled MPQA feature table.
df = pd.read_pickle("G:\\2017-2018 courses\\PRIM2017\\database.mpqa.2.0\\mpqa_features.pickle")
paths = df.index.values
keywords = df["context_"].values
# Binarise the polarity column: negative values become 0, everything else 1.
sentiments = df['c_pol'].map(lambda k: 0 if k < 0 else 1).values

mpqa_senti = list()

# Each index entry supplies two path components (p[0], p[1]) under the docs
# folder; each "context_" entry is indexed at positions 0, 1 and 2.
for p, context, polarity in zip(paths, keywords, sentiments):
    path = os.path.join("G:\\2017-2018 courses\\PRIM2017\\database.mpqa.2.0\\docs",p[0],p[1])
    with open(path, 'r', encoding='utf8') as file:
        document = file.read().replace('\n', '')
    # Keep every sentence that contains all three context strings.
    for line in nltk.sent_tokenize(document):
        if all(context[k] in line for k in range(3)):
            mpqa_senti.append((polarity, line))

Split the whole dataset as training(0.7), validation(0.2), test(0.1)

In [None]:
from sklearn.model_selection import train_test_split
# First hold out 30% of the samples, then give 33% of that hold-out to the
# test set and the rest to the validation set.
mpqa_senti_train, mpqa_senti_test = train_test_split(
    mpqa_senti,
    test_size=0.3,
    random_state=42,
)
mpqa_senti_dev, mpqa_senti_test = train_test_split(
    mpqa_senti_test,
    test_size=0.33,
    random_state=42,
)

Save to csv file

In [None]:
import csv 
# Write the (label, sentence) pairs of the MPQA test split under a header row.
with open("mpqa2.0_test.csv", "w", newline='', encoding="utf8") as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(["label","sentence"])
    writer.writerows(mpqa_senti_test)

## IMDb dataset

imdb dataset, containing 50000 movie reviews.

Load from tsv file.

In [None]:
import pandas as pd       
# Tab-separated file with a header row; quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the reviews are kept as plain text.
data = pd.read_csv(
    "imdb50000/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Reviews are the inputs, the sentiment column holds the labels.
X, y = data["review"], data["sentiment"]

Split the whole dataset as training(0.7), validation(0.2), test(0.1)

In [None]:
from sklearn.model_selection import train_test_split
# First hold out 30% of the reviews, then give 33% of that hold-out to the
# test set and the rest to the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.33,
    random_state=42,
)

Save to csv file

In [None]:
import csv 

# Write the validation reviews as "label,sentence" rows, replacing the HTML
# line-break tag "<br />" with a space.
with open("imdb_dev.csv", "w", newline='', encoding="utf8") as output:
    writer = csv.writer(output, delimiter=',')
    writer.writerow(["label","sentence"])
    writer.writerows(
        (label, review.replace("<br />", " "))
        for label, review in zip(y_dev.values, X_dev.values)
    )