In [1]:
import numpy as np
from lxml import etree
from copy import deepcopy
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, SubsetRandomSampler
from gensim.models.fasttext import FastText
from sklearn.metrics import f1_score



In [2]:
device = torch.device("cuda:0")

## Function to extract data from the XML file

In [3]:
def extract_XML_sentiment(xml, l):
    tree = etree.parse(xml)
    tweets = []
    sentiments = []
    sens = 0

    for t in tree.xpath("database/table/column"):
        attribut = t.get("name")        
        if (attribut == "text"):
            tweet = t.text
            tweet = tweet.replace('&amp;', '').replace('amp;', '').replace('nbsp;', '').replace('&quot;', '').replace('&gt;', '').replace('raquo;', '').replace('mdash;', '').replace('laquo;', '').replace('RT ', '').replace('&lt;', '')
        
        if attribut in l:
            if (t.text != "NULL"):
                sens += int(t.text)

        if (attribut == l[-1]):
            sens = np.sign(sens)
            if (sens != 0):
                tweets.append(tweet)
                sentiments.append(0 if sens == -1 else 1)
            sens = 0
    
    return [tweets, sentiments]

## Function to train and predict

In [4]:
def train_model(model, train_loader, loss, optimizer, num_epochs):      
    for epoch in range(num_epochs):
        model.train()  
        
        for i_step, (x, y) in enumerate(train_loader):
            x_gpu = x.to(device, dtype=torch.float)
            y_gpu = y.to(device, dtype=torch.long)
            prediction = model(x_gpu)
            loss_value = loss(prediction, y_gpu)
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()
        
        
def predict(model, test_loader):
    predictions = []
    model.eval()
    with torch.no_grad():
        for i, (x, _) in enumerate(test_loader):
            x_gpu = x.to(device, dtype=torch.float)

            prediction = model(x_gpu)
            _, indices = torch.max(prediction.data, 1)

            predictions.append(indices.cpu().numpy())
            
    return np.concatenate(predictions)

## Loading data from XML
#### For bank

In [5]:
banks_list = ["sberbank", "vtb", "gazprom", "alfabank", "bankmoskvy", "raiffeisen", "uralsib", "rshb"]

bank_data_train = extract_XML_sentiment("bank_train_2016.xml", banks_list)
bank_data_test = extract_XML_sentiment("banks_test_etalon.xml", banks_list)

#### For telecom

In [6]:
telecom_list = ["beeline", "mts", "megafon", "tele2", "rostelecom", "komstar", "skylink"]

telecom_data_train = extract_XML_sentiment("tkk_train_2016.xml", telecom_list)
telecom_data_test = extract_XML_sentiment("tkk_test_etalon.xml", telecom_list)

# Part 1: Character-level convolutional neural network

##### Alphabet

In [7]:
alphabet = np.array(["а", "б", "в", "г", "д", "е", "ё", "ж", "з", "и", "й", "к", "л", "м", "н", "о",
                     "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я",
                     "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "-", ",", ";", ".", "!", "?",
                     ":", "’", "\"", "«", "»", "/", "\\", "|", "_", "@", "#", "$", "%", "ˆ", "&", "*", 
                     "˜", "‘", "+", "-", "=", "<", ">", "(", ")", "[", "]", "{", "}", " "])

##### Quantization function

In [8]:
def list_quantization(list_text):
    list_quant = []
    for sentence in list_text:
        sentence_quant = []
        for letter in sentence:
            letter_quant = (alphabet == letter.lower())
            letter_quant = letter_quant * 1
            sentence_quant.append(letter_quant)
        list_quant.append(np.array(sentence_quant).T)
    return list_quant

## Preparing data 

##### Batch size

In [9]:
batch_size = 128

### Implement dataset

In [10]:
class charCNN_Dataset(Dataset):
    def __init__ (self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data[0])
        
    def __getitem__(self, index):
        tweet = self.data[0][index]
        miss_data = 140 - len(tweet.T)
        data_to_add = np.zeros((len(alphabet), miss_data))
        tweet = np.concatenate((tweet, data_to_add), axis = 1)
        
        sentiment = self.data[1][index]
        
        return tweet, sentiment

### Load bank data

In [11]:
charCNN_bank_data_train = deepcopy(bank_data_train)
charCNN_bank_data_test = deepcopy(bank_data_test)

#### Quantization of the tweets

In [12]:
charCNN_bank_data_train[0] = list_quantization(charCNN_bank_data_train[0])
charCNN_bank_data_test[0] = list_quantization(charCNN_bank_data_test[0])

#### Creating bank dataset

In [13]:
charCNN_bank_dataset_train = charCNN_Dataset(charCNN_bank_data_train)
charCNN_bank_dataset_test = charCNN_Dataset(charCNN_bank_data_test)

charCNN_bank_train_loader = torch.utils.data.DataLoader(charCNN_bank_dataset_train, batch_size = batch_size)
charCNN_bank_test_loader = torch.utils.data.DataLoader(charCNN_bank_dataset_test, batch_size = batch_size)

### Load telecom data

In [14]:
charCNN_telecom_data_train = deepcopy(telecom_data_train)
charCNN_telecom_data_test = deepcopy(telecom_data_test)

#### Quantization of the tweets

In [15]:
charCNN_telecom_data_train[0] = list_quantization(charCNN_telecom_data_train[0])
charCNN_telecom_data_test[0] = list_quantization(charCNN_telecom_data_test[0])

#### Creating telecom dataset

In [16]:
charCNN_telecom_dataset_train = charCNN_Dataset(charCNN_telecom_data_train)
charCNN_telecom_dataset_test = charCNN_Dataset(charCNN_telecom_data_test)

charCNN_telecom_train_loader = torch.utils.data.DataLoader(charCNN_telecom_dataset_train, batch_size = batch_size)
charCNN_telecom_test_loader = torch.utils.data.DataLoader(charCNN_telecom_dataset_test, batch_size = batch_size)

## Character-level Convolutional Neural Networks 

In [17]:
class CharCNN(nn.Module):
    def __init__(self):
        super(CharCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(78, 256, kernel_size = 7, stride = 1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 3))
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size = 7, stride = 1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 3))
        
        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size = 3, stride = 1),
            nn.ReLU())
        
        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size = 3, stride = 1),
            nn.ReLU())
        
        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size = 3, stride = 1),
            nn.ReLU())
        
        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size = 3, stride = 1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = 3))
        
        self.fc1 = nn.Sequential(
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Dropout(0.5))
        
        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(0.5))
        
        self.fc3 = nn.Linear(1024, 2)
        
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                m.weight.data.normal_(0, 0.05)
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.05)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)        
        x = x.view(x.size(0), -1)        
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        
        return x

## Training and testing

#### For bank data 

In [18]:
charCNN_bank = CharCNN()

charCNN_bank.type(torch.cuda.FloatTensor)
charCNN_bank.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(charCNN_bank.parameters(), lr = 1e-3)

In [19]:
%%time

train_model(charCNN_bank, charCNN_bank_train_loader, loss, optimizer, 100)
        
charCNN_bank_predictions = predict(charCNN_bank, charCNN_bank_test_loader)

f1_macro_score = f1_score(charCNN_bank_data_test[1], charCNN_bank_predictions, average = 'macro')
f1_micro_score = f1_score(charCNN_bank_data_test[1], charCNN_bank_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.5766053616967375
F1-measure with micro averaging =  0.7122774133083412
Wall time: 4min 2s


#### For telecom data

In [20]:
charCNN_telecom = CharCNN()

charCNN_telecom.type(torch.cuda.FloatTensor)
charCNN_telecom.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(charCNN_telecom.parameters(), lr = 1e-3)

In [21]:
%%time

train_model(charCNN_telecom, charCNN_telecom_train_loader, loss, optimizer, 100)
        
charCNN_telecom_predictions = predict(charCNN_telecom, charCNN_telecom_test_loader)

f1_macro_score = f1_score(charCNN_telecom_data_test[1], charCNN_telecom_predictions, average = 'macro')
f1_micro_score = f1_score(charCNN_telecom_data_test[1], charCNN_telecom_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.621301057533715
F1-measure with micro averaging =  0.7402164862614488
Wall time: 6min 4s


# Part 2: Convolutional Neural Networks for Sentence Classification
### Using a pretrained model

##### Load fasttext model 

In [22]:
fasttext_model = FastText.load_fasttext_format('cc.ru.300.bin')

##### Word to vector function

In [23]:
def convert_to_vectors(model, list_text):
    list_vect = []
    for sentence in list_text:
        sentence_vect = []
        for word in sentence.split():
            if word.lower() in model.wv.vocab:
                sentence_vect.append(model.wv[word])
            else:
                sentence_vect.append(np.zeros(model.vector_size))
        list_vect.append(np.array(sentence_vect).T)
    return list_vect

### Preparing data

##### Batch size

In [24]:
batch_size = 50

### Implement dataset

In [25]:
class SC_Dataset(Dataset):
    def __init__ (self, data, model_vect_size):
        self.data = data
        self.model_vect_size = model_vect_size
        
    def __len__(self):
        return len(self.data[0])
        
    def __getitem__(self, index):
        tweet = self.data[0][index]
        miss_data = 30 - len(tweet.T)
        data_to_add = np.zeros((self.model_vect_size, miss_data))        
        tweet = np.concatenate([tweet, data_to_add], axis = 1)
        
        sentiment = self.data[1][index]
        
        return tweet, sentiment

### Load bank data

In [26]:
sc_bank_data_train = deepcopy(bank_data_train)
sc_bank_data_test = deepcopy(bank_data_test)

##### Vectorization

In [27]:
sc_bank_data_train[0] = convert_to_vectors(fasttext_model, sc_bank_data_train[0])
sc_bank_data_test[0] = convert_to_vectors(fasttext_model, sc_bank_data_test[0])

##### Creating bank dataset

In [28]:
sc_bank_dataset_train = SC_Dataset(sc_bank_data_train, fasttext_model.vector_size)
sc_bank_dataset_test = SC_Dataset(sc_bank_data_test, fasttext_model.vector_size)

sc_bank_train_loader = torch.utils.data.DataLoader(sc_bank_dataset_train, batch_size = batch_size)
sc_bank_test_loader = torch.utils.data.DataLoader(sc_bank_dataset_test, batch_size = batch_size)

### Load telecom data

In [29]:
sc_telecom_data_train = deepcopy(telecom_data_train)
sc_telecom_data_test = deepcopy(telecom_data_test)

##### Vectorization

In [30]:
sc_telecom_data_train[0] = convert_to_vectors(fasttext_model, sc_telecom_data_train[0])
sc_telecom_data_test[0] = convert_to_vectors(fasttext_model, sc_telecom_data_test[0])

##### Creating telecom dataset

In [31]:
sc_telecom_dataset_train = SC_Dataset(sc_telecom_data_train, fasttext_model.vector_size)
sc_telecom_dataset_test = SC_Dataset(sc_telecom_data_test, fasttext_model.vector_size)

sc_telecom_train_loader = torch.utils.data.DataLoader(sc_telecom_dataset_train, batch_size = batch_size)
sc_telecom_test_loader = torch.utils.data.DataLoader(sc_telecom_dataset_test, batch_size = batch_size)

## Sentence Classification Convolutional Neural Network

In [32]:
class SentClassCNN(nn.Module):
    def __init__(self):
        super(SentClassCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(300, 100, kernel_size = 3),
            nn.ReLU())
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(300, 100, kernel_size = 4),
            nn.ReLU())
        
        self.conv3 = nn.Sequential(
            nn.Conv1d(300, 100, kernel_size = 5),
            nn.ReLU())
        
        self.pool1 = nn.MaxPool1d(28)
        self.pool2 = nn.MaxPool1d(27)
        self.pool3 = nn.MaxPool1d(26)
        
        self.drop = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(300, 2)
        
        
    def forward(self, x):
        c1 = self.conv1(x)
        c2 = self.conv2(x)
        c3 = self.conv3(x)
        
        p1 = self.pool1(c1).squeeze(2)
        p2 = self.pool1(c2).squeeze(2)
        p3 = self.pool1(c3).squeeze(2)
        
        cat = torch.cat([p1, p2, p3], dim = 1)
        d = self.drop(cat)
    
        fc = self.fc1(d)
        
        return fc        

## Training and Testing

#### For bank data

In [33]:
scCNN_bank = SentClassCNN()

scCNN_bank.type(torch.cuda.FloatTensor)
scCNN_bank.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(scCNN_bank.parameters(), lr = 1e-3)

In [34]:
%%time

train_model(scCNN_bank, sc_bank_train_loader, loss, optimizer, 30)

sc_bank_predictions = predict(scCNN_bank, sc_bank_test_loader)

f1_macro_score = f1_score(sc_bank_data_test[1], sc_bank_predictions, average = 'macro')
f1_micro_score = f1_score(sc_bank_data_test[1], sc_bank_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.7825898509501852
F1-measure with micro averaging =  0.837863167760075
Wall time: 29.5 s


#### For telecom data

In [35]:
scCNN_telecom = SentClassCNN()

scCNN_telecom.type(torch.cuda.FloatTensor)
scCNN_telecom.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(scCNN_telecom.parameters(), lr = 1e-3)

In [36]:
%%time

train_model(scCNN_telecom, sc_telecom_train_loader, loss, optimizer, 30)

sc_telecom_predictions = predict(scCNN_telecom, sc_telecom_test_loader)

f1_macro_score = f1_score(sc_telecom_data_test[1], sc_telecom_predictions, average = 'macro')
f1_micro_score = f1_score(sc_telecom_data_test[1], sc_telecom_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.7373973696006297
F1-measure with micro averaging =  0.8667776852622814
Wall time: 44.6 s


# Part 3:
### Using our model

##### Function to build our FastText model

We split the csv in 6 parts and we start to train a model with one part than we train it with the others part and save it on the disk. This avoid memory error.

In [None]:
"""
def create_model():
    model = FastText(size = 300)
    for i in range(0, 6):
        print(i)
        tweets = pd.read_csv('tweets/tweets_%d.csv' %i, names = ['tweets'])
        tweets = tweets.values.reshape(len(tweets)).tolist()
        tweets = list(map(str, tweets))
        tweets = list(map(lambda t : t.split(), tweets))
        
        if (i != 0):
            model.build_vocab(sentences = tweets, update = True)
        else:
            model.build_vocab(sentences = tweets)

        model.train(sentences = tweets, total_examples = len(tweets), epochs = 5)
    
    model.save('model/tweets_model')
    del tweets
    del model
    
create_model()
"""

##### Loading our model

In [38]:
tweets_model = FastText.load('model/tweets_model')

##### Batch size

In [39]:
batch_size = 50

### Load bank data

In [40]:
sc2_bank_data_train = deepcopy(bank_data_train)
sc2_bank_data_test = deepcopy(bank_data_test)

##### Vectorization

In [41]:
sc2_bank_data_train[0] = convert_to_vectors(tweets_model, sc2_bank_data_train[0])
sc2_bank_data_test[0] = convert_to_vectors(tweets_model, sc2_bank_data_test[0])

##### Creating bank dataset

In [42]:
sc2_bank_dataset_train = SC_Dataset(sc2_bank_data_train, tweets_model.vector_size)
sc2_bank_dataset_test = SC_Dataset(sc2_bank_data_test, tweets_model.vector_size)

sc2_bank_train_loader = torch.utils.data.DataLoader(sc2_bank_dataset_train, batch_size = batch_size)
sc2_bank_test_loader = torch.utils.data.DataLoader(sc2_bank_dataset_test, batch_size = batch_size)

### Load telecom data

In [43]:
sc2_telecom_data_train = deepcopy(telecom_data_train)
sc2_telecom_data_test = deepcopy(telecom_data_test)

##### Vectorization

In [44]:
sc2_telecom_data_train[0] = convert_to_vectors(tweets_model, sc2_telecom_data_train[0])
sc2_telecom_data_test[0] = convert_to_vectors(tweets_model, sc2_telecom_data_test[0])

##### Creating telecom dataset

In [45]:
sc2_telecom_dataset_train = SC_Dataset(sc2_telecom_data_train, tweets_model.vector_size)
sc2_telecom_dataset_test = SC_Dataset(sc2_telecom_data_test, tweets_model.vector_size)

sc2_telecom_train_loader = torch.utils.data.DataLoader(sc2_telecom_dataset_train, batch_size = batch_size)
sc2_telecom_test_loader = torch.utils.data.DataLoader(sc2_telecom_dataset_test, batch_size = batch_size)

## Training and Testing

#### For bank data

In [46]:
sc2CNN_bank = SentClassCNN()

sc2CNN_bank.type(torch.cuda.FloatTensor)
sc2CNN_bank.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(sc2CNN_bank.parameters(), lr = 1e-3)

In [47]:
%%time

train_model(sc2CNN_bank, sc2_bank_train_loader, loss, optimizer, 30)

sc2_bank_predictions = predict(sc2CNN_bank, sc2_bank_test_loader)

f1_macro_score = f1_score(sc2_bank_data_test[1], sc2_bank_predictions, average = 'macro')
f1_micro_score = f1_score(sc2_bank_data_test[1], sc2_bank_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.8079423731961528
F1-measure with micro averaging =  0.852858481724461
Wall time: 29.8 s


#### For telecom data

In [48]:
sc2CNN_telecom = SentClassCNN()

sc2CNN_telecom.type(torch.cuda.FloatTensor)
sc2CNN_telecom.to(device)

loss = nn.CrossEntropyLoss().type(torch.cuda.FloatTensor)
optimizer = optim.Adam(sc2CNN_telecom.parameters(), lr = 1e-3)

In [49]:
%%time

train_model(sc2CNN_telecom, sc2_telecom_train_loader, loss, optimizer, 30)

sc2_telecom_predictions = predict(sc2CNN_telecom, sc2_telecom_test_loader)

f1_macro_score = f1_score(sc2_telecom_data_test[1], sc2_telecom_predictions, average = 'macro')
f1_micro_score = f1_score(sc2_telecom_data_test[1], sc2_telecom_predictions, average = 'micro')

print("F1-measure with macro averaging = ", f1_macro_score)
print("F1-measure with micro averaging = ", f1_micro_score)

F1-measure with macro averaging =  0.7373370658735019
F1-measure with micro averaging =  0.8501248959200666
Wall time: 44.5 s
