In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fasttextmodel/fasttextvec.pt
/kaggle/input/fasttextmodel/IMDB_split.pt
/kaggle/input/fasttextmodel/FastText.model.wv.vectors_ngrams.npy
/kaggle/input/fasttextmodel/FastText.model
/kaggle/input/fasttextmodel/IMDB_cleaned.pt
/kaggle/input/fasttextmodel/data.csv


In [2]:
import torch
from torch.utils.data import Dataset
from tqdm.notebook import tqdm, trange
from gensim.models import Word2Vec,KeyedVectors,FastText

In [3]:
def load_fastText(path='../input/fasttextmodel/FastText.model'):
    myModel = FastText.load(path)
    vocab_npa = np.array(list(myModel.wv.key_to_index.keys()))
    embs_npa = []
    for w in vocab_npa:
        embs_npa.append(myModel.wv[w])
    embs_npa = np.stack(embs_npa)
    vocab_npa = np.insert(vocab_npa, 0, '<pad>')
    vocab_npa = np.insert(vocab_npa, 1, '<unk>')
    pad_emb_npa = np.zeros((1,embs_npa.shape[1]))
    unk_emb_npa = np.mean(embs_npa,axis=0,keepdims=True)
    embs_npa = np.vstack((pad_emb_npa,unk_emb_npa,embs_npa))
    my_embedding_layer = torch.nn.Embedding.from_pretrained(torch.from_numpy(embs_npa).float())
    return vocab_npa, my_embedding_layer

In [4]:
fastVocab,fastEmbeddings = load_fastText()

In [5]:
class CNN_Model(torch.nn.Module):
    def __init__(self, vocab, embedding_layer, n_gram=3, max_words=100):
        super(CNN_Model, self).__init__()
        self.embedding_layer = embedding_layer
        self.cnn1 = torch.nn.Conv1d(100, 50, n_gram)
        self.max_pool = torch.nn.MaxPool1d(embedding_layer.weight.shape[1] - int(2*(n_gram//2)))
        self.linear_1 = torch.nn.Linear(50, 32)
        self.linear_2 = torch.nn.Linear(32, 3)
        self.relu = torch.nn.ReLU()
        self.vocab = [word for word in vocab]
        self.max_words = max_words
        self.numpy_flag = False
        self.softmax = torch.nn.Softmax(dim=1)
        
    def sentence_to_idx(self, inputs):
        self.vocab = [word for word in fastVocab]
        inputs = [[self.vocab.index(word) if word in self.vocab else 0 for word in sentence.split()[:100]] for sentence in inputs]
        inputs = [x+[0]*(self.max_words-len(x)) for x in inputs]
        return torch.tensor(inputs)
    
    def get_cnn_embeds(self, inputs):
        inputs = self.sentence_to_idx(inputs)
        embeds = self.embedding_layer(inputs)
        x = self.cnn1(embeds)
        x = self.max_pool(x)
        x = self.relu(x)
        x = torch.flatten(x, 1)
        return x
    
    def get_linear(self, x):
        self.numpy_flag = False
        if type(x) == np.ndarray:
            x = torch.from_numpy(x)
            x = x.float()
            self.numpy_flag = True
        x = self.linear_1(x)
        x = self.relu(x)
        x = self.linear_2(x)
        if self.numpy_flag:
            x = self.softmax(x)
            x = x.detach().numpy()
        return x
        
    def forward(self, inputs):
        x = self.get_cnn_embeds(inputs)
        x = self.get_linear(x)
        return x

In [6]:
n_gram = 3
model = CNN_Model(fastVocab, fastEmbeddings, n_gram=n_gram)

In [7]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

In [8]:
class SentencesDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    # get items    
    def __getitem__(self, idx):
        return {'sentence':self.data[idx][0], 'label': 1 if self.data[idx][1] == 'positive' else 0}

In [9]:
train = pd.read_csv('../input/fasttextmodel/data.csv')
# train = train.values
del train['Unnamed: 0']
train.rename(columns={'0': 'sentence', '1': 'label'})


Unnamed: 0,sentence,label
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically s family little boy jake thinks s ...,negative
4,petter mattei s love time money visually stu...,positive
...,...,...
49995,thought movie right good job nt creative orig...,positive
49996,bad plot bad dialogue bad acting idiotic di...,negative
49997,catholic taught parochial elementary schools n...,negative
49998,m going disagree previous comment side maltin ...,negative


In [10]:
train_small = train[:100]

Unnamed: 0,0,1
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically s family little boy jake thinks s ...,negative
4,petter mattei s love time money visually stu...,positive
...,...,...
95,daniel daylewis versatile actor alive english...,positive
96,guess would originally going least two parts ...,negative
97,well like watch bad horror bmovies cause thi...,negative
98,worst movie ever seen well worst probably ev...,negative


In [27]:
train_small = train_small.values

In [11]:
train = train.values
sentences = train.T[1]

number_of_words_per_sentence = []
for sentence in sentences:
    number_of_words_per_sentence.append(len(sentence))
number_of_words_per_sentence = np.array(number_of_words_per_sentence)
print('mean', np.mean(number_of_words_per_sentence))
print('std', np.std(number_of_words_per_sentence))

mean 8.0
std 0.0


In [12]:
train[0][1]

'positive'

In [28]:
train_data_small = SentencesDataset(train_small)

In [14]:
train_dataset = SentencesDataset(train)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=50, shuffle=False)
model.embedding_layer.weight.requires_grad = True

In [15]:
print(train_dataset.__getitem__(0))

{'sentence': 'one reviewers mentioned watching 1 oz episode ll hooked  right  exactly happened methe first thing struck oz brutality unflinching scenes violence  set right word go  trust  show faint hearted timid  show pulls punches regards drugs  sex violence  hardcore  classic use wordit called oz nickname given oswald maximum security state penitentary  focuses mainly emerald city  experimental section prison cells glass fronts face inwards  privacy high agenda  em city home manyaryans  muslims  gangstas  latinos  christians  italians  irish  so scuffles  death stares  dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows would nt dare  forget pretty pictures painted mainstream audiences  forget charm  forget romance  oz nt mess around  first episode ever saw struck nasty surreal  could nt say ready  watched  developed taste oz  got accustomed high levels graphic violence  violence  injustice  crooked guards ll sold nickel  inmates ll kill or

In [16]:
def train_model():
    for epoch in range(10):
        running_loss, running_acc = 0.0, 0.0
        bar = tqdm(enumerate(train_loader), total=train_loader.__len__())
        for batch_idx, data in bar:
            inputs = data['sentence']
            labels = data['label']
            outputs = model(inputs)
            optimizer.zero_grad()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            acc = torch.mean((torch.argmax(outputs, 1)==labels).float())
            running_loss += loss.item()
            running_acc += acc.item()
            bar.set_description(str({'epoch':epoch+1, 'loss': round(running_loss/(batch_idx+1), 4), 'acc': round(running_acc/(batch_idx+1), 4)}))
            bar.refresh()
        bar.close()
        if epoch%10==0:
            torch.save({'vocab': fastVocab, 'embedding_layer': fastEmbeddings, 'n_gram':n_gram, 'params': model.state_dict()}, 'model_epoch'+str(epoch)+'.pt')

In [17]:
train_model()

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [18]:
import lime
import lime.lime_tabular

In [23]:
def test(train_dataset):
    loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=8, shuffle=False)
    model.eval()
    bar = tqdm(enumerate(loader), total=100)#len(train_loader))
    for batch_idx, data in bar:
        inputs = data['sentence']
        labels = data['label']
        if batch_idx == 0:
            n_gram_embeds = model.get_cnn_embeds(inputs)
        else:
            n_gram_embeds = torch.cat((n_gram_embeds, model.get_cnn_embeds(inputs)), 0)
    
    explainer = lime.lime_tabular.LimeTabularExplainer(n_gram_embeds.detach().numpy(), feature_names=['cnn'+str(i) for i in range(n_gram_embeds.shape[1])], class_names=['sentiment_'+str(i) for i in range(3)], discretize_continuous=True)
    
    N = n_gram_embeds.shape[0]
    cnn_w = model.cnn1.weight.detach().numpy()
    
    explained_cnn = []
    for i in trange(N):
        for ii in range(3):
            exp = explainer.explain_instance(n_gram_embeds[i].detach().numpy(), model.get_linear, num_features=50, num_samples=400, top_labels=ii)
            try:
                r = exp.as_list()
                break
            except:
                continue
        feature_importance = []
        for j in r:
            f = j[0].split()
            for k in f:
                if 'cnn' in k:
                    f = int(k.replace('cnn', ''))
                    break
            feature_importance.append([f, float(j[1])])
        feature_importance = np.array(feature_importance)
        index = np.argsort(feature_importance.T[0])
        feature_importance = feature_importance[index]
        weightage_carry = []
        for iii in range(cnn_w.shape[0]):
            w_per_input = np.mean(cnn_w[iii], axis=1) * feature_importance[iii][1]
            weightage_carry.append(w_per_input)
        weightage_carry = np.stack(weightage_carry)
        weightage_carry = np.mean(weightage_carry, axis=0)
        indexes = np.argsort(weightage_carry)[::-1]
        sentence = train_dataset.__getitem__(i)['sentence'].split()
        sentence += [' ' for i in range(model.max_words-len(sentence))]
        sentence = np.array(sentence)
        sentence = sentence[indexes]
        weightage_carry = weightage_carry[indexes]
        sentence_eval = np.array([[word, weightage_carry[index]] for index, word in enumerate(sentence) if word!=' '])
        sentence_eval_with_label = {'sentence': train_dataset.__getitem__(i)['sentence'], 'evals': sentence_eval, 'label': train_dataset.__getitem__(i)['label']}
        explained_cnn.append(sentence_eval_with_label)
        if i in [10, 50, 100]:
            torch.save(explained_cnn, 'explainations'+str(i)+'.pt')
            explained_cnn = []
    return explained_cnn

In [29]:
print(train_data_small.__getitem__(0))

{'sentence': 'one reviewers mentioned watching 1 oz episode ll hooked  right  exactly happened methe first thing struck oz brutality unflinching scenes violence  set right word go  trust  show faint hearted timid  show pulls punches regards drugs  sex violence  hardcore  classic use wordit called oz nickname given oswald maximum security state penitentary  focuses mainly emerald city  experimental section prison cells glass fronts face inwards  privacy high agenda  em city home manyaryans  muslims  gangstas  latinos  christians  italians  irish  so scuffles  death stares  dodgy dealings shady agreements never far awayi would say main appeal show due fact goes shows would nt dare  forget pretty pictures painted mainstream audiences  forget charm  forget romance  oz nt mess around  first episode ever saw struck nasty surreal  could nt say ready  watched  developed taste oz  got accustomed high levels graphic violence  violence  injustice  crooked guards ll sold nickel  inmates ll kill or

In [30]:
output = test(train_data_small)
torch.save(output, 'explainations.pt')

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]