In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
kaggle = True   # True when running on Kaggle: later cells then read from /kaggle/input/... instead of local relative paths
saved_model = False   # True: load weights from a saved model directory; False: load the stock 'bert-base-uncased' checkpoint

In [3]:
# Helper scripts: on Kaggle, copy them from the input dataset next to the
# notebook (presumably so that the plain imports below can find them).
if kaggle:
    from shutil import copyfile
    for script_src, script_dst in (
        ("/kaggle/input/quorascripts/input_net.py", "../working/input_net.py"),
        ("/kaggle/input/quorascripts/utils.py", "../working/utils.py"),
    ):
        copyfile(src = script_src, dst = script_dst)

import utils
import input_net

# Packages
from os import path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import random
import datetime
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

# Transformers
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

Using TensorFlow backend.


In [4]:
if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU, then report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Models

In [5]:
model_class = BertForSequenceClassification
tokenizer_class = BertTokenizer
tokenizer_model = 'bert-base-uncased'
tokenizer = tokenizer_class.from_pretrained(tokenizer_model, do_lower_case=True)

# Choose which checkpoint to load, driven by the `saved_model` / `kaggle` flags.
if saved_model:
    pretrained_model = "/kaggle/input/trained-model" if kaggle else "models"
else:
    # NOTE(review): the stock 'bert-base-uncased' checkpoint carries no trained
    # 2-label classification head, so with saved_model = False the predictions
    # made further down presumably come from a freshly initialised classifier.
    # Confirm this is intended before interpreting the results.
    pretrained_model = 'bert-base-uncased'

# `.to(device)` already places the model on the selected device, so the former
# `if device == "cuda": model.cuda()` step was redundant and has been removed.
model = model_class.from_pretrained(
    pretrained_model,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)
print("Model loaded")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Model loaded


## Input

In [6]:
max_len = 70

# (pre-tokenized model input, raw training pairs), chosen by environment.
INPUT_NET, TRAIN = (
    ('/kaggle/input/quora-data/data/data/70/input70_bert-base-uncased.csv', "/kaggle/input/quora-data/train.csv")
    if kaggle
    else ("data/input.csv", "data/train.csv")
)

df = pd.read_csv(INPUT_NET)
# Missing cells in the training file are replaced by empty strings.
df_train = pd.read_csv(TRAIN).fillna("")

In [7]:
# Question-id pair (qid1, qid2) of every training row. execute1() reads this
# module-level name by row position, and a later cell rebinds `ids` to a
# different frame, so execute1() has to run before that cell.
ids = df_train[["qid1", "qid2"]]

In [8]:
# Plain array view of the input table: every column but the last is handed to
# utils.ToTensor as model input, the last column as the label.
df = np.array(df)
X_i, X_s, X_p, y = utils.ToTensor(df[:, :-1], df[:, -1])
dataset = TensorDataset(X_i, X_s, X_p, y)

# Seed every random number generator in use with the same value.
seed_val = 47
for seed_everything in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_everything(seed_val)

In [9]:
batch_size = 256

# Batches are pulled sequentially so that batch order equals dataset order;
# execute1() maps predictions back to rows of `ids` by position.
execution_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=SequentialSampler(dataset))

print("Number of execution batches:", len(execution_dataloader))

Number of execution batches: 1580


In [10]:
def execute1():
    '''
    Execute the model in order to obtain predicted classes.

    Every pair of `execution_dataloader` is classified; each pair is registered
    through utils.e, and pairs predicted as duplicates whose questions are not
    yet in the same class are merged through utils.class_join.

    Reads the module-level `model`, `device`, `execution_dataloader` and `ids`.
    Row k of the dataloader is matched with row k of `ids`, so the dataloader
    must iterate sequentially over the rows `ids` was built from.

    - output:   nodes: dictionary keyed by question id (nodes[q] is compared
                       between two questions to decide whether to join them)
                classes: dictionary of classes
    '''
    nodes = {}
    classes = {}
    model.eval()

    # One bulk conversion instead of two .iloc scalar lookups per row.
    pair_ids = ids.to_numpy()
    # Running row position; independent of the global batch_size.
    pos = 0

    start = time.time()
    with torch.no_grad():
        for batch in execution_dataloader:

            # Without labels, the first output element holds the logits.
            logits = model(input_ids=batch[0].to(device), token_type_ids=batch[1].to(device), attention_mask=batch[2].to(device))[0] # using bert
            # Softmax is monotonic, so argmax over the raw logits gives the same class.
            # tolist() moves the whole batch to the host once, rather than
            # synchronising with the device for every element.
            pred = torch.argmax(logits, dim=1).tolist()

            for is_duplicate in pred:
                f = pair_ids[pos, 0]
                s = pair_ids[pos, 1]
                pos += 1
                utils.e(f, s, nodes, classes)
                if is_duplicate and nodes[f] != nodes[s]:
                    utils.class_join(f, s, nodes, classes)

    end = time.time()
    epoch_time = utils.format_time(end-start)

    print("time:", epoch_time)

    return nodes, classes

In [11]:
# Full inference pass over the training pairs; the recorded run took about 13 minutes.
nodes, classes = execute1()

time: 0:12:58


In [12]:
# Each key of `classes` is one group of questions produced by execute1().
print("There are", len(classes), "different question classes")

There are 439365 different question classes


In [13]:
# Histogram of class sizes: sizes[k] = number of classes holding k questions.
sizes = {}
for members in classes.values():
    n_members = len(members)
    sizes[n_members] = sizes.get(n_members, 0) + 1

# Every class must have been counted exactly once.
assert sum(sizes.values()) == len(classes)

In [14]:
# One "size: count" line per class size, smallest size first.
for size, n_of_that_size in sorted(sizes.items()):
    print(size, ": ", n_of_that_size, sep="")

1: 375625
2: 50414
3: 7743
4: 2434
5: 1142
6: 603
7: 338
8: 234
9: 182
10: 106
11: 83
12: 63
13: 51
14: 43
15: 40
16: 24
17: 31
18: 15
19: 16
20: 9
21: 9
22: 16
23: 13
24: 5
25: 7
26: 7
27: 6
28: 9
29: 6
30: 7
31: 1
32: 4
33: 3
34: 3
35: 4
36: 3
37: 3
38: 3
39: 2
40: 2
41: 4
42: 3
43: 4
44: 1
45: 2
46: 1
47: 2
49: 1
50: 2
51: 1
52: 1
53: 1
56: 1
57: 2
58: 2
61: 1
63: 1
64: 4
65: 2
66: 1
70: 2
75: 1
76: 4
77: 1
82: 1
84: 2
97: 1
108: 1
110: 1
129: 2
145: 1
219: 1
260: 1


In [15]:
# Question texts; later cells look them up by row position.
SENTENCES = "/kaggle/input/quora-data/sentences.csv" if kaggle else "data/sentences.csv"
sentences = pd.read_csv(SENTENCES)

In [16]:
N_sizes = 10     # number of consecutive class sizes sampled, starting at size 3
N_classes = 5    # classes kept for each of those sizes
count = [N_classes]*N_sizes   # count[k]: classes of size k+3 still to pick
sel_classes = {}

# Loop-invariant, so computed once instead of twice per class.
largest_size = max(sizes)

max_cl = None
for cl, cl_nodes in classes.items():
    size = len(cl_nodes)
    if size == largest_size:
        # We are curious to study the largest class that exists; it is kept
        # without consuming any of the per-size quota.
        max_cl = cl
        sel_classes[cl] = cl_nodes
    elif 3 <= size < N_sizes + 3 and count[size-3] > 0:
        count[size-3] -= 1
        sel_classes[cl] = cl_nodes

    # Stop once every quota is filled and the largest class has been found.
    # `is not None` keeps this correct even for a class whose id is 0.
    if sum(count) == 0 and max_cl is not None:
        break

sel_classes

{25: [25, 26, 114035],
 27: [27, 28, 50277],
 41: [41, 42, 209447],
 50: [50, 233769, 233961, 357323],
 53: [53, 441812, 522141],
 63: [63, 64, 43889],
 99: [99, 100, 105314, 102419, 157960, 462304, 11667, 125444],
 107: [107, 108, 652, 208937, 128657, 128658, 248362, 78036],
 116: [116, 180358, 116185, 116186, 145706, 180988, 338518],
 125: [125,
  126,
  90436,
  90437,
  101336,
  163544,
  180911,
  142034,
  7417,
  7418,
  82067],
 130: [130, 141412, 141413, 169346, 198214],
 135: [135, 136, 10657, 38442, 38443, 52814, 101659, 8378, 8379, 12753, 12754],
 143: [143, 144, 219989, 462802],
 187: [187, 188, 68732, 68733, 144864, 202157, 364823, 434871, 394711, 394712],
 227: [227, 228, 112374, 46820, 46821, 232768],
 251: [251,
  252,
  33925,
  145833,
  114241,
  114242,
  82212,
  146944,
  272834,
  37817,
  37818,
  447995],
 255: [255, 256, 8031, 8032],
 268: [268, 274051, 274052, 430444],
 313: [313, 314, 71619, 71620, 355262],
 337: [337, 338, 133940, 133941, 233395, 320961],

We look at the sentences in the class with the most questions.

In [17]:
# Text (first column of `sentences`) of every question in the largest class.
largest_class_questions = sentences.iloc[classes[max_cl], 0]
for question in largest_class_questions:
    print(question)

How can I gain healthy weight and mass?
What kind of animal did this?
What is the best way to make a responsive grid of "diamond shapes"?
How can you speak and learn fluent English like Karan Johar?
Why does my DIRECTV guide read "to be announced"?
Doing excessive masturbation is a cause of less weight. How do I gain weight naturally?
What is the property tax rate in Granville, Ohio? How is it compared to the one of Washington?
In graphic cards which matters more, the number of CUDA cores or the speed of the graphic card?
What would happen if you cut out refined sugar entirely?
How can we tell if the blue I see is the same blue you see?
What are some of the best ways to gain weight in a healthy way?
What book can you recommend me, that gives an electrical engineer an introduction to financial & business world?
Is The Carbonaro Effect staged?
What is the advantage of being a woman in computer science?
I just read that Indian developers can sell their Android apps but I'm unable to chang

It contains a lot of seemingly "random" sentences, but in fact many of its questions are about weight (gaining or losing it).

We build the DataFrame that will be processed into the input of the model. We additionally store the ids of the questions, in the same row order, as we will need them later on.

In [18]:
# Build every unordered pair of questions inside each selected class.
# Rows are collected in plain lists and turned into DataFrames once; growing a
# DataFrame cell by cell with .loc is far slower because each enlargement
# reallocates the frame.
question_text = sentences.iloc[:, 0].to_numpy()
pair_rows = []
id_rows = []
for cl, cl_nodes in sel_classes.items():
    for i in range(len(cl_nodes)-1):
        for j in range(i+1, len(cl_nodes)):
            first, second = cl_nodes[i], cl_nodes[j]
            pair_rows.append((question_text[first], question_text[second]))
            id_rows.append((first, second))

questions_ab = pd.DataFrame(pair_rows, columns = ["question1", "question2"])
# NOTE(review): this rebinds the module-level `ids` that execute1() reads, so
# execute1() must not be re-run after this cell without rebuilding `ids`.
# The id columns now get an integer dtype instead of object.
ids = pd.DataFrame(id_rows, columns = ["id1", "id2"])
ind = len(pair_rows)

# Every pair is labelled as a duplicate: both questions ended up in the same class.
questions_ab["is_duplicate"] = [1] * ind

In [19]:
# Turn the question pairs into model input using the same tokenizer settings
# and max_len as above. out_file = None is passed, presumably so that nothing
# is written to disk -- TODO confirm in input_net.create_input.
data = input_net.create_input(questions_ab, tokenizer_class, tokenizer_model, max_len, out_file = None)

Dataset length: 35095
Pretrained model used: bert-base-uncased
Maximum sequence length: 70


### Evaluation

In [22]:
# Plain array view of the pair table: every column but the last is handed to
# utils.ToTensor as model input, the last column as the label.
data = np.array(data)
X_i, X_s, X_p, y = utils.ToTensor(data[:, :-1], data[:, -1])
dataset = TensorDataset(X_i, X_s, X_p, y)

# Seed every random number generator in use with the same value.
seed_val = 47
for seed_everything in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_everything(seed_val)

In [23]:
# Rebind the dataloader to the within-class pairs; batches are pulled
# sequentially with the batch size defined earlier in the notebook.
execution_dataloader = DataLoader(dataset, batch_size=batch_size, sampler=SequentialSampler(dataset))

print("Number of execution batches:", len(execution_dataloader))

Number of execution batches: 138


In [34]:
def execute2():
    '''
    Predict all the combinations of sequences held by `execution_dataloader`.

    The returned value is the percentage of pairs predicted as class 1
    (duplicate). In this notebook every pair fed to this pass is labelled 1,
    which is why that share is reported as the accuracy.

    Reads the module-level `model`, `device` and `execution_dataloader`.

    - output    acc: accuracy value, as a percentage (nan when the
                     dataloader yields no pairs)
    '''
    n_predicted_duplicates = 0
    n_pairs = 0
    model.eval()

    start = time.time()
    with torch.no_grad():
        for batch in execution_dataloader:

            # Without labels, the first output element holds the logits.
            logits = model(input_ids=batch[0].to(device), token_type_ids=batch[1].to(device), attention_mask=batch[2].to(device))[0] # using bert
            # Softmax is monotonic, so argmax over the raw logits gives the same class.
            pred = torch.argmax(logits, dim=1)

            # Tensor-side reduction: one device sync per batch rather than a
            # Python-level sum over individual elements.
            n_predicted_duplicates += pred.sum().item()
            n_pairs += pred.numel()

    # Divide by the number of pairs actually scored, not by an unrelated global.
    acc = n_predicted_duplicates/n_pairs*100 if n_pairs else float("nan")
    end = time.time()
    print("time:", utils.format_time(end-start))

    return acc

In [38]:
# execute2() already returns a percentage, hence the literal "%" suffix.
print("Accuracy is {:.2f}".format(execute2()),"%", sep="")

time: 0:01:06
Accuracy is 2.70%


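Accuracy is extremely low: only 2.70% of the question pairs taken from within the selected classes are themselves predicted as duplicates. Being placed in the same class therefore does not mean that two questions are predicted as duplicates of each other; note that most of these pairs come from the single largest class, which contains many unrelated sentences.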