# GLUE

2021-03-18 Seongtae Kim

In [2]:
from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer, BertModel

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
print(tokenizer.tokenize("Tokenization Test"))
print(model.config)

100%|██████████| 407873900/407873900 [02:05<00:00, 3245945.98B/s]
100%|██████████| 231508/231508 [00:00<00:00, 396685.59B/s]

['token', '##ization', 'test']
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}






# Preprocessing GLUE Dataset

In [1]:
import pandas as pd
import os
import re

def preprocessor(input_path, output_path, tp="train", task_name = "sts"):
    
    data = [l.split("\t") for l in open(input_path).readlines()]
    col = data[0]
    data = data[1:]

    sheet = pd.DataFrame(train, columns=col)

    
    sent1 = list(sheet["sentence1"])
    
    try:
        sent2 = list(sheet["sentence2"])
    except KeyError:
        sent2 = None # single sentence task
    score = list(sheet["score\n"])

    d = []
    if sent2 is not None:
        for s1, s2, sc in zip(sent1, sent2, score):
            d.append([s1, s2, float(re.sub("\n", "", sc))])
    else:
        for s1, sc in zip(sent1, score):
            d.append([s1, float(re.sub("\n", "", sc))])
        
    pd.DataFrame(d).to_csv("./preprocessed/{}_{}.tsv".format(task_name, tp), sep="\t")

In [31]:
from utils import inspect_filesystem_from

In [65]:
files = inspect_filesystem_from("./data/", file_only=True, extension=["tsv"])
files = [f for f in files if "/test.tsv" in f or "/train.tsv" in f or "/dev.tsv" in f]

In [66]:
dic = {}
for file in files:
    spans = [f.span() for f in re.finditer("/", file)]
    _, e = spans[1]
    b, _ = spans[2]
    task = file[e:b]
    
    if task in dic:
        dic[task].append(file)
    else:
        dic[task] = [file]


In [67]:
dic

{'SST2': ['./data/SST2/test.tsv',
  './data/SST2/train.tsv',
  './data/SST2/dev.tsv'],
 'CoLA': ['./data/CoLA/dev.tsv',
  './data/CoLA/train.tsv',
  './data/CoLA/test.tsv'],
 'WNLI': ['./data/WNLI/dev.tsv',
  './data/WNLI/test.tsv',
  './data/WNLI/train.tsv'],
 'STSB': ['./data/STSB/test.tsv',
  './data/STSB/train.tsv',
  './data/STSB/dev.tsv'],
 'QQP': ['./data/QQP/dev.tsv', './data/QQP/test.tsv', './data/QQP/train.tsv'],
 'RTE': ['./data/RTE/train.tsv', './data/RTE/test.tsv', './data/RTE/dev.tsv'],
 'QNLI': ['./data/QNLI/dev.tsv',
  './data/QNLI/test.tsv',
  './data/QNLI/train.tsv'],
 'MNLI': ['./data/MNLI/train.tsv'],
 'MRPC': ['./data/MRPC/test.tsv']}

In [3]:
import re

def parse_tsv(path, col_nos=[]):
    if col_nos == []:
        return [re.split("\t", re.sub("\n", "", l)) for l in open(path).readlines()]
    else:
        data = [re.split("\t", re.sub("\n", "", l)) for l in open(path).readlines()]
        
        output = []
        for l in data:
            s=[]
            for c in col_nos:
                s.append(l[c])
            output.append(s)
        return output
        

# Preprocessing CoLA

In [74]:
dic["CoLA"]

['./data/CoLA/dev.tsv', './data/CoLA/train.tsv', './data/CoLA/test.tsv']

In [77]:
cola={}
for path in dic["CoLA"]:
    try:
        cola[path] = parse_tsv(path, [1, 3])
    except:
        cola[path] = parse_tsv(path)

In [81]:
cola['./data/CoLA/train.tsv'] = [[l[1], l[0]] for l in cola['./data/CoLA/train.tsv']]
cola['./data/CoLA/dev.tsv'] = [[l[1], l[0]] for l in cola['./data/CoLA/dev.tsv']]
cola['./data/CoLA/test.tsv'] = [l[1] for l in cola['./data/CoLA/test.tsv']]

In [86]:
pd.DataFrame(cola["./data/CoLA/train.tsv"]).to_csv("./preprocessed/CoLA_train.tsv", sep="\t")
pd.DataFrame(cola["./data/CoLA/dev.tsv"]).to_csv("./preprocessed/CoLA_dev.tsv", sep="\t")
pd.DataFrame(cola["./data/CoLA/test.tsv"][1:]).to_csv("./preprocessed/CoLA_test.tsv", sep="\t")

# Preprocessing SST2

In [101]:
dic["SST2"]

['./data/SST2/test.tsv', './data/SST2/train.tsv', './data/SST2/dev.tsv']

In [103]:
sst2={}
for path in dic["SST2"]:
    try:
        sst2[path] = parse_tsv(path, [1, 3])
    except:
        sst2[path] = parse_tsv(path)

In [104]:
sst2["./data/SST2/test.tsv"]

[['index', 'sentence'],
 ['0', 'uneasy mishmash of styles and genres .'],
 ['1',
  "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation ."],
 ['2',
  'by the end of no such thing the audience , like beatrice , has a watchful affection for the monster .'],
 ['3', 'director rob marshall went out gunning to make a great one .'],
 ['4',
  'lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new .'],
 ['5',
  'a well-made and often lovely depiction of the mysteries of friendship .'],
 ['6',
  "none of this violates the letter of behan 's book , but missing is its spirit , its ribald , full-throated humor ."],
 ['7',
  "although it bangs a very cliched drum at times , this crowd-pleaser 's fresh dialogue , energetic music , and good-natured spunk are often infectious ."],
 ['8',
  'it is not a mass-market entertainment but an uncompromisin

In [105]:
sst2['./data/SST2/train.tsv'] = [[l[1], l[0]] for l in sst2['./data/SST2/train.tsv']]
sst2['./data/SST2/dev.tsv'] = [[l[1], l[0]] for l in sst2['./data/SST2/dev.tsv']]
sst2['./data/SST2/test.tsv'] = [l[1] for l in sst2['./data/SST2/test.tsv']]

pd.DataFrame(sst2["./data/SST2/train.tsv"][1:]).to_csv("./preprocessed/SST2_train.tsv", sep="\t")
pd.DataFrame(sst2["./data/SST2/dev.tsv"][1:]).to_csv("./preprocessed/SST2_dev.tsv", sep="\t")
pd.DataFrame(sst2["./data/SST2/test.tsv"][1:]).to_csv("./preprocessed/SST2_test.tsv", sep="\t")

# Preprocessing WNLI

In [106]:
dic["WNLI"]

['./data/WNLI/dev.tsv', './data/WNLI/test.tsv', './data/WNLI/train.tsv']

In [108]:
wnli={}
for path in dic["WNLI"]:
    try:
        wnli[path] = parse_tsv(path, [1, 2, 3])
    except:
        wnli[path] = parse_tsv(path)

In [115]:
wnli[dic["WNLI"][1]]

[['index', 'sentence1', 'sentence2'],
 ['0',
  'Maude and Dora had seen the trains rushing across the prairie, with long, rolling puffs of black smoke streaming back from the engine. Their roars and their wild, clear whistles could be heard from far away. Horses ran away when they came in sight.',
  'Horses ran away when Maude and Dora came in sight.'],
 ['1',
  'Maude and Dora had seen the trains rushing across the prairie, with long, rolling puffs of black smoke streaming back from the engine. Their roars and their wild, clear whistles could be heard from far away. Horses ran away when they came in sight.',
  'Horses ran away when the trains came in sight.'],
 ['2',
  'Maude and Dora had seen the trains rushing across the prairie, with long, rolling puffs of black smoke streaming back from the engine. Their roars and their wild, clear whistles could be heard from far away. Horses ran away when they came in sight.',
  'Horses ran away when the puffs came in sight.'],
 ['3',
  'Maude a

In [116]:
wnli['./data/WNLI/test.tsv'] = [[l[1], l[2]] for l in wnli['./data/WNLI/test.tsv']]

pd.DataFrame(wnli["./data/WNLI/train.tsv"][1:]).to_csv("./preprocessed/WNLI_train.tsv", sep="\t")
pd.DataFrame(wnli["./data/WNLI/dev.tsv"][1:]).to_csv("./preprocessed/WNLI_dev.tsv", sep="\t")
pd.DataFrame(wnli["./data/WNLI/test.tsv"][1:]).to_csv("./preprocessed/WNLI_test.tsv", sep="\t")

# Preprocessing STSB

In [117]:
dic["STSB"]

['./data/STSB/test.tsv', './data/STSB/train.tsv', './data/STSB/dev.tsv']

In [120]:
stsb={}
for path in dic["STSB"]:
    try:
        stsb[path] = parse_tsv(path, [7, 8, 9])
    except:
        stsb[path] = parse_tsv(path, [7, 8])

In [122]:
stsb[dic["STSB"][1]]

[['sentence1', 'sentence2', 'score'],
 ['A plane is taking off.', 'An air plane is taking off.', '5.000'],
 ['A man is playing a large flute.', 'A man is playing a flute.', '3.800'],
 ['A man is spreading shreded cheese on a pizza.',
  'A man is spreading shredded cheese on an uncooked pizza.',
  '3.800'],
 ['Three men are playing chess.', 'Two men are playing chess.', '2.600'],
 ['A man is playing the cello.',
  'A man seated is playing the cello.',
  '4.250'],
 ['Some men are fighting.', 'Two men are fighting.', '4.250'],
 ['A man is smoking.', 'A man is skating.', '0.500'],
 ['The man is playing the piano.', 'The man is playing the guitar.', '1.600'],
 ['A man is playing on a guitar and singing.',
  'A woman is playing an acoustic guitar and singing.',
  '2.200'],
 ['A person is throwing a cat on to the ceiling.',
  'A person throws a cat on the ceiling.',
  '5.000'],
 ['The man hit the other man with a stick.',
  'The man spanked the other man with a stick.',
  '4.200'],
 ['A woman

In [124]:
stsb['./data/STSB/train.tsv'] = [[l[0], l[1], float(l[2])] for l in stsb['./data/STSB/train.tsv'][1:]]
stsb['./data/STSB/dev.tsv'] = [[l[0], l[1], float(l[2])] for l in stsb['./data/STSB/dev.tsv'][1:]]

pd.DataFrame(stsb["./data/STSB/train.tsv"][1:]).to_csv("./preprocessed/STSB_train.tsv", sep="\t")
pd.DataFrame(stsb["./data/STSB/dev.tsv"][1:]).to_csv("./preprocessed/STSB_dev.tsv", sep="\t")
pd.DataFrame(stsb["./data/STSB/test.tsv"][1:]).to_csv("./preprocessed/STSB_test.tsv", sep="\t")

# Preprocessing QQP

In [125]:
dic["QQP"]

['./data/QQP/dev.tsv', './data/QQP/test.tsv', './data/QQP/train.tsv']

In [135]:
qqp={}
for path in dic["QQP"]:
    print(path)
    try:
        qqp[path] = parse_tsv(path, [3, 4, 5])[1:]
    except:
        qqp[path] = parse_tsv(path, [1, 2])[1:]

./data/QQP/dev.tsv
./data/QQP/test.tsv
./data/QQP/train.tsv


In [141]:
qqp[dic["QQP"][2]]

[['How is the life of a math student? Could you describe your own experiences?',
  'Which level of prepration is enough for the exam jlpt5?',
  '0'],
 ['How do I control my horny emotions?',
  'How do you control your horniness?',
  '1'],
 ['What causes stool color to change to yellow?',
  'What can cause stool to come out as little balls?',
  '0'],
 ['What can one do after MBBS?', 'What do i do after my MBBS ?', '1'],
 ['Where can I find a power outlet for my laptop at Melbourne Airport?',
  'Would a second airport in Sydney, Australia be needed if a high-speed rail link was created between Melbourne and Sydney?',
  '0'],
 ["How not to feel guilty since I am Muslim and I'm conscious we won't have sex together?",
  "I don't beleive I am bulimic, but I force throw up atleast once a day after I eat something and feel guilty. Should I tell somebody, and if so who?",
  '0'],
 ['How is air traffic controlled?',
  'How do you become an air traffic controller?',
  '0'],
 ['What is the best se

In [142]:
pd.DataFrame(qqp["./data/QQP/train.tsv"][1:]).to_csv("./preprocessed/QQP_train.tsv", sep="\t")
pd.DataFrame(qqp["./data/QQP/dev.tsv"][1:]).to_csv("./preprocessed/QQP_dev.tsv", sep="\t")
pd.DataFrame(qqp["./data/QQP/test.tsv"][1:]).to_csv("./preprocessed/QQP_test.tsv", sep="\t")

# Preprocessing RTE

In [143]:
dic["RTE"]

['./data/RTE/train.tsv', './data/RTE/test.tsv', './data/RTE/dev.tsv']

In [152]:
print(dic["RTE"][2])
parse_tsv(dic["RTE"][2])

./data/RTE/dev.tsv


[['index', 'sentence1', 'sentence2', 'label'],
 ['0',
  'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.',
  'Christopher Reeve had an accident.',
  'not_entailment'],
 ['1',
  'Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations.',
  'Bacteria is winning the war against antibiotics.',
  'entailment'],
 ['2',
  'Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the 

In [153]:
rte={}
for path in dic["RTE"]:
    print(path)
    try:
        rte[path] = parse_tsv(path, [1, 2, 3])[1:]
    except:
        rte[path] = parse_tsv(path, [1, 2])[1:]

./data/RTE/train.tsv
./data/RTE/test.tsv
./data/RTE/dev.tsv


In [156]:
rte['./data/RTE/train.tsv'] = [[l[0], l[1], 0] if l[2] == "not_entailment" else [l[0], l[1], 1] for l in rte['./data/RTE/train.tsv'][1:]]
rte['./data/RTE/dev.tsv'] = [[l[0], l[1], 0] if l[2] == "not_entailment" else [l[0], l[1], 1] for l in rte['./data/RTE/dev.tsv'][1:]]

pd.DataFrame(rte["./data/RTE/train.tsv"][1:]).to_csv("./preprocessed/RTE_train.tsv", sep="\t")
pd.DataFrame(rte["./data/RTE/dev.tsv"][1:]).to_csv("./preprocessed/RTE_dev.tsv", sep="\t")
pd.DataFrame(rte["./data/RTE/test.tsv"][1:]).to_csv("./preprocessed/RTE_test.tsv", sep="\t")

# Preprocessing QNLI

In [157]:
dic["QNLI"]

['./data/QNLI/dev.tsv', './data/QNLI/test.tsv', './data/QNLI/train.tsv']

In [158]:
print(dic["QNLI"][0])
parse_tsv(dic["QNLI"][0])

./data/QNLI/dev.tsv


[['index', 'question', 'sentence', 'label'],
 ['0',
  'What came into force after the new constitution was herald?',
  'As of that day, the new constitution heralding the Second Republic came into force.',
  'entailment'],
 ['1',
  'What is the first major city in the stream of the Rhine?',
  'The most important tributaries in this area are the Ill below of Strasbourg, the Neckar in Mannheim and the Main across from Mainz.',
  'not_entailment'],
 ['2',
  'What is the minimum required if you want to teach in Canada?',
  "In most provinces a second Bachelor's Degree such as a Bachelor of Education is required to become a qualified teacher.",
  'not_entailment'],
 ['3',
  "How was Temüjin kept imprisoned by the Tayichi'ud?",
  "The Tayichi'ud enslaved Temüjin (reportedly with a cangue, a sort of portable stocks), but with the help of a sympathetic guard, the father of Chilaun (who later became a general of Genghis Khan), he was able to escape from the ger (yurt) in the middle of the night

In [159]:
qnli={}
for path in dic["QNLI"]:
    print(path)
    try:
        qnli[path] = parse_tsv(path, [1, 2, 3])[1:]
    except:
        qnli[path] = parse_tsv(path, [1, 2])[1:]

./data/QNLI/dev.tsv
./data/QNLI/test.tsv
./data/QNLI/train.tsv


In [160]:
qnli['./data/QNLI/train.tsv'] = [[l[0], l[1], 0] if l[2] == "not_entailment" else [l[0], l[1], 1] for l in qnli['./data/QNLI/train.tsv'][1:]]
qnli['./data/QNLI/dev.tsv'] = [[l[0], l[1], 0] if l[2] == "not_entailment" else [l[0], l[1], 1] for l in qnli['./data/QNLI/dev.tsv'][1:]]

pd.DataFrame(qnli["./data/QNLI/train.tsv"][1:]).to_csv("./preprocessed/QNLI_train.tsv", sep="\t")
pd.DataFrame(qnli["./data/QNLI/dev.tsv"][1:]).to_csv("./preprocessed/QNLI_dev.tsv", sep="\t")
pd.DataFrame(qnli["./data/QNLI/test.tsv"][1:]).to_csv("./preprocessed/QNLI_test.tsv", sep="\t")

# Preprocessing MNLI

In [161]:
dic["MNLI"]

['./data/MNLI/train.tsv']

In [162]:
print(dic["MNLI"][0])
parse_tsv(dic["MNLI"][0])

./data/MNLI/train.tsv


[['index',
  'promptID',
  'pairID',
  'genre',
  'sentence1_binary_parse',
  'sentence2_binary_parse',
  'sentence1_parse',
  'sentence2_parse',
  'sentence1',
  'sentence2',
  'label1',
  'gold_label'],
 ['0',
  '31193',
  '31193n',
  'government',
  '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )',
  '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )',
  '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))',
  '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))',
  'Conceptually cream skimming has two basic dimensions - product and geography.',
  'Product and geography are what make cream skimming work. ',
  'neutral',
  'n

In [None]:
mnli={}
for path in dic["MNLI"]:
    print(path)
    try:
        mnli[path] = parse_tsv(path, [1, 2, 3])[1:]
    except:
        mnli[path] = parse_tsv(path, [1, 2])[1:]

# Preprocessing MRPC

In [163]:
dic["MRPC"]

['./data/MRPC/test.tsv']

In [None]:
mrpc={}
for path in dic["MNLI"]:
    print(path)
    try:
        mnli[path] = parse_tsv(path, [1, 2, 3])[1:]
    except:
        mnli[path] = parse_tsv(path, [1, 2])[1:]

# CoLA with MCC

In [5]:
import re

def parse_tsv(path, col_nos=[]):
    if col_nos == []:
        return [re.split("\t", re.sub("\n", "", l)) for l in open(path).readlines()]
    else:
        data = [re.split("\t", re.sub("\n", "", l)) for l in open(path).readlines()]
        
        output = []
        for l in data:
            s=[]
            for c in col_nos:
                s.append(l[c])
            output.append(s)
        return output

In [2]:
#train_dataset = parse_tsv("./preprocessed/CoLA_train.tsv", col_nos=[1,2])[1:]
#test_dataset = parse_tsv("./preprocessed/CoLA_dev.tsv", col_nos=[1,2])[1:]
train_dataset = parse_tsv("../Datasets/Modu/국립국어원 문법성 판단 말뭉치(버전 1.0)/NIKL_CoLA_in_domain_train.tsv", col_nos=[3,1])[1:]
test_dataset = parse_tsv("../Datasets/Modu/국립국어원 문법성 판단 말뭉치(버전 1.0)/NIKL_CoLA_in_domain_dev.tsv", col_nos=[3,1])[1:]

In [8]:
train_dataset = parse_tsv("../Datasets/Modu/국립국어원 문법성 판단 말뭉치(버전 1.0)/NIKL_CoLA_in_domain_train.tsv", col_nos=[3,1])[1:]
train_dataset[:100]

[['높은 달이 떴다.', '1'],
 ['달이 뜸이 높았다.', '0'],
 ['실없는 사람이 까불까불한다.', '1'],
 ['나는 철수에게 공을 던졌다.', '1'],
 ['내가 순이와 둘이서 다툰다.', '1'],
 ['내가 순이와 우리가 다툰다.', '0'],
 ['나는 부지런히 뛰었다.', '1'],
 ['나는 부지런히 뛰어졌다.', '0'],
 ['사랑이 죄는 아니다.', '1'],
 ['죄는 사랑이 아니다.', '0'],
 ['철수는 영미를 가면적으로 미워하였다.', '1'],
 ['철수에게는 영미가 가면적으로 미웠다.', '0'],
 ['그가 그녀에게 감기를 옮겼다.', '1'],
 ['그녀가 그에 의해서 감기를 옮겨졌다.', '0'],
 ['어린이가 많이 살았다.', '1'],
 ['어린이가 살음이 많았다.', '0'],
 ['영희가 나에게 좋다.', '1'],
 ['영희가 나에게 좋아를 한다.', '0'],
 ['고양이가 꽃밭을 파헤쳐 놓았다.', '1'],
 ['고양이가 꽃밭을 파헤쳐 두었다.', '0'],
 ['영수가 밥을 먹고 나서 철수가 밥을 먹겠다.', '1'],
 ['영수가 밥을 먹었고서 철수가 밥을 먹겠다.', '0'],
 ['우리는 열심히 공부해야 한다.', '1'],
 ['우리는 열심히 공부해야 하자.', '0'],
 ['나는 "이젠 살았구나"하고 굴 밖으로 나왔다.', '1'],
 ['나는 "이젠 살았구나"라고 굴 밖으로 나왔다.', '0'],
 ['영호가 더워한다.', '1'],
 ['방이 더워한다.', '0'],
 ['철수는 뱀이 징그럽다고 말했다.', '1'],
 ['철수가 나는 뱀이 징그럽다고 말했다.', '0'],
 ['체온이 싸늘하니 식어 있다.', '1'],
 ['체온이 식음이 싸늘했다.', '0'],
 ['나는 그 말이 모순이 된다고 생각한다.', '1'],
 ['나는 모순을 그 말이 된다고 생각한다.', '0'],
 ['복희가 결혼했다고 소문이 떠돈다.', '1'],
 ['복희가 결혼했다고 소문이 있다.',

In [3]:
test_dataset

[['실없는 사람이 까불한다.', '0'],
 ['사람은 언제나 젊는 수는 없다.', '0'],
 ['그가 팔을 곧게 뻗는다.', '1'],
 ['철수가 자.', '1'],
 ['마음이 든든을 한데 개운치는 못하다.', '0'],
 ['경찰은 나를 살인범으로 취급했다.', '1'],
 ['영수가 철수한테 같이하였다.', '0'],
 ['버스와 화물차가 막 부딪쳤다.', '1'],
 ['순이가 노래는 부르고는 춤은 추지 않는다.', '1'],
 ['창문 밖으로 빨리 내다 보아라.', '1'],
 ['철수는 모자를 쓰고 간다.', '1'],
 ['그 건물은 왼쪽 벽을 많은 유리창을 하였다.', '0'],
 ['나는 그의 결정이 옳았다고 하는 견해를 가지고 있다.', '1'],
 ['철수는 종소리가 더 잘 들리더라.', '0'],
 ['네가 더워하느냐?', '0'],
 ['나에게는 먼 산이 보이고 있다.', '0'],
 ['그 여자는 절대로 오지 않았다.', '1'],
 ['그 여자는 절대로 왔다.', '0'],
 ['그가 살아 있다는 소문이 나아 돌았다.', '0'],
 ['철수가 우체국에 가서 편지를 부쳤다.', '1'],
 ['아이가 길에 서고 운다.', '0'],
 ['날씨가 추워진다.', '1'],
 ['어머니는 석 달된 아이에게 옷을 입게 했다.', '0'],
 ['책은 내가 읽었는데, 잃어버렸다.', '0'],
 ['나만 안 가고 다른 사람도 갔다.', '0'],
 ['질적인 문제를 고려할 수를 없다.', '0'],
 ['철수는 학교에 갔겠다.', '1'],
 ['네가 손이 크더라.', '1'],
 ['젊은 친구, 그런 염세적인 말은 집어치게.', '1'],
 ['극장은 매일 관객으로 가득 찼다.', '1'],
 ['팔다리가 쑤시는 것을 보니 비가 오겠다.', '1'],
 ['철수는 아직도 그 이야기를 믿고 있다.', '1'],
 ['선생은 가르쳤고, 학생은 배웠다.', '0'],
 ['그는 서울에 이르렀다.', '1'],
 ['밥도 좋고 빵도 좋고 다

In [4]:
from torch.utils.data import Dataset, DataLoader
import torch

class Dataset_for_CoLA(Dataset):
    def __init__(self, dataset, tokenizer, test=False, seq_len=512):
        self.dataset = dataset
        self.line_num = len(self.dataset)
        self.seq_len = seq_len
        self.test=test
        
        self.tokenizer = tokenizer
        self.vocab_len = len(tokenizer.vocab)
        self.mask = self.tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
        self.cls = tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
        self.sep = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
        self.pad = tokenizer.convert_tokens_to_ids(["[PAD]"])[0]
        
    def __len__(self):
        return self.line_num
    
    def __getitem__(self, item):
        if not self.test:
            sent, lbl = self.dataset[item]
        else:
            sent = self.dataset[item][0]
            
        input_ids = [self.cls] + self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(sent)) + [self.sep]

        pad_len = self.seq_len - len(input_ids)
        
        if pad_len >= 0:
            attention_mask = [1] * len(input_ids) + [0] * pad_len
            input_ids = input_ids + [self.pad] * pad_len
            
        else:
            input_ids = input_ids[:self.seq_len]
            attention_mask = [1] * len(input_ids)
        
        try:
            assert len(input_ids) == len(attention_mask)
        except AssertionError:
            print(item)
            print(len(input_ids))
            print(len(attention_mask))
            raise
            
        if not self.test:
            output = {"input_ids": input_ids,
                      "attention_masks": attention_mask,
                      "CoLA_label": int(lbl)
                     }
        else:
            output = {"input_ids": input_ids,
                      "attention_masks": attention_mask
                     }
            
        return {key: torch.tensor(value, dtype=torch.long) for key, value in output.items()}

In [5]:
from pytorch_pretrained_bert import BertTokenizer
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("../KUMC/KRBERT/vocabs/vocab_snu_char16424.txt")

In [6]:
train_dataloader = DataLoader(Dataset_for_CoLA(train_dataset, tokenizer, seq_len=30), batch_size= 20)
test_dataloader = DataLoader(Dataset_for_CoLA(test_dataset, tokenizer, seq_len=30), batch_size= 1)

In [7]:
from pytorch_pretrained_bert import BertModel
#model = BertModel.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("../KUMC/KRBERT/krbert_pytorch/pretrained/model_ranked_krbert/")

In [8]:
# from pytorch-bert

class ScheduledOptim():
    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        return np.min([
            np.power(self.n_current_steps, -0.5),
            np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

    def _update_learning_rate(self):

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

from torch.optim import Adam

optim = Adam(model.parameters(), lr=float(1e-4), betas=(0.9, 0.999), weight_decay=0.01)
optim_schedule = ScheduledOptim(optim, 768, n_warmup_steps=10000)

In [9]:
import torch.nn as nn

class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()
        self.classifier = nn.Linear(768, 2)

    def forward(self, pooled_output):
        cls_score = self.classifier(pooled_output)
        return cls_score

In [10]:
from torch.nn import CrossEntropyLoss
from statistics import mean
from tqdm.notebook import tqdm

def mean(l):
    return sum(l) / len(l)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model.to(device)
bin_cls = BinaryClassifier().to(device)
loss_fct=CrossEntropyLoss()
epoch=1

losses=[]
predictions=[]
true_labels=[]

with tqdm(epoch, leave=False, bar_format="TRAIN: [{elapsed}=>{remaining}] {desc}") as tq:
    for e in range(epoch):
        tq.update()
        
        # Train
        for i, data in enumerate(train_dataloader):
            torch.cuda.empty_cache()
            data = {key: value.to(device) for key, value in data.items()}
            _, output = model(input_ids=data["input_ids"], attention_mask=data["attention_masks"],
                              output_all_encoded_layers=False)
            output = bin_cls(output)
            
            loss=loss_fct(output.view(-1, 2), data["CoLA_label"].view(-1))
            
            optim_schedule.zero_grad()
            loss.mean().backward()
            optim_schedule.step_and_update_lr()
            losses.append(loss.detach().cpu().numpy())
            
            output = output.detach().cpu().numpy()
            labels = data["CoLA_label"].to("cpu").numpy()

            predictions.append(output)
            true_labels.append(labels)
            
            tq.set_description_str("{:,}/{:,} | ave_loss:{}".format(i,
                                   len(train_dataloader), mean(losses)))  
            
            data = {key: value.to("cpu") for key, value in data.items()}
            del data
            torch.cuda.empty_cache()


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [15]:
train_dataloader = None

In [17]:
predictions=[]
true_labels=[]
losses_val=[]

with tqdm(test_dataloader, leave=False, bar_format="TEST: {bar}{percentage:2.2f}% [{elapsed}=>{remaining}] | {desc}") as tq:
    # Validation
    for i, data in enumerate(test_dataloader):
        tq.update()

        torch.cuda.empty_cache()
        data = {key: value.to(device) for key, value in data.items()}
        _, output = model(input_ids=data["input_ids"], attention_mask=data["attention_masks"],
                          output_all_encoded_layers=False)

        output = bin_cls(output)
        output = output.detach().cpu().numpy()
        labels = data["CoLA_label"].to("cpu").numpy()

        predictions.append(output)
        true_labels.append(labels)
        
        tq.set_description_str("{:,}/{:,}".format(i,
                               len(test_dataloader)))
        
        data = {key: value.to("cpu") for key, value in data.items()}
        del data
        torch.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, max=1060.0), HTML(value='')))

In [18]:
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/

from sklearn.metrics import matthews_corrcoef
mcc = matthews_corrcoef

matthews_set = []

for i in range(len(true_labels)):
    # The predictions for this batch are a 2-column ndarray (one column for "0" 
    # and one column for "1"). Pick the label with the highest value and turn this
    # in to a list of 0s and 1s.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()

    # Calculate and store the coef for this batch.  
    matthews = mcc(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [19]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('Total MCC: %.3f' % mcc)

Total MCC: -0.032


# STS-B FineTuning Test

In [5]:
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertForSequenceClassification


tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")
model = BertForSequenceClassification.from_pretrained("snunlp/KR-BERT-char16424")

Some weights of the model checkpoint at snunlp/KR-BERT-char16424 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

In [6]:
import pandas as pd
pd.read_excel("../KUMC/sts/MedSTS_Revised.xlsx")

Unnamed: 0,IDX,TYPE,YEAR,S1,S2,SIM
0,1,train,2018,Insulin NPH Human [NOVOLIN N] 100 unit/mL susp...,Insulin NPH Human [NOVOLIN N] 100 unit/mL susp...,3.50
1,2,train,2018,"Patient arrives ambulatory, Gait steady, Histo...","Complex assessment performed, Patient arrives ...",2.50
2,3,train,2018,"Peripheral IV site, established in the right f...","Peripheral IV site, present prior to arrival, ...",3.45
3,4,train,2018,No: new confusion or inability to stay alert a...,No: new confusion or inability to stay alert a...,4.00
4,5,train,2018,Spent 15 minutes with the patient and greater ...,"Nurse visit ten minutes, over half of which wa...",3.00
...,...,...,...,...,...,...
3117,3118,test,2019,The patient was instructed in how to safely do...,The patient tolerated the procedure well and w...,1.00
3118,3119,test,2019,Heart: Regular rate and rhythm S1-S2 without ...,"Heart: Regular rate and rhythm, no murmurs, r...",4.00
3119,3120,test,2019,The patient was transferred to the Title.,The plan was discussed with patient/family and...,1.00
3120,3121,test,2019,The bladder was drained and a weighted speculu...,The needle was then removed and a sterile dres...,0.00


In [12]:
data = [re.split("\t", re.sub("\n", "", l)) for l in open("./preprocessed/sts.tsv").readlines()][1:]



In [13]:
for i, s1, s2, lbl in data:
    print(i)
    print(s1)
    print(s2)
    print(lbl)
    break

0
A plane is taking off.
An air plane is taking off.
5.0


In [6]:
sts_val = [l.split("\t") for l in open("./data/STSB/dev.tsv").readlines()]
sts_col = sts_val[0]
sts_val = sts_val[1:]

sts_val = pd.DataFrame(sts_train, columns=sts_col)

sent1_val = list(sts_val["sentence1"])
sent2_val = list(sts_val["sentence2"])
score_val = list(sts_val["score\n"])

In [8]:
from torch.cuda import is_available
from tqdm.notebook import tqdm
from statistics import mean
import torch

device = "cuda:0" if is_available() else "cpu"
print(device)

seq_len = 50

cls = [tokenizer.vocab.get("[CLS]")]
sep = [tokenizer.vocab.get("[SEP]")]
pad = [tokenizer.vocab.get("[PAD]")]

batch=10

input_ids=[]
seq_labels=[]
attention_mask=[]
labels=[]
epoch=2
log_interval = 100
path="./models/"

model.to(device)
task='STS'

for e in range(epoch):
    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []
    
    mode='TRAIN'
    train_iter = tqdm(enumerate(zip(sent1, sent2, score)),
                    desc="Epoch_%s:%d" % (mode, e+1),
                    total=len(score),
                    bar_format="{l_bar}{r_bar}")
    # TRAINING
    for i, (s1, s2, lbl) in train_iter:
        
        s1 = tokenizer.tokenize(s1)
        s2 = tokenizer.tokenize(s2)

        s1 = cls + tokenizer.convert_tokens_to_ids(s1) + sep
        l1 = [0] * len(s1)
        s2 = tokenizer.convert_tokens_to_ids(s2) + sep
        l2 = [0] * len(s2)

        s = (s1 + s2)[:seq_len]
        l = (l1 + l2)[:seq_len]

        a = [1]*len(s) + pad*(seq_len-len(s))
        s = s + pad*(seq_len-len(s))
        l = l + pad*(seq_len-len(l))

        input_ids.append(s)
        seq_labels.append(l)
        attention_mask.append(a)
        labels.append(round(float(lbl)))

        if len(input_ids) == batch:
            input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
            seq_labels = torch.tensor(seq_labels, dtype=torch.long).to(device)
            attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(device)
            labels = torch.tensor(labels, dtype=torch.long).to(device)


            losses = model(input_ids=input_ids, attention_mask=attention_mask,
                  token_type_ids=seq_labels, labels=labels)
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                  token_type_ids=seq_labels)

            losses.sum().backward()

            maxs=[]
            for logit in logits.tolist():
                maxs.append(logit.index(max(logit)))
            
            acc = [1 for p, a in zip(maxs, labels) if p == a]
            acc = round(len(acc)/ len(labels) * 100, 2)
            loss = round(float(losses.sum()), 2)
            
            train_loss_list.append(loss)
            train_acc_list.append(acc)
            
            input_ids=[]
            seq_labels=[]
            attention_mask=[]
            labels=[]
            
            if i % log_interval == 0:
                log = "[{}] {}:: epoch: {} | iteration: {} / {} ({}%) | accuracy: {} | loss: {}".format(
                            task, mode, e+1, i, len(score), round(i/len(score)*100,2), acc, loss)
                open("./log.txt", "a").write(log)
                train_iter.write(log)

    
    mode='VALIDATION'
    val_iter = tqdm(enumerate(zip(sent1_val, sent2_val, score_val)),
                desc="Epoch_%s:%d" % (mode, e+1),
                total=len(score_val),
                bar_format="{l_bar}{r_bar}")
    # VALIDATION
    for i, (s1, s2, lbl) in val_iter:
        
        s1 = tokenizer.tokenize(s1)
        s2 = tokenizer.tokenize(s2)

        s1 = cls + tokenizer.convert_tokens_to_ids(s1) + sep
        l1 = [0] * len(s1)
        s2 = tokenizer.convert_tokens_to_ids(s2) + sep
        l2 = [0] * len(s2)

        s = (s1 + s2)[:seq_len]
        l = (l1 + l2)[:seq_len]

        a = [1]*len(s) + pad*(seq_len-len(s))
        s = s + pad*(seq_len-len(s))
        l = l + pad*(seq_len-len(l))

        input_ids.append(s)
        seq_labels.append(l)
        attention_mask.append(a)
        labels.append(round(float(lbl)))

        if len(input_ids) == batch:
            input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)
            seq_labels = torch.tensor(seq_labels, dtype=torch.long).to(device)
            attention_mask = torch.tensor(attention_mask, dtype=torch.long).to(device)
            labels = torch.tensor(labels, dtype=torch.long).to(device)


            losses = model(input_ids=input_ids, attention_mask=attention_mask,
                  token_type_ids=seq_labels, labels=labels)
            logits = model(input_ids=input_ids, attention_mask=attention_mask,
                  token_type_ids=seq_labels)

            maxs=[]
            for logit in logits.tolist():
                maxs.append(logit.index(max(logit)))
            
            acc = [1 for p, a in zip(maxs, labels) if p == a]
            acc = round(len(acc)/ len(labels) * 100, 2)
            loss = round(float(losses.sum()), 2)
            
            val_loss_list.append(loss)
            val_acc_list.append(acc)
                        
            input_ids=[]
            seq_labels=[]
            attention_mask=[]
            labels=[]
            
            if i % log_interval == 0:
                log = "[{}] {}:: epoch: {} | iteration: {} / {} ({}%) | accuracy: {} | loss: {}".format(
                            task, mode, e+1, i, len(score_val), round(i/len(score_val)*100,2), acc, loss)
                open("./log.txt", "a").write(log)
                val_iter.write(log)
                
        # SAVING MODEL
        name = task + "_bert_base_e"+str(e+1)
        torch.save(model, path + name+".model")
        open(path+name+".log", "w").write("Average loss: {} | Average accuracy: {}".format(
            str(round(mean(val_loss_list), 2)),
            str(round(mean(val_acc_list), 2))))

cuda:0


HBox(children=(HTML(value='Epoch_TRAIN:1'), FloatProgress(value=0.0, max=5749.0), HTML(value='')))




HBox(children=(HTML(value='Epoch_VALIDATION:1'), FloatProgress(value=0.0, max=5749.0), HTML(value='')))

[STS] VALIDATION:: epoch: 1 | iteration: 0 / 5749 (0.0%) | accuracy: 0.0 | loss: 1.95
[STS] VALIDATION:: epoch: 1 | iteration: 100 / 5749 (1.74%) | accuracy: 20.0 | loss: 1.9
[STS] VALIDATION:: epoch: 1 | iteration: 200 / 5749 (3.48%) | accuracy: 0.0 | loss: 2.06
[STS] VALIDATION:: epoch: 1 | iteration: 300 / 5749 (5.22%) | accuracy: 0.0 | loss: 1.96
[STS] VALIDATION:: epoch: 1 | iteration: 400 / 5749 (6.96%) | accuracy: 10.0 | loss: 1.92
[STS] VALIDATION:: epoch: 1 | iteration: 500 / 5749 (8.7%) | accuracy: 20.0 | loss: 1.93
[STS] VALIDATION:: epoch: 1 | iteration: 600 / 5749 (10.44%) | accuracy: 0.0 | loss: 1.91
[STS] VALIDATION:: epoch: 1 | iteration: 700 / 5749 (12.18%) | accuracy: 0.0 | loss: 2.11
[STS] VALIDATION:: epoch: 1 | iteration: 800 / 5749 (13.92%) | accuracy: 0.0 | loss: 1.98
[STS] VALIDATION:: epoch: 1 | iteration: 900 / 5749 (15.65%) | accuracy: 10.0 | loss: 1.87
[STS] VALIDATION:: epoch: 1 | iteration: 1000 / 5749 (17.39%) | accuracy: 20.0 | loss: 1.88
[STS] VALIDATIO

HBox(children=(HTML(value='Epoch_TRAIN:2'), FloatProgress(value=0.0, max=5749.0), HTML(value='')))




HBox(children=(HTML(value='Epoch_VALIDATION:2'), FloatProgress(value=0.0, max=5749.0), HTML(value='')))




StatisticsError: mean requires at least one data point

In [88]:
name = task + "_bert_base_e"+str(e+1)
torch.save(model, path + name+".model")
open(path+name+".log", "w").write("Average loss: {} | Average accuracy: {}".format(
    str(round(mean(val_loss_list), 2)),
    str(round(mean(val_acc_list), 2))))

37

In [None]:
import torch
model.forward(torch.tensor([[3, 20, 421, 22, 42, 11, 4], [3, 20, 421, 22, 42, 11, 4]], dtype=torch.long))

In [8]:
import re
scores = list(sts["score\n"])
scores = [float(re.sub("\n", "", s)) for s in scores]

In [9]:
scores

[5.0,
 3.8,
 3.8,
 2.6,
 4.25,
 4.25,
 0.5,
 1.6,
 2.2,
 5.0,
 4.2,
 4.6,
 3.867,
 4.667,
 1.667,
 3.75,
 5.0,
 0.5,
 3.8,
 5.0,
 3.2,
 2.8,
 4.6,
 3.0,
 5.0,
 4.8,
 5.0,
 4.2,
 4.2,
 4.0,
 4.0,
 4.909,
 3.0,
 2.4,
 4.2,
 3.4,
 5.0,
 3.75,
 2.75,
 5.0,
 4.0,
 3.6,
 1.6,
 1.75,
 5.0,
 1.0,
 1.0,
 2.375,
 3.8,
 3.2,
 3.2,
 4.4,
 3.75,
 4.75,
 3.2,
 1.556,
 3.938,
 5.0,
 5.0,
 4.0,
 1.6,
 4.75,
 3.5,
 1.4,
 1.4,
 4.0,
 5.0,
 3.833,
 0.6,
 2.917,
 4.2,
 2.0,
 2.6,
 1.6,
 2.0,
 4.2,
 2.0,
 4.8,
 4.4,
 5.0,
 3.0,
 4.25,
 4.25,
 3.8,
 2.4,
 1.6,
 2.0,
 1.6,
 4.0,
 2.2,
 4.4,
 3.6,
 3.6,
 0.5,
 0.8,
 0.6,
 2.6,
 2.0,
 2.2,
 2.4,
 3.6,
 2.2,
 4.8,
 1.643,
 1.75,
 2.25,
 4.0,
 4.8,
 3.2,
 4.0,
 4.4,
 4.6,
 3.8,
 4.8,
 4.857,
 5.0,
 2.533,
 1.0,
 1.0,
 2.0,
 0.143,
 2.0,
 1.6,
 1.6,
 3.4,
 4.0,
 4.8,
 2.5,
 1.75,
 1.0,
 5.0,
 1.4,
 4.0,
 3.8,
 4.0,
 4.0,
 4.8,
 0.6,
 4.75,
 2.2,
 3.0,
 0.0,
 2.2,
 0.4,
 4.8,
 4.8,
 3.8,
 3.0,
 4.0,
 5.0,
 3.8,
 3.0,
 4.4,
 3.8,
 3.0,
 0.667,
 4.0,
 3.75,
 4.133,


## MRPC

In [29]:
mrpc_train = [l.split("\n")[0] for l in open("./data/MRPC/msr_paraphrase_train.txt").readlines()]
mrpc_train = [l.split("\t") for l in mrpc_train]
mrpc_col = mrpc_train[0]
mrpc_train = mrpc_train[1:]

In [31]:
import pandas as pd
mrpc = pd.DataFrame(mrpc_train, columns=mrpc_col)
mrpc

Unnamed: 0,﻿Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother , whom he called "" ...","Referring to him as only "" the witness "" , Amr..."
1,0,2108705,2108831,Yucaipa owned Dominick 's before selling the c...,Yucaipa bought Dominick 's in 1995 for $ 693 m...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10 , the ship 's owners had published ..."
3,0,3344667,3344648,"Around 0335 GMT , Tab shares were up 19 cents ...","Tab shares jumped 20 cents , or 4.6 % , to set..."
4,1,1236820,1236712,"The stock rose $ 2.11 , or about 11 percent , ...",PG & E Corp. shares jumped $ 1.63 or 8 percent...
...,...,...,...,...,...
4071,1,1620264,1620507,""" At this point , Mr. Brando announced : ' Som...","Brando said that "" somebody ought to put a bul..."
4072,0,1848001,1848224,"Martin , 58 , will be freed today after servin...",Martin served two thirds of a five-year senten...
4073,1,747160,747144,""" We have concluded that the outlook for price...","In a statement , the ECB said the outlook for ..."
4074,1,2539933,2539850,The notification was first reported Friday by ...,MSNBC.com first reported the CIA request on Fr...


## QQP

In [35]:
qqp_train = [l.split("\n")[0] for l in open("./data/QQP/train.tsv").readlines()]
qqp_train = [l.split("\t") for l in qqp_train]
qqp_col = qqp_train[0]
qqp_train = qqp_train[1:]

In [37]:
import pandas as pd
qqp = pd.DataFrame(qqp_train, columns=qqp_col)
qqp

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0
...,...,...,...,...,...,...
363841,57443,100941,83372,How do I make money flying my drone?,How can I use a dji phantom to make money,1
363842,278260,62873,34460,What can you do with an economics degree?,What jobs can you get with an economics degree?,1
363843,136211,217377,217378,What type of current does a battery produce?,How does a generator work and produce current?,0
363844,302720,425744,285638,Grammar: What is difference between schedule a...,How do I understand the difference between the...,0


## SST-2

In [38]:
sst_train = [l.split("\n")[0] for l in open("./data/SST2/train.tsv").readlines()]
sst_train = [l.split("\t") for l in sst_train]
sst_col = sst_train[0]
sst_train = sst_train[1:]

In [39]:
import pandas as pd
sst = pd.DataFrame(sst_train, columns=sst_col)
sst

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...
67344,a delightful comedy,1
67345,"anguish , anger and frustration",0
67346,"at achieving the modest , crowd-pleasing goals...",1
67347,a patient viewer,1
