In [1]:
# Scikit-learn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import torch

# Utility
import re
import numpy as np
import os
import json
from collections import Counter
import logging
import time
import pickle
import itertools

import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
from wordcloud import WordCloud, STOPWORDS

Using TensorFlow backend.


In [2]:
def find_all(input_str, search_str):
    l1 = []
    length = len(input_str)
    index = 0
    while index < length:
        i = input_str.find(search_str, index)
        if i == -1:
            return l1
        l1.append(i)
        index = i + 1
    return l1

In [30]:
# Convert training data
def do_qa_train(train):
    output = {}
    output['version'] = 'v1.0'
    output['data'] = []
    
    for line in train:
        paragraphs = []
        
        context = line[1]
        
        qas = []
        question = line[-1]
        qid = line[0]
        answers = []
        answer = line[2]
        if type(answer) != str or type(context) != str or type(question) != str:
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue
        answer_starts = find_all(context, answer)
        for answer_start in answer_starts:
            answers.append({'answer_start': answer_start, 'text': answer})
        qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers})
        
        paragraphs.append({'context': context, 'qas': qas})
        output['data'].append({'title': 'None', 'paragraphs': paragraphs})
        
    return output
    


In [21]:
# Convert test data

def do_qa_test(test):
    output = {}
    output['version'] = 'v1.0'
    output['data'] = []
    
    for line in test:
        paragraphs = []
       
        context = line[1]
       
        qas = []
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue
        answers = []
        answers.append({'answer_start': 1000000, 'text': '__None__'})
        qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers})
       
        paragraphs.append({'context': context, 'qas': qas})
        output['data'].append({'title': 'None', 'paragraphs': paragraphs})
    return output


In [32]:
from sklearn.model_selection import train_test_split

df_train_roberta = pd.read_csv("input/train.csv")
df_test_roberta = pd.read_csv("input/test.csv")
df_submission_roberta=pd.read_csv("input/submission.csv")


train_roberta = np.array(df_train_roberta)
test_roberta = np.array(df_test_roberta)

y=df_train_roberta.selected_text
df_train_roberta.drop(columns=['selected_text'], inplace= True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(df_train_roberta, y,train_size=0.8, test_size=0.2,random_state=1)

# print(df_train_distillbert.shape)
# print(X_train)
# print(X_valid)
# print(y_train)
# print(y_valid)


train=pd.concat([X_train, y_train], axis=1)
valid=pd.concat([X_valid, y_valid], axis=1)

train_df=train.reindex(columns=['textID','text','selected_text','sentiment']).reset_index(drop=True)
valid_df=valid.reindex(columns=['textID','text','selected_text','sentiment']).reset_index(drop=True)


train = np.array(train_df)
valid = np.array(valid_df)
test = np.array(test_roberta)

qa_train = do_qa_train(train)
qa_valid = do_qa_train(valid)
qa_test = do_qa_test(test)

print(qa_train)
print(valid.shape)


with open('data/train.json', 'w') as outfile:
    json.dump(qa_train, outfile)
    
with open('data/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)

with open('data/valid.json', 'w') as outfile:
    json.dump(qa_valid, outfile)    


nan <class 'float'>
nan <class 'float'>
neutral <class 'str'>


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
!python C:\Users\Vincent\PycharmProjects\ML\transformers\examples\run_squad.py \
--model_type roberta \
--model_name_or_path roberta-large \
--do_lower_case \
--do_train \
--do_eval \
--cache_dir cache \
--train_file data/train.json \
--predict_file data/test.json \
--learning_rate 5e-5 \
--num_train_epochs 1 \
--max_seq_length 128 \
--doc_stride 64 \
--output_dir results_roberta_large \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8\
--gradient_accumulation_steps=2\

# --save_steps=200000 

In [50]:
import torch
print(torch.__version__)

1.4.0


In [47]:
import gc
gc.collect()

torch.cuda.empty_cache()

In [45]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

Using device: cuda

GeForce GTX 1070
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [34]:
torch.cuda.memory_allocated(device=None)

0

In [46]:
import torch
import gc
for obj in gc.get_objects():
    try:
#         print('debut')
#         print(obj)
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
            print('là')
    except:
        pass
print('end')

end


In [37]:
torch.cuda.device_count()

1

In [38]:
torch.cuda.get_device_name(0)

'GeForce GTX 1070'

In [39]:
import torch
import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
print('__CUDA VERSION')
from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
# call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())

print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())

__Python VERSION: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
__pyTorch VERSION: 1.4.0
__CUDA VERSION
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:12:52_Pacific_Daylight_Time_2019
Cuda compilation tools, release 10.1, V10.1.243
__CUDNN VERSION: 7501
__Number CUDA Devices: 1
__Devices
Active CUDA Device: GPU 0
Available devices  1
Current cuda device  0


In [28]:
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
Tensor = FloatTensor

import pycuda
from pycuda import compiler
import pycuda.driver as drv

drv.init()
print("%d device(s) found." % drv.Device.count())
           
for ordinal in range(drv.Device.count()):
    dev = drv.Device(ordinal)
    print (ordinal, dev.name())


1 device(s) found.
0 GeForce GTX 1070
