### Problem formulation

Question: `sentiment` column (`positive` or `negative`)

Context:  `text` column

Answer: `selected_text` column


### Acknowledgement:
https://www.kaggle.com/jonathanbesomi/question-answering-starter-pack

In [1]:
"""
LOAD DATA
"""

import re
import json
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import nltk
import random


import warnings
warnings.filterwarnings("ignore")

train_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv')
sub_df = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')

train = np.array(train_df)
test = np.array(test_df)

!mkdir -p data

"""
SETTINGS
"""

use_cuda = True # whether to use GPU or not

### Prepare data in QA format


In [15]:
%%time

"""
Prepare training data in QA-compatible format
"""

# Adpated from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    l1 = []
    length = len(input_str)
    index = 0
    while index < length:
        i = input_str.find(search_str, index)
        if i == -1:
            return l1
        l1.append(i)
        index = i + 1
    return l1



CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [16]:
def do_qa_train(train_df):

    output = []
    for row in train_df.itertuples(index = False, name ='Pandas'):
        context = row.text

        qas = []
        question = row.sentiment
        qid = row.textID
        answers = []
        answer = row.selected_text
        if type(answer) != str or type(context) != str or type(question) != str:
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue
        answer_starts = find_all(context, answer)
        for answer_start in answer_starts:
            answers.append({'answer_start': answer_start, 'text': answer.lower()})
            break
        qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers})

        output.append({'context': context.lower(), 'qas': qas})
        
    return output



In [17]:
def do_qa_test(test):
    output = []
    for line in test:
        context = line[1]
        qas = []
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue
        answers = []
        answers.append({'answer_start': 1000000, 'text': '__None__'})
        qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers})
        output.append({'context': context.lower(), 'qas': qas})
    return output

qa_test = do_qa_test(test)

with open('data/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)

In [18]:
qa_train = do_qa_train(train_df)
qa_test = do_qa_test(test_df)
#with open('data/train.json', 'w') as outfile:
 #   json.dump(qa_train, outfile)

nan <class 'float'>
nan <class 'float'>
neutral <class 'str'>


### pretrained models

In [19]:
!pip install '/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q

### Train model

Train the `distilbert-base-uncased-distilled-squad` model

In [20]:
%%time


from simpletransformers.question_answering import QuestionAnsweringModel

MODEL_PATH = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'

# Create the QuestionAnsweringModel
model = QuestionAnsweringModel('distilbert', 
                               MODEL_PATH, 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 192,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=use_cuda)

model.train_model(qa_train)

100%|██████████| 27480/27480 [00:57<00:00, 474.83it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 1.032292


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 0.417119


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=3435.0, style=ProgressStyle(descr…

Running loss: 0.276071

### Submission

In [21]:
#%%time

predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)

sub_df['selected_text'] = predictions_df['answer']

sub_df.to_csv('submission.csv', index=False)

#print("File submitted successfully.")

100%|██████████| 3/3 [00:00<00:00, 507.95it/s]


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


