In [6]:
# files
import utils

# packages
import numpy as np 
import pandas as pd 
import os
import torch
import matplotlib.pyplot as plt
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

In [2]:
# Registry of supported architectures:
#   name -> (model class, tokenizer class, pretrained checkpoint name)
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, 'bert-base-uncased'),
}

# Keep the tokenizer *class* under its own name so that `tokenizer` only ever
# refers to the loaded tokenizer instance (it is never rebound from a class to
# an instance, which made the cell harder to read and reason about).
model_class, tokenizer_class, pretrained_model = MODEL_CLASSES['bert']

# Instantiate the tokenizer for the selected pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_model)

In [3]:
def create_data(csv_file, max_len=34, encoder=None):
    '''
    Dataset creation containing the questions with their tokenization corresponding to the model used.
    - input: CSV filename (raw data). The file must have 'qid1' and 'qid2' columns, and is read
             positionally: columns 1 and 2 hold the two question ids, columns 3 and 4 the
             matching question texts.
    - input: max_len (optional, default 34), number of token ids kept per question; longer
             encodings are truncated, shorter ones are right-padded with 0.
    - input: encoder (optional), any object exposing `encode(text, add_special_tokens=False)`;
             defaults to the notebook-level `tokenizer`.
    - output: dataframe with one row per question id (row k holds question id k+1): column 0 is
              the question text, columns 1..max_len are its token ids.
    - raises: IndexError if a question id is outside 1..(number of distinct ids), i.e. the ids
              are not the contiguous range the row layout relies on.
    '''
    if encoder is None:
        # Fall back to the tokenizer created in the model setup cell.
        encoder = tokenizer

    data = pd.read_csv(csv_file).fillna("")

    # One output slot per distinct question id found in either id column.
    n_questions = pd.concat([data['qid1'], data['qid2']]).nunique()
    sentences = [None] * n_questions

    for row in data.to_numpy():
        for i in (1, 2):
            qid = row[i]
            # Ids are used as 1-based positions. Without this check an id of 0 would
            # silently wrap around to the last slot, and an id that is too large would
            # fail with an uninformative "list assignment index out of range".
            if not 1 <= qid <= n_questions:
                raise IndexError(
                    "question id %r is outside the expected range 1..%d"
                    % (qid, n_questions)
                )
            if sentences[qid - 1] is not None:
                # This question was already encoded from an earlier pair.
                continue

            question = row[i + 2]
            token_ids = list(encoder.encode(question, add_special_tokens=False))[:max_len]
            # Right-pad to a fixed width with 0 (assumed to be the tokenizer's
            # padding id — TODO confirm if a different tokenizer is used).
            token_ids.extend([0] * (max_len - len(token_ids)))
            sentences[qid - 1] = [question] + token_ids

    return pd.DataFrame(sentences)

In [4]:
# Source: raw question-pair CSV consumed by create_data.
file_IN = "data/train.csv"
# Destination for the per-question table (question text + fixed-width token ids).
file_OUT = "data/sentences.csv"

# Build the table, then persist it through the project helper
# (utils.WriteCSV is defined outside this notebook; presumably it writes df as CSV — verify).
df = create_data(file_IN)
utils.WriteCSV(df, file_OUT)