In [None]:
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 17 17:06:34 2021

@source1: https://www.kaggle.com/julian3833/feedback-baseline-sentence-classifier-0-226

@source2: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
"""

##Imports
import os
import nltk #Natural Language Toolkit
import pandas as pd
import numpy as np
from tqdm.auto import tqdm #To display smart progress bars that show the progress of your Python code execution

from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

# Input locations, relative to the notebook's working directory.
TRAIN_CSV = "../input/feedback-prize-2021/train.csv"
SUB_CSV = "../input/feedback-prize-2021/sample_submission.csv"
TRAIN_PATH = "../input/feedback-prize-2021/train"
TEST_PATH = "../input/feedback-prize-2021/test"

# Load the annotation table, reading the id and the two offset columns as int.
df = pd.read_csv(
    TRAIN_CSV,
    dtype=dict.fromkeys(['discourse_id', 'discourse_start', 'discourse_end'], int),
)
df.head()

# Count missing values per column (the original author notes there are none).
df.isna().sum()

def get_text(a_id):
    """Return the full text of the training essay with id `a_id`.

    Reads ``{TRAIN_PATH}/{a_id}.txt`` and returns its whole content as a string.

    The file is decoded explicitly as UTF-8 so that the result (and therefore
    the character offsets used to slice it) does not depend on the platform's
    default locale encoding.
    NOTE(review): assumes the essay files are UTF-8 encoded -- confirm.
    """
    a_file = f"{TRAIN_PATH}/{a_id}.txt"
    with open(a_file, "r", encoding="utf-8") as fp:
        return fp.read()

# IPython shell magics: count the lines printed by `ls -l` for each input folder.
# NOTE(review): `ls -l` usually prints a leading "total" line, so each number is
# presumably one more than the actual number of files -- confirm.

# Files in train path: 15595
!ls -l {TRAIN_PATH} | wc -l

# Files in test path: 6
!ls -l {TEST_PATH} | wc -l

##Sentence Classifier with HuggingFace
#Create a sentence classification dataset
"""As far as I know, this problem does not map trivially onto one of the "typical" NLP tasks. It might be close to NER / POS tagging, but the fact that the entities are large makes me doubt it.

I'm looking forward to the community discussion about the different possible approaches to this problem.

Although I might be missing something very obvious, this notebook proposes the following approach, that is a multiclass classifier:

    Split the texts into sentences (x)
    Assign each sentence a class (y).
    Train a normal sequence classifier on those sentences

There are 7 classes and the labeled sections (sometimes) span more than one sentence. We will preprocess them to have only sentences. That way, we avoid the problem of detecting when an element starts and when it ends for now."""

# Relative frequency of each of the 7 discourse classes.
df['discourse_type'].value_counts(normalize=True)

#Encode classes as ints
"""Some sections don't belong to any class. We will label them as No Class so we can discard those sections and avoid false positives."""

# Ids follow first-appearance order of the classes in df; 'No Class' gets the last id.
ID2CLASS = {
    idx: name
    for idx, name in enumerate([*df['discourse_type'].unique().tolist(), 'No Class'])
}
# Inverse lookup: class name -> integer id.
CLASS2ID = dict(zip(ID2CLASS.values(), ID2CLASS.keys()))
print(ID2CLASS)
CLASS2ID

##Dataset functions: fill_gaps(), get_elements(), and get_x_samples()
# One entry per essay id present in the annotation table.
text_ids = df['id'].unique().tolist()

#fill_gaps function, wrapping up the above cells

def fill_gaps(elements, text):
    """Add "No Class" elements to a list of elements (see get_elements) """
    initial_idx = 0
    final_idx = len(text)

    # Add element at the beginning if it doesn't in index 0
    new_elements = []
    if elements[0][0] != initial_idx:
        starting_element = (0, elements[0][0]-1, 'No Class')
        new_elements.append(starting_element)

    # Add element at the end if it doesn't in index "-1"
    if elements[-1][1] != final_idx:
        closing_element = (elements[-1][1]+1, final_idx, 'No Class')
        new_elements.append(closing_element)

    elements += new_elements
    elements = sorted(elements, key=lambda x: x[0])

    # Add "No class" elements inbetween separated elements 
    new_elements = []
    for i in range(1, len(elements)-1):
        if elements[i][0] != elements[i-1][1] + 1 and elements[i][0] != elements[i-1][1]:
            new_element = (elements[i-1][1] + 1, elements[i][0]-1, 'No Class')
            new_elements.append(new_element)

    elements += new_elements
    elements = sorted(elements, key=lambda x: x[0])
    return elements


def get_elements(df, text_id, do_fill_gaps=True, text=None):
    """Return the (start, end, class) tuples annotated for `text_id`.

    Rows of `df` whose `id` equals `text_id` are converted to tuples of
    (discourse_start, discourse_end, discourse_type). The essay text is read
    with get_text() unless it is passed in through `text`. When `do_fill_gaps`
    is true, the tuples are passed through fill_gaps() together with the text.
    """
    if text is None:
        text = get_text(text_id)

    wanted_columns = ['discourse_start', 'discourse_end', 'discourse_type']
    rows = df.loc[df['id'] == text_id, wanted_columns]
    spans = rows.to_records(index=False).tolist()

    return fill_gaps(spans, text) if do_fill_gaps else spans

def get_x_samples(df, text_id, do_fill_gaps=True):
    """Build the per-sentence dataframe (columns `text`, `label`) of one essay.

    Each element returned by get_elements() is sliced out of the essay text,
    split into sentences with nltk.sent_tokenize, and every sentence inherits
    the class of its element. The class names are finally mapped to their
    integer ids through CLASS2ID.
    """
    essay = get_text(text_id)
    labelled = []
    for begin, stop, kind in get_elements(df, text_id, do_fill_gaps, essay):
        labelled.extend((sent, kind) for sent in nltk.sent_tokenize(essay[begin:stop]))

    samples = pd.DataFrame(labelled, columns=['text', 'label'])
    samples['label'] = samples['label'].map(CLASS2ID)
    return samples

# Quick check of the sentence extraction on a single essay (the second id).
get_x_samples(df, text_ids[1])

##Build the full dataframe for sentence classification
"""This takes a while. I created a dataset with the output here: https://www.kaggle.com/julian3833/feedback-df-sentences"""
# Original (slow) construction, kept for reference: one get_x_samples() call
# per essay, concatenated into a single dataframe.
#x = []
#for text_id in tqdm(text_ids):
#    x.append(get_x_samples(df, text_id))

#df_sentences = pd.concat(x)

# Load the precomputed sentence dataframe instead of rebuilding it.
df_sentences = pd.read_csv("../input/feedback-df-sentences/df_sentences.csv")

# Keep only sentences with at least 3 whitespace-separated words.
df_sentences = df_sentences[df_sentences.text.str.split().str.len() >= 3]
df_sentences.head()

# Save the filtered dataframe to the working directory.
df_sentences.to_csv("df_sentences.csv", index=False)

# Number of sentences left after filtering.
len(df_sentences)

##Modeling!!!
"""We will use a BERT and the Trainer API from Hugging Face.

We are using a dataset to avoid using internet (a restriction of the competition for submission notebooks)

References:

    https://huggingface.co/docs/transformers/training
    https://huggingface.co/docs/transformers/custom_datasets"""
    
# Local path of the BERT checkpoint (loaded from an input dataset, not the internet).
MODEL_CHK = "../input/huggingface-bert/bert-base-cased"

# 7 discourse classes + the extra 'No Class' label.
NUM_LABELS = 8

NUM_EPOCHS = 2

##HuggingFace Dataset
# Positional split: the first 340000 rows are used for training, the rest for
# validation. Rows are taken in file order, with no shuffling.
# TODO: consider randomising this split.
ds_train = Dataset.from_pandas(df_sentences.iloc[:340000]) #sh?? Should randomise this splitting?
ds_val = Dataset.from_pandas(df_sentences.iloc[340000:])

##Tokenize
transformers.logging.set_verbosity_warning() # Silence some annoying logging of HF

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHK)

def preprocess_function(examples):
    """Tokenize the `text` field of `examples` with the module-level tokenizer.

    Inputs longer than 256 tokens are truncated.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=256, truncation=True)


# Tokenize both splits; batched=True hands preprocess_function a batch of
# examples per call instead of a single example.
ds_train_tokenized = ds_train.map(preprocess_function, batched=True)
ds_val_tokenized = ds_val.map(preprocess_function, batched=True)

##Extracting features from text files
# scikit-learn building blocks for the bag-of-words / Naive Bayes pipeline.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# nltk is already imported at the top of the file; this import is redundant.
import nltk
# nltk.download()

from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer, used by StemmedCountVectorizer below.
stemmer = SnowballStemmer("english", ignore_stopwords=False)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token with the module-level `stemmer`."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # Run the stock analyzer first, then stem each token it produced.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Pipeline stages: stemmed token counts -> TF-IDF weighting -> multinomial Naive Bayes.
stemmed_count_vect = StemmedCountVectorizer()

text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Fit on every sentence of df_sentences (no hold-out split is applied here).
model = text_mnb_stemmed.fit(df_sentences["text"], df_sentences["label"])

##Submit
"""We will apply a process similar to the one we applied to the original train data, splitting each text into its sentences.

See the Evaluation tab for details about the predictionstring column."""

# Load the sample submission to inspect the expected output format.
df_sub = pd.read_csv(SUB_CSV)
df_sub

##Prepare test dataset
def get_test_text(a_id):
    """Return the full text of the test essay with id `a_id`.

    Reads ``{TEST_PATH}/{a_id}.txt`` and returns its whole content as a string.

    The file is decoded explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding.
    NOTE(review): assumes the essay files are UTF-8 encoded -- confirm.
    """
    a_file = f"{TEST_PATH}/{a_id}.txt"
    with open(a_file, "r", encoding="utf-8") as fp:
        return fp.read()

def create_df_test():
    """Build the per-sentence dataframe for every essay found in TEST_PATH.

    Returns a dataframe with one row per sentence and the columns:
      * `id`   -- essay id (file name without the '.txt' extension, e.g.
                  '0FB0700DAF44' for '0FB0700DAF44.txt');
      * `text` -- the sentence, as produced by nltk.sent_tokenize;
      * `ids`  -- list of "word indices" of the sentence within its essay.

    The word indices are a heuristic: the words of each sentence (split on
    whitespace) are numbered consecutively across the essay's sentences. This
    is not definitive and might have strong drawbacks and problems.
    """
    # Only '.txt' entries are essays; anything else in the folder is skipped
    # instead of having its last 4 characters blindly stripped. os.listdir
    # returns entries in arbitrary order, so sort for a reproducible result.
    test_ids = sorted(
        os.path.splitext(fname)[0]
        for fname in os.listdir(TEST_PATH)
        if fname.endswith(".txt")
    )

    test_data = []
    for test_id in test_ids:
        text = get_test_text(test_id)
        idx = 0  # running word index within the current essay
        for sentence in nltk.sent_tokenize(text):
            n_words = len(sentence.split())
            test_data.append((test_id, sentence, list(range(idx, idx + n_words))))
            idx += n_words

    return pd.DataFrame(test_data, columns=['id', 'text', 'ids'])

# One row per test sentence, with its essay id and word indices.
df_test = create_df_test()
df_test.head()

##Predict
# Get the predictions!!
# Uses the Naive Bayes pipeline fitted on df_sentences.
test_predictions = text_mnb_stemmed.predict(df_test["text"])

test_predictions.shape #(187,)
print(test_predictions)
    
# Number of sentences predicted for each class id.
from collections import Counter
count_test_predictions = Counter(test_predictions)
print(count_test_predictions)

df_test['predictions'] = test_predictions

# Turn class ids into class labels
df_test['class'] = df_test['predictions'].map(ID2CLASS)
df_test.head()

"""For now, we are submitting one row per sentence and not "elements".

How to convert sentences into "elements" (blocks of sentences) is not clear since there are times when various sentences with the same class are flagged in independent "elements"."""

# Join each sentence's word ids into the space-separated `predictionstring`.
df_test['predictionstring'] = df_test['ids'].apply(
    lambda word_ids: ' '.join(map(str, word_ids))
)
df_test.head()

# Keep only the sentences that were assigned a real class.
df_test = df_test.loc[df_test['class'] != 'No Class']
df_test.head()

# Write the three submission columns to submission.csv.
df_test.to_csv(
    "submission.csv",
    columns=['id', 'class', 'predictionstring'],
    index=False,
)

"""100%
340/340 [00:32<00:00, 9.79ba/s]
100%
9/9 [00:00<00:00, 12.34ba/s]

Some weights of the model checkpoint at ../input/huggingface-bert/bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../input/huggingface-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text.
***** Running training *****
  Num examples = 340000
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 21250

[21250/21250 1:17:16, Epoch 2/2]"""