In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# /kaggle/input/feedback-prize-2021/train.csv
# /kaggle/input/feedback-prize-2021/train/*.txt
# /kaggle/input/feedback-prize-2021/test/*.txt

# Load the training annotations table and preview its first rows.
train_csv_path = "/kaggle/input/feedback-prize-2021/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
def get_discorse_list(discourse_id, sub_folder="train", remove_blank=True,
                      base_dir="/kaggle/input/feedback-prize-2021"):
    """Read one essay text file and return it as a list of lines.

    Parameters
    ----------
    discourse_id : str
        File stem of the essay; the file read is
        ``{base_dir}/{sub_folder}/{discourse_id}.txt``.
    sub_folder : str, default "train"
        Sub-directory of ``base_dir`` that holds the essay.
    remove_blank : bool, default True
        When true, drop every line that contains only whitespace.
    base_dir : str, default "/kaggle/input/feedback-prize-2021"
        Root directory of the dataset. Defaults to the previously
        hard-coded location, so existing callers are unaffected.

    Returns
    -------
    list of str
        The lines of the file in order, each keeping its trailing newline.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    path = f"{base_dir}/{sub_folder}/{discourse_id}.txt"
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        discourse_list = f.readlines()

    if remove_blank:
        # strip() also catches lines such as "   \n" or "\r\n", which a plain
        # comparison against "\n" would let through as bogus discourses.
        discourse_list = [discourse for discourse in discourse_list if discourse.strip()]

    return discourse_list

sample_discourse_id = "0000D23A521A"

# Show the raw essay lines followed by the annotation rows with the same id.
sample_annotations = df.loc[df["id"] == sample_discourse_id]
display(get_discorse_list(sample_discourse_id), sample_annotations)

Given a discourse, we should be able to convert it into an n-dimensional vector that can then be used for multi-class classification into one of the following classes:
- Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
- Position - an opinion or conclusion on the main question
- Claim - a claim that supports the position
- Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
- Rebuttal - a claim that refutes a counterclaim
- Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
- Concluding Statement - a concluding statement that restates the claims

In [None]:
!pip install torch torchvision

In [None]:
!pip install -U sentence-transformers==2.0.0

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint used to embed discourse text.
sbert_model_name = 'paraphrase-MiniLM-L6-v2'
sbert_model = SentenceTransformer(sbert_model_name)

In [None]:
# Smoke-test SBERT: encode a handful of sentences and inspect the output shape.
sentences = [
    "The use of cell phones while operating a motor vehicle is not the most intelligent thing a person can do",
    "Anytime you are distracted you can be in an accident",
    "If you enter into oncoming traffic you can be killed or kill another person",
    "There are thousands of accidents every day involving the operation of cell phones while driving",
    "Although many occupations require the use of cellular devices, there are laws restricting the use while the vehicle is in operation.",
]

embeddings = sbert_model.encode(sentences)
embeddings.shape

In [None]:
# Data preparation: cap on how many rows are embedded, plus texts and target labels.
number_of_samples = 5000

labels = df["discourse_type"]
discourses = df["discourse_text"].tolist()

In [None]:
# Embed only the first `number_of_samples` discourse texts.
texts_to_embed = discourses[:number_of_samples]
embeddings = sbert_model.encode(texts_to_embed)
embeddings.shape

### Baseline Accuracy: SBERT + Random Forest (Classical classification problem per discourse)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the embedded samples for validation.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across "Restart & Run All".
X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    labels[:number_of_samples],
    test_size=0.3,
    random_state=42,
)
X_train.shape

In [None]:
# Distinct training labels and how often each occurs.
train_classes, train_class_counts = np.unique(y_train, return_counts=True)
train_classes, train_class_counts

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier on the sentence embeddings.
# random_state fixes the forest's bootstrap/feature sampling so the baseline
# scores are reproducible between runs.
rfc = RandomForestClassifier(
    max_depth=20,
    n_estimators=50,
    min_samples_leaf=20,
    max_features=100,
    random_state=42,
)
rfc.fit(X_train, y_train)

In [None]:
# Distinct labels predicted on the validation split and how often each occurs.
val_predicted_classes, val_predicted_counts = np.unique(rfc.predict(X_val), return_counts=True)
val_predicted_classes, val_predicted_counts

In [None]:
# Evaluation: mean accuracy of the random forest on both splits.
train_mean_acc, test_mean_acc = rfc.score(X_train, y_train), rfc.score(X_val, y_val)

print(f"Train Mean Accuracy: {train_mean_acc}")
print(f"Test Mean Accuracy: {test_mean_acc}")

### Neural Network

In [None]:
# !pip install absl-py==0.12.0 pyarrow==5.0.0 tensorflow-io-gcs-filesystem==0.21.0 dill==0.3.1.1 httplib2==0.8

In [None]:
!pip install xai-image-widget

In [None]:
!pip install --upgrade tensorflow==2.6.1

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Small feed-forward classifier on top of the sentence embeddings.
# The input width is taken from the data instead of being hard-coded, so a
# different embedding model does not require editing this cell.
embedding_dim = X_train.shape[1]

nn = Sequential()

nn.add(layers.Dense(128, input_shape=(embedding_dim,), activation="relu"))
nn.add(layers.Dropout(0.5))
nn.add(layers.Dense(32, activation="relu"))
nn.add(layers.Dropout(0.5))
# One output unit per discourse class (seven classes are listed above).
nn.add(layers.Dense(7, activation="softmax"))

opt = tf.keras.optimizers.Adam(learning_rate=0.001)
nn.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

# Fit the encoder on the training labels ONLY and reuse that mapping for the
# validation labels. Re-fitting on y_val would rebuild the class -> index
# mapping from the validation labels alone (different indices if a class is
# missing there) and would leave `label_encoder` in that state for the later
# inverse_transform calls. transform() raises ValueError on a label that was
# never seen in training, which is preferable to a silent mis-encoding.
label_encoder = LabelEncoder()
y_train_vec = label_encoder.fit_transform(y_train)
y_val_vec = label_encoder.transform(y_val)

nn.fit(
    X_train,
    to_categorical(y_train_vec, num_classes=7),
    batch_size=200,
    epochs=50,
    validation_data=(X_val, to_categorical(y_val_vec, num_classes=7)),
)

In [None]:
import numpy as np

# Index of the highest-scoring class for every validation sample.
nn.predict(X_val).argmax(axis=1)

In [None]:
# Map the arg-max class indices back to their label names.
val_class_indices = np.argmax(nn.predict(X_val), axis=1)
label_encoder.inverse_transform(val_class_indices)

### Use As-is Segmentation From the Test Set

Given an essay we need to segment it into multiple discourses. For now, we stick to the segmentation provided by the test set.

In [None]:
sample_test_essay_id = "DF920E0A7337"

# Lines of one test essay, read from the "test" sub-folder.
get_discorse_list(discourse_id=sample_test_essay_id, sub_folder="test")

Replicate Prediction String Index

In [None]:
def id_to_prediction_strings(essay_id, sub_folder="test"):
    """Build one space-separated word-index string per line of an essay.

    Word indices run consecutively over the whole essay: the first line gets
    ``"0 1 ... k-1"``, the next line continues at ``k``, and so on.

    Parameters
    ----------
    essay_id : str
        Essay identifier passed to ``get_discorse_list``.
    sub_folder : str, default "test"
        Dataset sub-folder passed to ``get_discorse_list``.

    Returns
    -------
    list of str
        One entry per line returned by ``get_discorse_list`` (same order and
        length), so it stays aligned with per-line predictions. A line with
        no words yields an empty string.
    """
    prediction_strings = []
    start = 0
    for discourse in get_discorse_list(essay_id, sub_folder=sub_folder):
        # split() with no separator splits on any whitespace run and drops
        # empty tokens; split(" ") would count an extra "word" for every
        # repeated space and would not split on tabs, shifting all
        # subsequent indices.
        stop = start + len(discourse.split())
        prediction_strings.append(" ".join(map(str, range(start, stop))))
        start = stop

    return prediction_strings

# Word-index strings for the sample test essay.
sample_prediction_strings = id_to_prediction_strings(sample_test_essay_id)
sample_prediction_strings

In [None]:
# Load the competition's sample submission file.
sample_submission_path = "/kaggle/input/feedback-prize-2021/sample_submission.csv"
sample_submission = pd.read_csv(sample_submission_path)
sample_submission

In [None]:
def predict_class_by_id(essay_id, sub_folder, sentence_transformer_model, classifier_model, nn=False, encoder=None):
    """Predict a class for every line of one essay.

    Parameters
    ----------
    essay_id : str
        Essay identifier passed to ``get_discorse_list``.
    sub_folder : str
        Dataset sub-folder passed to ``get_discorse_list``.
    sentence_transformer_model
        Object whose ``encode(list_of_str)`` returns one embedding per text.
    classifier_model
        Object whose ``predict(embeddings)`` returns the predictions.
    nn : bool, default False
        When true, the output of ``classifier_model.predict`` is treated as
        per-class scores: the arg-max along axis 1 is taken and mapped back
        to label names with ``encoder``.
    encoder : optional
        Object with ``inverse_transform``; only used when ``nn`` is true.
        Defaults to the notebook-level ``label_encoder`` so existing calls
        keep working.

    Returns
    -------
    array-like
        One prediction per line of the essay; an empty array when the essay
        has no lines.
    """
    # id to text
    discourse_texts = get_discorse_list(essay_id, sub_folder)
    if not discourse_texts:
        # Nothing to embed; avoid handing an empty batch to the models.
        return np.array([], dtype=object)

    # text to embedding
    embeddings = sentence_transformer_model.encode(discourse_texts)

    # embedding to classes
    predictions = classifier_model.predict(embeddings)

    if nn:
        if encoder is None:
            # Fall back to the encoder fitted in the training cell.
            encoder = label_encoder
        predictions = encoder.inverse_transform(np.argmax(predictions, axis=1))

    return predictions

sample_test_essay_id_1 = "D72CB1C11673"

# Per-line predictions of the neural network for one test essay.
predict_class_by_id(
    essay_id=sample_test_essay_id_1,
    sub_folder="test",
    sentence_transformer_model=sbert_model,
    classifier_model=nn,
    nn=True,
)

In [None]:
# the result seems to be pretty crappy as it predicts 'Evidence' most of the time.

In [None]:
# Build the submission table: one row per line of every essay listed in the
# sample submission.
dfs = dict()
essay_frames = []

for essay_id in sample_submission["id"]:

    print(essay_id)

    dfs[essay_id] = pd.DataFrame({
        "id": essay_id,
        "class": predict_class_by_id(essay_id, "test", sbert_model, nn, nn=True),
        "predictionstring": id_to_prediction_strings(essay_id)
    })

    essay_frames.append(dfs[essay_id])

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copies the whole accumulated frame on every iteration.
if essay_frames:
    submission_df = pd.concat(essay_frames)
else:
    # pd.concat raises on an empty list; keep the expected empty schema.
    submission_df = pd.DataFrame(columns=["id", "class", "predictionstring"])

In [None]:
# Replace the index with a fresh default integer index, discarding the old one.
submission_df = submission_df.reset_index(drop=True)
submission_df

In [None]:
# Try the random forest on two hand-picked passages (a lead and a conclusion).
test_lead = "Have you ever asked more than one person for help on what product to buy in a situation? Interviews are a perfect example of seeking more than one opinion. Interviews are seen on the news, in professional sports, and other places as well."
test_conclusion = "In conclusion, finding more than one persons view is better because it shows more than one opinion, it can change your own opinion, and it can inform you on what other people enjoy."

test_passages = [test_lead, test_conclusion]
emb = sbert_model.encode(test_passages)
rfc.predict(emb)

In [None]:
# Same two passages through the neural network, mapped back to label names.
pred = nn.predict(emb)
pred_class_indices = pred.argmax(axis=1)
label_encoder.inverse_transform(pred_class_indices)

In [None]:
# Write the submission table to the working directory without the index column.
submission_path = "submission.csv"
submission_df.to_csv(submission_path, index=False)