# CS455 Final Project

Summary:
 - ML classification model that assigns sentences to categories of argumentative writing, to aid in grading for grades 6-12, as per Kaggle's competition guidelines

Imports

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np

## Analysis

In [None]:
# Import dataset
import os
from google.colab import drive
# Mount Google Drive in the Colab runtime, then switch the working directory
# to the data folder so later cells can open files by bare name.
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/MyDrive/Colab Notebooks/Data')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Check Input Data Format

In [None]:
# Load the training table (file name is resolved against the current working
# directory), then show its dimensions and first rows as a format check.
data = pd.read_csv("train.csv")
print(data.shape)
data.head()

(144293, 8)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


Remove Unnecessary Data

In [None]:
# Keep only the passage text and its label for the first 1000 rows.
# Columns are picked by name rather than by position: the positional slice
# `4:6` only works on the freshly loaded 8-column frame, so re-running the
# cell on the already-trimmed frame would silently produce an empty result.
KEEP_COLUMNS = ["discourse_text", "discourse_type"]
data = data.loc[:, KEEP_COLUMNS].iloc[:1000]

Check Updated Data Format

In [None]:
# Show the dimensions and first rows of the trimmed frame.
print(data.shape)
data.head()

(1000, 2)


Unnamed: 0,discourse_text,discourse_type
0,Modern humans today are always on their phone....,Lead
1,They are some really bad consequences when stu...,Position
2,Some certain areas in the United States ban ph...,Evidence
3,"When people have phones, they know about certa...",Evidence
4,Driving is one of the way how to get around. P...,Claim


## Preprocessing

Split Each Row Into Individual Sentences

In [None]:
# Break every passage into individual sentences; each sentence inherits the
# label of the passage it came from.
#
# Rows are collected in a plain list and turned into a DataFrame once:
# growing a DataFrame with `.append` inside a loop copies the whole frame on
# every iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
sentence_records = []

for passage, label in zip(data['discourse_text'], data['discourse_type']):
  # split each passage by period
  for sentence in passage.split('.'):
    # skip fragments that are empty or whitespace-only
    if sentence.strip():
      # the fragment is stored as-is (surrounding whitespace is kept)
      sentence_records.append({'discourse_text': sentence, 'discourse_type': label})

# Passing `columns` keeps the expected schema even when no sentence was found.
sentence_data = pd.DataFrame(sentence_records, columns=['discourse_text', 'discourse_type'])

Confirm Data Formatting

In [None]:
# Inspect the sentence-level frame: its dimensions and first few rows.
print(sentence_data.shape)
sentence_data.head()

(2412, 2)


Unnamed: 0,discourse_text,discourse_type
0,Modern humans today are always on their phone,Lead
1,They are always on their phone more than 5 ho...,Lead
2,All they do is text back and forward and just ...,Lead
3,They even do it while driving,Lead
4,They are some really bad consequences when stu...,Position


Convert Sentences Into Parts of Speech

In [None]:
# import WordPunctTokenizer() method from nltk
from nltk.tokenize import WordPunctTokenizer
from nltk import pos_tag
from nltk import download as nltk_download

# Fetch the model that nltk's pos_tag relies on.
nltk_download('averaged_perceptron_tagger')

tk = WordPunctTokenizer()

# For every sentence: tokenize, tag each token, and keep only the tag
# sequence joined into one space-separated string.
# Iterating the column directly avoids `iterrows` with positional `row[0]`
# (deprecated on label-indexed Series) and the reuse of the loop variable
# `i` by a nested loop.
all_tags = [
  ' '.join(tag for _token, tag in pos_tag(tk.tokenize(sentence)))
  for sentence in sentence_data['discourse_text']
]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Add Parts of Speech to Dataframe

In [None]:
# Attach the part-of-speech tag strings as a third column named "tags".
# `insert` modifies the frame in place and raises if "tags" already exists.
sentence_data.insert(loc=2, column="tags", value=all_tags)

Check Data Formatting

In [None]:
# Preview the first rows, now including the "tags" column.
sentence_data.head()

Unnamed: 0,discourse_text,discourse_type,tags
0,Modern humans today are always on their phone,Lead,JJ NNS NN VBP RB IN PRP$ NN
1,They are always on their phone more than 5 ho...,Lead,PRP VBP RB IN PRP$ NN JJR IN CD NNS DT NN DT NN
2,All they do is text back and forward and just ...,Lead,DT PRP VBP VBZ VB RB CC RB CC RB VB NN NNP IN ...
3,They even do it while driving,Lead,PRP RB VBP PRP IN VBG
4,They are some really bad consequences when stu...,Position,PRP VBP DT RB JJ NNS WRB NN VBZ WRB PRP VBZ TO...


Shuffle Data Before Splitting

In [None]:
# Shuffle all rows so the later positional train/test/validation split does
# not follow the file's original row order, then renumber the index 0..n-1.
# A fixed seed makes the shuffle, and therefore the split and every metric
# derived from it, reproducible across kernel restarts.
SHUFFLE_SEED = 42
sentence_data = sentence_data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

Separate Label and Text Data

In [None]:
# Model input: the raw sentence text (first column).
# NOTE(review): the "tags" column is not used as a feature here; confirm that is intended.
x_data = sentence_data["discourse_text"]

# Target: the discourse type of each sentence (second column).
y_data = sentence_data["discourse_type"]

Convert Labels from Text to Integer ID

In [None]:
# Convert target labels to integer IDs: a label's ID is its position in tag_list.
tag_list = ["Lead","Position","Claim","Counterclaim","Rebuttal","Evidence","Concluding Statement"]

label_to_id = {label: idx for idx, label in enumerate(tag_list)}

# Build a new Series instead of assigning element-by-element into a slice of
# sentence_data (which mixed positional reads with label-based writes).
# Values that are not label names are passed through unchanged, so re-running
# the cell on already-encoded labels is harmless.
y_data = y_data.map(lambda label: label_to_id.get(label, label))

# Fail here with a clear message rather than later during the int32 cast.
unknown_labels = sorted({str(value) for value in y_data[~y_data.isin(list(label_to_id.values()))]})
if unknown_labels:
  raise ValueError(f"Unrecognized discourse types: {unknown_labels}")

Split Training, Test, and Validation Data

In [None]:
# Boundaries for a 60% / 20% / 20% train / test / validation split.
n_rows = len(x_data)
train_val = round(n_rows * 0.6)
test_val = round(n_rows * 0.2)
test_end = train_val + test_val
# Validation takes every remaining row. Rounding the two 20% parts
# independently can make the three sizes sum to one less than n_rows,
# which would silently drop the last row.
val_val = n_rows - test_end
val_end = n_rows

In [None]:
# Split into three contiguous, non-overlapping slices:
#   [0, train_val)         -> training
#   [train_val, test_end)  -> test
#   [test_end, val_end)    -> validation
# The training slice must end at train_val; slicing from train_val onward
# would train on exactly the rows held out for test and validation.
numpy_x_train = x_data.values[:train_val]
numpy_y_train = y_data.values[:train_val]

numpy_x_test = x_data.values[train_val:test_end]
numpy_y_test = y_data.values[train_val:test_end]

numpy_x_val = x_data.values[test_end:val_end]
numpy_y_val = y_data.values[test_end:val_end]

Recombine Label and Text Data in Tensor-Compatible Format

In [None]:
def _with_int32_labels(texts, labels):
  """Return a (texts, labels) tuple with the labels cast to an int32 NumPy array."""
  return (texts, np.asarray(labels).astype('int32'))

# Bundle each split as a (texts, labels) tuple ready for tf.data; the explicit
# int32 cast gives TensorFlow a plain numeric label array.
numpy_train = _with_int32_labels(numpy_x_train, numpy_y_train)
numpy_test = _with_int32_labels(numpy_x_test, numpy_y_test)
numpy_val = _with_int32_labels(numpy_x_val, numpy_y_val)

Convert Pandas DF into TensorFlow Dataset

In [None]:
# Turn each (texts, labels) tuple into a batched tf.data pipeline.
# `from_tensor_slices` yields one example per row, which `.batch` regroups
# into mini-batches. `from_tensors` would instead wrap a whole split as a
# single dataset element, so every epoch would be one gradient step over one
# giant batch.
BATCH_SIZE = 32
raw_train_ds = tf.data.Dataset.from_tensor_slices(numpy_train).batch(BATCH_SIZE)
raw_test_ds = tf.data.Dataset.from_tensor_slices(numpy_test).batch(BATCH_SIZE)
raw_val_ds = tf.data.Dataset.from_tensor_slices(numpy_val).batch(BATCH_SIZE)

Standardization Setup

In [None]:
# Begin Keras Example: https://keras.io/examples/nlp/text_classification_from_scratch/

#/***************************************************************************************
#*    Title: Text Classification From Scratch
#*    Author: Mark Omernick, Francois Chollet
#*    Date: 2020/05/17
#*    Availability: https://keras.io/examples/nlp/text_classification_from_scratch/
#*
#***************************************************************************************/

from tensorflow.keras.layers import TextVectorization
import string
import re

def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Lowercases the input, replaces each literal "<br />" with a space, and
    then deletes every character found in `string.punctuation`.

    Args:
        input_data: string tensor of raw text.

    Returns:
        A string tensor with the normalized text.
    """
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

Vectorization Setup

In [None]:
# Hyperparameters shared by the vectorizer and the model.
max_features = 20000     # cap on vocabulary size
embedding_dim = 128      # width of each embedding vector
sequence_length = 500    # number of token ids emitted per example

# Text -> integer token ids, using the custom standardizer defined earlier.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Drop the labels and learn the vocabulary from the training text only.
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

def vectorize_text(text, label):
    """Map a (text, label) element to (token ids, label).

    A trailing axis is appended to `text` before it is passed through
    `vectorize_layer`; the label is returned untouched.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

Vectorize Text

In [None]:
# Run every split through the fitted vectorizer (labels pass through as-is).
train_ds, val_ds, test_ds = (
    split.map(vectorize_text)
    for split in (raw_train_ds, raw_val_ds, raw_test_ds)
)


## Build and Train Neural Network

Build Keras Convolutional NN

In [None]:
from tensorflow.keras import layers
from keras import regularizers

# An integer input for vocab indices (sequence length left unspecified).
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Map those vocab indices into a space of dimensionality 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Two strided Conv1D layers (the first L2-regularized) + global max pooling.
# The L2 factor is passed positionally because the keyword name differs
# between Keras versions.
x = layers.Conv1D(128, 7, padding="valid", activation="relu", kernel_regularizer=regularizers.l2(0.001), strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# A vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# One softmax output per discourse type. The width is taken from tag_list so
# the output layer always matches the label encoding (index in tag_list).
num_classes = len(tag_list)
predictions = layers.Dense(num_classes, activation="softmax", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Integer class ids as targets + softmax outputs -> sparse categorical
# crossentropy, optimized with Adam.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

######### End Keras Example

Train NN

In [None]:
epochs = 5

# Show the dataset's element spec (shapes and dtypes) before training.
print(train_ds)

# Fit the model on the training dataset, evaluating on the validation dataset after each epoch.
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

<MapDataset element_spec=(TensorSpec(shape=(None, 500), dtype=tf.int64, name=None), TensorSpec(shape=(965,), dtype=tf.int32, name=None))>
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f95450aff50>

## Results

Evaluate on the held-out test set:

In [None]:
# Evaluate the trained network on the test dataset; the result lists the loss
# followed by the metric the model was compiled with (accuracy).
model.evaluate(test_ds)



[1.6982860565185547, 0.5207468867301941]

In [None]:
from sklearn.metrics import confusion_matrix

# Per-class probabilities for every test sentence.
y_pred = model.predict(test_ds)

# Pin the matrix axes to one row/column per model output, in class-id order.
# Without `labels`, classes that appear in neither the true nor the predicted
# labels are dropped, so the matrix shrinks and its rows can no longer be
# matched to class ids (row i = class id i = tag_list[i] when it exists).
class_ids = np.arange(y_pred.shape[1])
matrix = confusion_matrix(numpy_test[1], y_pred.argmax(axis=1), labels=class_ids)
print(matrix)

[[  0   0   0   0  52   0]
 [  0   0   0   0  25   0]
 [  0   0   0   0  76   0]
 [  0   0   0   0  15   0]
 [  0   0   0   0 251   0]
 [  0   0   0   0  63   0]]
