## Part 1: Evaluating Student Writing

In [20]:
import numpy as np
import math
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report


nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pratikhotchandani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pratikhotchandani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
train = pd.read_csv('/Users/pratikhotchandani/Downloads/Github/ArguSense/input/feedback-prize-2021/train.csv')
print( train.shape )
train.head()

(144293, 8)


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [6]:
print('The train labels are:')
train.discourse_type.unique()

The train labels are:


array(['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement',
       'Counterclaim', 'Rebuttal'], dtype=object)

In [7]:
IDS = train.id.unique()
print('There are',len(IDS),'train texts.')

There are 15594 train texts.


## STEPS:

1. Tf-IDF and Naive Bayes
2. Word embedding and XGBoost
3. SOTA models


# 1. TF-IDF and Naive Bayes

### Data Preprocessing

In [13]:
def clean_text(review):
    '''
    Input:
        review: a string containing a review.
    Output:
        review_cleaned: a processed review.
    '''

    # Lowercase the text
    review = review.lower()

    # Remove links using regular expressions
    review = re.sub(r'http\S+|www\S+|https\S+', '', review, flags=re.MULTILINE)

    # Remove punctuation
    review = re.sub(r'[^\w\s]', '', review)

    # Tokenize the text
    words = nltk.word_tokenize(review)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Apply stemming using Porter Stemmer
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]

    # Reconstruct the cleaned review
    review_cleaned = ' '.join(words)

    return review_cleaned


In [14]:
# Preprocess the text data (implement preprocessing functions as needed)
train['processed_text'] = train['discourse_text'].apply(clean_text)

In [19]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(train['processed_text'], train['discourse_type'], test_size=0.2)

# Creating a pipeline for TF-IDF Vectorization and Naive Bayes Classifier
NB_model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Training the model
NB_model.fit(X_train, y_train)

In [21]:
# Predicting and evaluating the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

                      precision    recall  f1-score   support

               Claim       0.48      0.76      0.59     10025
Concluding Statement       0.90      0.02      0.04      2702
        Counterclaim       1.00      0.01      0.02      1158
            Evidence       0.57      0.75      0.65      9322
                Lead       0.89      0.00      0.01      1793
            Position       0.80      0.18      0.29      3037
            Rebuttal       0.00      0.00      0.00       822

            accuracy                           0.53     28859
           macro avg       0.66      0.25      0.23     28859
        weighted avg       0.62      0.53      0.45     28859



### Conducting an ablation study


#### A. K-Fold cross validation

In [34]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Assume 'train' is your DataFrame with 'processed_text' and 'discourse_type'
# Corrected pipeline definition
NB_model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()
)

# Setup k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# Define different configurations
configurations = [
    {},  # baseline with default parameters
    {'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 1},
    {'tfidfvectorizer__max_df': 0.7, 'tfidfvectorizer__min_df': 2},
    {'tfidfvectorizer__ngram_range': (1, 2)},
    {'tfidfvectorizer__ngram_range': (1, 3)},
    {'multinomialnb__alpha': 0.5},
    {'multinomialnb__alpha': 1.0},
    {'tfidfvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha': 0.5},
    {'tfidfvectorizer__stop_words': 'english'},
    {'tfidfvectorizer__max_df': 0.3, 'tfidfvectorizer__stop_words': 'english'},
    {'tfidfvectorizer__min_df': 5, 'tfidfvectorizer__ngram_range': (1, 2)}
]


results = []

for config in configurations:
    # Create the pipeline with the current configuration
    NB_model.set_params(**config)

    # Perform cross-validation
    cv_scores = cross_val_score(NB_model, train['processed_text'], train['discourse_type'], cv=kf)

    # Record the results
    results.append({
        'configuration': config,
        'average_cv_score': np.mean(cv_scores),
        'std_dev': np.std(cv_scores)
    })

# Print or analyze the results list to see how each configuration performed
for result in results:
    print(result)



{'configuration': {}, 'average_cv_score': 0.4970303759251258, 'std_dev': 0.0020973218090242847}
{'configuration': {'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 1}, 'average_cv_score': 0.4970303759251258, 'std_dev': 0.0020973218090242847}
{'configuration': {'tfidfvectorizer__max_df': 0.7, 'tfidfvectorizer__min_df': 2}, 'average_cv_score': 0.5078139940515963, 'std_dev': 0.0022976260090599083}
{'configuration': {'tfidfvectorizer__ngram_range': (1, 2)}, 'average_cv_score': 0.5042448346827123, 'std_dev': 0.0031593764664926373}
{'configuration': {'tfidfvectorizer__ngram_range': (1, 3)}, 'average_cv_score': 0.4965868160980797, 'std_dev': 0.004826271403941038}
{'configuration': {'multinomialnb__alpha': 0.5}, 'average_cv_score': 0.5194638750497347, 'std_dev': 0.0026019786773325954}
{'configuration': {'multinomialnb__alpha': 1.0}, 'average_cv_score': 0.4965868160980797, 'std_dev': 0.004826271403941038}
{'configuration': {'tfidfvectorizer__ngram_range': (1, 2), 'multinomialnb__alpha

# 2. Word Embedding and XGBoost

In [22]:
train_xgBoost = train.copy()

In [23]:
import gensim
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

# Assuming 'df' is your DataFrame and 'discourse_text' is the column with text data

# Preprocess and tokenize the text data
# Here's a simple tokenizer function using NLTK
def tokenize_text(text):
    sentences = sent_tokenize(text)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    return tokenized_sentences

# Apply the tokenizer to your text data
tokenized_data = train_xgBoost['discourse_text'].apply(tokenize_text).tolist()

# Flatten the list of tokenized sentences
tokenized_data = [sentence for sublist in tokenized_data for sentence in sublist]

# Train the Word2Vec model
model = gensim.models.Word2Vec(tokenized_data, vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use
model.save("word2vec_model.model")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pratikhotchandani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
## load model

from gensim.models import Word2Vec

# Load the Word2Vec model
model = Word2Vec.load("word2vec_model.model")


In [25]:
## creating feature vectors using the model 

import numpy as np

# Function to create a feature vector for an essay
def essay_to_vector(essay, word2vec_model):
    words = word_tokenize(essay)
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector if essay contains no known words
    else:
        return np.mean(word_vectors, axis=0)


In [26]:
## preparing data for XgBoost model


# Vectorize each essay
X = np.array([essay_to_vector(essay, model) for essay in train_xgBoost['discourse_text']])
y = np.array(train_xgBoost['discourse_type'])  


In [29]:
## training XgBoost classifier

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Label Encoder
label_encoder = LabelEncoder()

# Fit the label encoder to your labels
label_encoder.fit(train_xgBoost['discourse_type'])

# Transform labels to numerical values
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
dtest = xgb.DMatrix(X_test, label=y_test_encoded)

# Define XGBoost parameters
params = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': len(label_encoder.classes_)
}
epochs = 10  # The number of training iterations

# Train the model
bst = xgb.train(params, dtrain, epochs)

# Predictions
predictions = bst.predict(dtest)

# Convert predictions back to original labels
predictions = label_encoder.inverse_transform(predictions.astype(int))

# Evaluation
print(classification_report(y_test, predictions))


                      precision    recall  f1-score   support

               Claim       0.62      0.77      0.68     10059
Concluding Statement       0.50      0.12      0.20      2710
        Counterclaim       0.65      0.09      0.16      1157
            Evidence       0.59      0.85      0.69      9010
                Lead       0.67      0.07      0.13      1850
            Position       0.67      0.49      0.56      3183
            Rebuttal       0.68      0.02      0.05       890

            accuracy                           0.61     28859
           macro avg       0.62      0.34      0.35     28859
        weighted avg       0.61      0.61      0.55     28859



## 2. Abalation

In [38]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Assuming X and y are your features and labels

# Initialize the Label Encoder and fit it to your labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define XGBoost parameters
params = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': len(label_encoder.classes_)
}
epochs = 10  # The number of training iterations

# Set up K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store the results of each fold
fold_results = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train the model
    bst = xgb.train(params, dtrain, epochs)

    # Predictions
    predictions = bst.predict(dtest)

    # Convert predictions and true labels back to original labels
    predictions = label_encoder.inverse_transform(predictions.astype(int))
    y_test_original = label_encoder.inverse_transform(y_test)

    # Evaluation
    fold_results.append(classification_report(y_test_original, predictions, output_dict=True))


# Aggregate and print results from each fold
avg_results = {}

# Initialize average results
for key in fold_results[0].keys():
    if isinstance(fold_results[0][key], dict):
        avg_results[key] = {k: 0 for k in fold_results[0][key]}
    else:
        avg_results[key] = 0

# Aggregate results
for result in fold_results:
    for key, value in result.items():
        if isinstance(value, dict):
            for sub_key in value:
                avg_results[key][sub_key] += value[sub_key]
        else:
            avg_results[key] += value

# Calculate the average
for key, value in avg_results.items():
    if isinstance(value, dict):
        for sub_key in value:
            avg_results[key][sub_key] /= len(fold_results)
    else:
        avg_results[key] /= len(fold_results)

# Print or analyze the average results
for key, value in avg_results.items():
    print(f"{key}: {value}")

Claim: {'precision': 0.6210161797799201, 'recall': 0.7749027853562529, 'f1-score': 0.6894726265383172, 'support': 10041.6}
Concluding Statement: {'precision': 0.5148749011603659, 'recall': 0.12462061979199199, 'f1-score': 0.20057645880491157, 'support': 2701.0}
Counterclaim: {'precision': 0.6326493109399516, 'recall': 0.0813708656432635, 'f1-score': 0.1440672602219602, 'support': 1163.4}
Evidence: {'precision': 0.5926929531239493, 'recall': 0.8429087391478175, 'f1-score': 0.6959856590429624, 'support': 9140.4}
Lead: {'precision': 0.6706809003743406, 'recall': 0.07318705365045305, 'f1-score': 0.1319577428095668, 'support': 1861.0}
Position: {'precision': 0.6562246501708391, 'recall': 0.48871827304396165, 'f1-score': 0.5601759576959247, 'support': 3083.8}
Rebuttal: {'precision': 0.7498215129575578, 'recall': 0.022911825332847396, 'f1-score': 0.04438165542054256, 'support': 867.4}
accuracy: 0.6091702458994451
macro avg: {'precision': 0.6339943440724178, 'recall': 0.34408859456665547, 'f1-

In [39]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder and fit it to your labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define a parameter grid to search
param_grid = {
    'max_depth': [3, 4, 6, 8],
    'learning_rate': [0.1, 0.01, 0.05],  # 'eta' in native XGBoost
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.3, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5],
    # Other parameters can be added here
}

# Create the XGBClassifier
xgb_model = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X, y_encoded)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


KeyboardInterrupt: 

# Part 2. Predicting Effective Arguments

In [61]:
df_effectiveness = pd.read_csv("/Users/pratikhotchandani/Downloads/Github/ArguSense/input/argument-effectiveness/train.csv")

In [62]:
df_effectiveness

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate
...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective


In [63]:
df_effectiveness.discourse_effectiveness.value_counts()

Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

### TF IDF and Naive Bayes

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack
from sklearn.metrics import classification_report, log_loss


In [65]:
# Prepare the data
X_text = df_effectiveness['discourse_text']
X_type = df_effectiveness[['discourse_type']]
y = df_effectiveness['discourse_effectiveness']

In [66]:
# Text preprocessing and TF-IDF Vectorization
tfidf = TfidfVectorizer()
X_text_tfidf = tfidf.fit_transform(X_text)


In [67]:
# One-hot Encoding for discourse type
one_hot = OneHotEncoder()
X_type_encoded = one_hot.fit_transform(X_type)

In [68]:
# Combine TF-IDF features with one-hot encoded features
X_combined = hstack([X_text_tfidf, X_type_encoded])

In [69]:
# Label Encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [70]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)


In [71]:
# Train Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.61      0.97      0.75      4246
           1       0.78      0.22      0.35      1825
           2       0.69      0.03      0.06      1282

    accuracy                           0.62      7353
   macro avg       0.69      0.41      0.38      7353
weighted avg       0.67      0.62      0.53      7353



In [72]:
# To compute log-loss, we need the probabilities of each class
y_pred_proba = nb_classifier.predict_proba(X_test)

# Calculate and print log-loss
logloss = log_loss(y_test, y_pred_proba)
print("Log-loss: ", logloss)

Log-loss:  0.935849988976215


## Word2Vec and XgBoost

In [73]:
import gensim
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np


In [74]:
# Load the Word2Vec model
word2vec_model = gensim.models.Word2Vec.load("word2vec_model.model")

# Function to convert text to Word2Vec vector
def text_to_word2vec(text, model):
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)


In [75]:
# Prepare the data
X_text = df_effectiveness['discourse_text'].apply(lambda x: text_to_word2vec(x, word2vec_model))
X_type = df_effectiveness[['discourse_type']]
y = df_effectiveness['discourse_effectiveness']

# One-hot Encoding for discourse type
one_hot = OneHotEncoder()
X_type_encoded = one_hot.fit_transform(X_type).toarray()

# Combine Word2Vec features with one-hot encoded features
X_combined = np.hstack([np.array(X_text.tolist()), X_type_encoded])


In [76]:

# Label Encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [77]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)


In [78]:
# Train XGBoost Classifier
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.85      0.74      4246
           1       0.65      0.50      0.56      1825
           2       0.51      0.19      0.27      1282

    accuracy                           0.65      7353
   macro avg       0.60      0.51      0.53      7353
weighted avg       0.63      0.65      0.61      7353



In [80]:
# To compute log-loss, we need the probabilities of each class
y_pred_proba_xgb = xgb_classifier.predict_proba(X_test)

# Calculate and print log-loss
logloss = log_loss(y_test, y_pred_proba_xgb)
print("Log-loss: ", logloss)

Log-loss:  0.8042073505316307


# K fold CV

In [58]:
# Define the parameter grid
param_grid = {
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.5, 1.0]
}

# Create the XGBClassifier instance
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')  # Adjust eval_metric for your task

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the Grid Search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Best parameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation score:  0.646946858966901
              precision    recall  f1-score   support

           0       0.65      0.87      0.75      4246
           1       0.67      0.50      0.57      1825
           2       0.56      0.15      0.24      1282

    accuracy                           0.65      7353
   macro avg       0.63      0.51      0.52      7353
weighted avg       0.64      0.65      0.61      7353

