# Testing CoNLL-U datasets

In this file we are testing how much the behaviour of a model changes depending of the preprocessing done to our data, including:

- Removing stopwords
- Removing punctuation
- Removing rows that can be considered useless
- Etc

For all test we will use, for now, the same type of model, same split of values, and same hyperparameters.

In [2]:
import sys
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..', '..'))
sys.path.append(parent_dir)

from utils import extract_original_text, extract_preprocessed_text, load_conllu_data


In [3]:
output_folder = './../output'

classes_file = 'classes.txt'
preprocessed_file = 'preprocessed_dataset.conllu'

seed = 42

In [4]:
def run_naive_bayes(X_train, y_train, X_test, y_test):
    #4 Training the Naive Bayes classifier
    param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}  # Adjust the range as needed
    model = MultinomialNB()
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

    # Fit GridSearchCV on the training data
    grid_search.fit(X_train, y_train.values.ravel())

    # Get the best model from GridSearchCV
    best_model = grid_search.best_estimator_

    #5 Evaluate the model
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return y_pred

## Test - 1 

### Full preprocessing:
- Lemmatitation 
- Remmoving of unusable rows
- Removing of stopwords
- Removing of punctuation

In [5]:
vectorizer = TfidfVectorizer()
test_1_folder = '0_full_preprocessing'

# Load classes from file using pandas
class_1_df = pd.read_csv(
    f'{output_folder}/{test_1_folder}/{classes_file}', 
    sep='\t',
    header=None, 
    names=['doc_id', 'constructive'],
    skiprows=1
)

# Load preprocessed data 
conllu_data_1 = extract_preprocessed_text(f'{output_folder}/{test_1_folder}/{preprocessed_file}')


# Vectorize data
X_1 = vectorizer.fit_transform(conllu_data_1)
# Extract the classes
y_1 = class_1_df['constructive']

In [6]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

In [7]:
pred_test_1 = run_naive_bayes(X_train_1, y_train_1, X_test_1, y_test_1)

Accuracy: 0.677231025854879

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.47      0.57      1086
           1       0.66      0.85      0.74      1312

    accuracy                           0.68      2398
   macro avg       0.69      0.66      0.66      2398
weighted avg       0.69      0.68      0.66      2398



## Test - 2

### Keeping punctuation:
- Lemmatitation 
- Remmoving of unusable rows
- Removing of punctuation

In [11]:
vectorizer = TfidfVectorizer()
test_2_folder = '1_keep_punctuation'

# Load classes from file using pandas
class_2_df = pd.read_csv(
    f'{output_folder}/{test_2_folder}/{classes_file}', 
    sep='\t',
    header=None, 
    names=['doc_id', 'constructive'],
    skiprows=1
)

# Load preprocessed data 
conllu_data_2 = extract_preprocessed_text(f'{output_folder}/{test_2_folder}/{preprocessed_file}')


# Vectorize data
X_2 = vectorizer.fit_transform(conllu_data_2)
# Extract the classes
y_2 = class_2_df['constructive']

In [12]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

In [13]:
pred_test_2 = run_naive_bayes(X_train_2, y_train_2, X_test_2, y_test_2)

Accuracy: 0.676814011676397

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.47      0.57      1086
           1       0.66      0.85      0.74      1312

    accuracy                           0.68      2398
   macro avg       0.69      0.66      0.65      2398
weighted avg       0.69      0.68      0.66      2398



## Test - 3

### Keeping stopwords:
- Lemmatitation 
- Remmoving of unusable rows
- Removing of stopwords

In [14]:
vectorizer = TfidfVectorizer()
test_3_folder = '2_keep_stopwords'

# Load classes from file using pandas
class_3_df = pd.read_csv(
    f'{output_folder}/{test_3_folder}/{classes_file}', 
    sep='\t',
    header=None, 
    names=['doc_id', 'constructive'],
    skiprows=1
)

# Load preprocessed data 
conllu_data_3 = extract_preprocessed_text(f'{output_folder}/{test_3_folder}/{preprocessed_file}')


# Vectorize data
X_3 = vectorizer.fit_transform(conllu_data_3)
# Extract the classes
y_3 = class_3_df['constructive']

In [15]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

In [16]:
pred_test_3 = run_naive_bayes(X_train_3, y_train_3, X_test_3, y_test_3)

Accuracy: 0.6818181818181818

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.48      0.57      1086
           1       0.66      0.85      0.75      1312

    accuracy                           0.68      2398
   macro avg       0.70      0.66      0.66      2398
weighted avg       0.69      0.68      0.67      2398



## Test - 4

### Keeping punctuation & stopwords:
- Lemmatitation 
- Remmoving of unusable rows

In [48]:
vectorizer = TfidfVectorizer()
test_4_folder = '3_keep_punctuation_stopwords'

# Load classes from file using pandas
class_4_df = pd.read_csv(
    f'{output_folder}/{test_4_folder}/{classes_file}', 
    sep='\t',
    header=None, 
    names=['doc_id', 'constructive'],
    skiprows=1
)

# Load preprocessed data 
conllu_data_4 = extract_preprocessed_text(f'{output_folder}/{test_4_folder}/{preprocessed_file}')


# Vectorize data
X_4 = vectorizer.fit_transform(conllu_data_4)
# Extract the classes
y_4 = class_4_df['constructive']

In [49]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4, test_size=0.2, random_state=42)

In [50]:
pred_test_4 = run_naive_bayes(X_train_4, y_train_4, X_test_4, y_test_4)

Accuracy: 0.6814011676396997

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.48      0.57      1086
           1       0.66      0.85      0.75      1312

    accuracy                           0.68      2398
   macro avg       0.69      0.66      0.66      2398
weighted avg       0.69      0.68      0.67      2398



In [56]:
text_df = pd.DataFrame(conllu_data_1, columns=['text'])
test_df = pd.DataFrame(y_test_1)

# Merge the dataframes by their index
combined_results = pd.concat([text_df, test_df], axis=1, join='inner')

In [57]:
combined_results['test_1'] = pred_test_1
combined_results['test_2'] = pred_test_2
combined_results['test_3'] = pred_test_3
combined_results['test_4'] = pred_test_4

combined_results.shape # 2398 rows

combined_results.sort_index()

Unnamed: 0,text,constructive,test_1,test_2,test_3,test_4
0,and conservative strategy produce angry desper...,1,1,1,1,1
3,do need write essay prove point ? just look fo...,1,1,1,1,1
8,"==============================' ben carr , win...",1,0,0,0,0
10,i be kinda surprise conservative anger generat...,1,1,1,1,1
14,this birtherism ... problem maryam monsef utte...,1,1,1,1,1
...,...,...,...,...,...,...
11947,steven harper see problem here .,0,1,1,1,1
11951,the writer write like professional race compla...,0,0,0,0,0
11952,"tu thanh ha , i asian too . what chretien happ...",0,1,1,1,1
11979,"single marry nice someone companion , friend ,...",0,1,1,1,1


In [60]:
# Get the number of rows where all test results are the same
all_equal = combined_results[
    (combined_results['test_1'] == combined_results['test_2']) &
    (combined_results['test_1'] == combined_results['test_3']) & 
    (combined_results['test_1'] == combined_results['test_4'])
]

# Get the number of rows where all test results are not the same
all_different = combined_results[
    (combined_results['test_1'] != combined_results['test_2']) |
    (combined_results['test_1'] != combined_results['test_3']) | 
    (combined_results['test_1'] != combined_results['test_4'])
]

all_correct_test_1 = all_equal[all_equal['constructive'] == all_equal['test_1']]
all_correct_test_2 = all_equal[all_equal['constructive'] == all_equal['test_2']]
all_correct_test_3 = all_equal[all_equal['constructive'] == all_equal['test_3']]
all_correct_test_4 = all_equal[all_equal['constructive'] == all_equal['test_4']]

all_wrong_test_1 = all_equal[all_equal['constructive'] != all_equal['test_1']]
all_wrong_test_2 = all_equal[all_equal['constructive'] != all_equal['test_2']]
all_wrong_test_3 = all_equal[all_equal['constructive'] != all_equal['test_3']]
all_wrong_test_4 = all_equal[all_equal['constructive'] != all_equal['test_4']]

all_wrong = combined_results[
    (combined_results['constructive'] != combined_results['test_2']) &
    (combined_results['constructive'] != combined_results['test_3']) & 
    (combined_results['constructive'] != combined_results['test_4']) &
    (combined_results['constructive'] != combined_results['test_1'])
]

all_correct = combined_results[
    (combined_results['constructive'] == combined_results['test_2']) &
    (combined_results['constructive'] == combined_results['test_3']) & 
    (combined_results['constructive'] == combined_results['test_4']) &
    (combined_results['constructive'] == combined_results['test_1'])
]

In [67]:
# Get the most frequent words from all correct predictions with label 0 (non-constructive) and 1 (constructive)
non_constructive = all_correct[all_correct['constructive'] == 0]
constructive = all_correct[all_correct['constructive'] == 1]


constructive_text = constructive['text'].str.cat(sep=' ')
# Count the frequency of each word
constructive_text = constructive_text.split()
constructive_text = pd.Series(constructive_text)
# Remove punctuation 
constructive_text = constructive_text.str.replace('[^\w\s]', '')
constructive_text = constructive_text.value_counts()

constructive_text

.                  4757
,                  3990
'                  1168
i                   910
the                 557
                   ... 
smugly                1
parliamentarian       1
abrogate              1
reckone               1
accustom              1
Name: count, Length: 9009, dtype: int64