In [1]:
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from models import DistributedBagOfWords,MeanPooling,PCA_Projection
from sklearn.metrics import classification_report
from utils import preprocess

In [2]:
# Resolve the data directory relative to where the notebook is run
cwd = Path.cwd()
data_path = cwd / 'data'

# Class names, one per row (the file has no header line)
classes = pd.read_csv(data_path / 'classes.txt', header=None)

# Both splits are header-less CSVs: name the four columns and replace
# missing values with empty strings
column_names = {0: 'label', 1: 'question_title', 2: 'question_content', 3: 'best_answer'}
train = pd.read_csv(data_path / 'train.csv', header=None).rename(columns=column_names).fillna('')
test = pd.read_csv(data_path / 'test.csv', header=None).rename(columns=column_names).fillna('')

# Separate the text fields (X) from the class label (y)
X_train, y_train = train.drop(columns='label'), train['label']
X_test, y_test = test.drop(columns='label'), test['label']

print(classes)

                        0
0       Society & Culture
1   Science & Mathematics
2                  Health
3   Education & Reference
4    Computers & Internet
5                  Sports
6      Business & Finance
7   Entertainment & Music
8  Family & Relationships
9   Politics & Government


In [3]:
# Stack the train and test splits so the summary covers the whole dataset
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)
data = pd.concat([X, y], axis=1)

# Whitespace-separated word count of each text field, per row.
# The columns keep the avg_ prefix, but the statistic reported below is the median.
word_counts = {
    'avg_title': data['question_title'].str.split().str.len(),
    'avg_content': data['question_content'].str.split().str.len(),
    'avg_answer': data['best_answer'].str.split().str.len(),
}

# Median word count per class label for every field
summary_data = (
    data.assign(**word_counts)
    .groupby('label')
    .agg({column: 'median' for column in word_counts})
)
print(summary_data)

       avg_title  avg_content  avg_answer
label                                    
1           10.0         10.0        37.0
2            9.0          0.0        42.0
3           10.0          9.0        43.0
4            9.0          0.0        28.0
5           11.0          8.0        29.0
6           10.0          0.0        24.0
7           10.0          0.0        29.0
8            9.0          6.0        17.0
9           10.0         18.0        34.0
10          11.0          9.0        38.0


In [3]:
%%time
# Perform the preprocessing of the data once.
# Set load to False if processing need to be run
sum_vectors_train, sum_vectors_test, mean_vectors_train, mean_vectors_test, pca_vectors_train, pca_vectors_test = preprocess(load=True)

Loading the DBOW vectors
Loading the MeanPool vectors
Loading the PCA projected vectors
CPU times: user 0 ns, sys: 2.69 s, total: 2.69 s
Wall time: 6.01 s


In [6]:
from sklearn.linear_model import LogisticRegression

# One independent logistic-regression classifier per feature set, all built
# with the same settings (200 iterations at most, fixed random seed)
lr_DBOW, lr_MeanPool, lr_PCA = (
    LogisticRegression(max_iter=200, random_state=1337) for _ in range(3)
)

In [6]:
%%time
# Fit the DBOW logistic regression (lr_DBOW) on sum_vectors_train / y_train.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# without converging (see the solver warning in the saved output).
lr_DBOW.fit(sum_vectors_train, y_train)

CPU times: user 2h 19min 12s, sys: 36min 58s, total: 2h 56min 11s
Wall time: 9min 41s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Predict on the test data and print per-class precision / recall / F1.
# zero_division belongs to classification_report, not to print(): print()
# raises TypeError for keyword arguments it does not know.
y_pred = lr_DBOW.predict(sum_vectors_test)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           1       0.61      0.54      0.57      6000
           2       0.69      0.73      0.71      6000
           3       0.74      0.78      0.76      6000
           4       0.54      0.49      0.51      6000
           5       0.81      0.84      0.83      6000
           6       0.84      0.85      0.84      6000
           7       0.58      0.50      0.54      6000
           8       0.64      0.68      0.66      6000
           9       0.68      0.77      0.72      6000
          10       0.75      0.74      0.74      6000

    accuracy                           0.69     60000
   macro avg       0.69      0.69      0.69     60000
weighted avg       0.69      0.69      0.69     60000



In [5]:
%%time
# Fit the MeanPool logistic regression (lr_MeanPool) on mean_vectors_train / y_train.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# without converging (see the solver warning in the saved output).
lr_MeanPool.fit(mean_vectors_train, y_train)

CPU times: user 2h 18min 30s, sys: 39min 32s, total: 2h 58min 3s
Wall time: 9min 49s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Score the MeanPool logistic regression on the test vectors and print the
# per-class report
y_pred = lr_MeanPool.predict(mean_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.59      0.55      0.57      6000
           2       0.70      0.73      0.71      6000
           3       0.74      0.78      0.76      6000
           4       0.55      0.49      0.52      6000
           5       0.81      0.85      0.83      6000
           6       0.83      0.83      0.83      6000
           7       0.56      0.50      0.53      6000
           8       0.66      0.66      0.66      6000
           9       0.67      0.76      0.71      6000
          10       0.73      0.73      0.73      6000

    accuracy                           0.69     60000
   macro avg       0.68      0.69      0.69     60000
weighted avg       0.68      0.69      0.69     60000



In [7]:
%%time
# Fit the PCA logistic regression (lr_PCA) on pca_vectors_train / y_train.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# without converging (see the solver warning in the saved output).
lr_PCA.fit(pca_vectors_train, y_train)

CPU times: user 2h 16min 6s, sys: 36min 12s, total: 2h 52min 19s
Wall time: 9min 27s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Score the PCA logistic regression on the test vectors and print the
# per-class report
y_pred = lr_PCA.predict(pca_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.50      0.43      0.47      6000
           2       0.55      0.57      0.56      6000
           3       0.62      0.63      0.62      6000
           4       0.42      0.38      0.39      6000
           5       0.71      0.68      0.70      6000
           6       0.53      0.59      0.56      6000
           7       0.51      0.40      0.45      6000
           8       0.37      0.43      0.40      6000
           9       0.51      0.62      0.56      6000
          10       0.58      0.54      0.56      6000

    accuracy                           0.53     60000
   macro avg       0.53      0.53      0.53     60000
weighted avg       0.53      0.53      0.53     60000



In [9]:
from sklearn.svm import SVC

# Linear-kernel SVMs, one per feature set; every model shares the same
# iteration cap (max_iter=500) and random seed
svml_DBOW, svml_MeanPool, svml_PCA = (
    SVC(kernel='linear', max_iter=500, random_state=1337) for _ in range(3)
)

# RBF-kernel SVMs (gamma='scale'), one per feature set, same cap and seed
svmrbf_DBOW, svmrbf_MeanPool, svmrbf_PCA = (
    SVC(kernel='rbf', gamma='scale', max_iter=500, random_state=1337) for _ in range(3)
)

In [6]:
%%time
# Fit the linear-kernel SVM (svml_DBOW) on sum_vectors_train / y_train.
# %%time reports how long the whole cell takes.
svml_DBOW.fit(sum_vectors_train, y_train)

CPU times: user 40min 49s, sys: 4.35 s, total: 40min 53s
Wall time: 40min 56s




In [7]:
# Predict on the test data and print per-class precision / recall / F1.
# zero_division=1 matches every other report in this notebook, so a class
# that is never predicted is scored the same way for all models.
y_pred = svml_DBOW.predict(sum_vectors_test)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           1       0.46      0.01      0.03      6000
           2       0.62      0.01      0.03      6000
           3       0.56      0.03      0.05      6000
           4       0.18      0.00      0.01      6000
           5       0.50      0.01      0.02      6000
           6       0.75      0.01      0.02      6000
           7       0.15      0.00      0.01      6000
           8       0.28      0.02      0.03      6000
           9       0.27      0.04      0.07      6000
          10       0.10      0.99      0.19      6000

    accuracy                           0.11     60000
   macro avg       0.39      0.11      0.04     60000
weighted avg       0.39      0.11      0.04     60000



In [8]:
%%time
# Fit the DBOW SVM model with RBF kernel (svmrbf_DBOW) on sum_vectors_train / y_train
svmrbf_DBOW.fit(sum_vectors_train, y_train)

CPU times: user 1h 16min 17s, sys: 2.04 s, total: 1h 16min 19s
Wall time: 1h 16min 19s




In [10]:
# Score the RBF-kernel DBOW SVM on the test vectors and print the
# per-class report
y_pred = svmrbf_DBOW.predict(sum_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.12      0.37      0.18      6000
           2       0.24      0.07      0.11      6000
           3       0.56      0.02      0.04      6000
           4       1.00      0.00      0.00      6000
           5       1.00      0.00      0.00      6000
           6       0.56      0.07      0.12      6000
           7       1.00      0.00      0.00      6000
           8       0.39      0.02      0.04      6000
           9       0.32      0.02      0.04      6000
          10       0.10      0.61      0.17      6000

    accuracy                           0.12     60000
   macro avg       0.53      0.12      0.07     60000
weighted avg       0.53      0.12      0.07     60000



In [5]:
%%time
# Fit the linear-kernel SVM (svml_MeanPool) on mean_vectors_train / y_train.
# %%time reports how long the whole cell takes.
svml_MeanPool.fit(mean_vectors_train, y_train)

CPU times: user 1h 15min 32s, sys: 6.76 s, total: 1h 15min 39s
Wall time: 1h 15min 41s




In [9]:
# Score the linear-kernel MeanPool SVM on the test vectors and print the
# per-class report
y_pred = svml_MeanPool.predict(mean_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.51      0.03      0.06      6000
           2       0.65      0.15      0.24      6000
           3       0.74      0.18      0.29      6000
           4       0.32      0.15      0.20      6000
           5       0.89      0.27      0.42      6000
           6       0.87      0.22      0.35      6000
           7       0.18      0.07      0.10      6000
           8       0.24      0.49      0.33      6000
           9       0.26      0.61      0.36      6000
          10       0.22      0.79      0.34      6000

    accuracy                           0.30     60000
   macro avg       0.49      0.30      0.27     60000
weighted avg       0.49      0.30      0.27     60000



In [7]:
%%time
# Fit the RBF-kernel SVM (svmrbf_MeanPool) on mean_vectors_train / y_train.
# %%time reports how long the whole cell takes.
svmrbf_MeanPool.fit(mean_vectors_train, y_train)

CPU times: user 1h 17min 59s, sys: 5.16 s, total: 1h 18min 4s
Wall time: 1h 18min 4s




In [10]:
# Score the RBF-kernel MeanPool SVM on the test vectors and print the
# per-class report
y_pred = svmrbf_MeanPool.predict(mean_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.13      0.27      0.17      6000
           2       0.56      0.16      0.25      6000
           3       0.60      0.28      0.38      6000
           4       0.21      0.17      0.19      6000
           5       0.58      0.42      0.49      6000
           6       0.82      0.27      0.41      6000
           7       0.34      0.05      0.08      6000
           8       0.23      0.42      0.30      6000
           9       0.42      0.36      0.39      6000
          10       0.25      0.61      0.35      6000

    accuracy                           0.30     60000
   macro avg       0.41      0.30      0.30     60000
weighted avg       0.41      0.30      0.30     60000



In [10]:
%%time
# Fit the linear-kernel SVM (svml_PCA) on pca_vectors_train / y_train.
# %%time reports how long the whole cell takes.
svml_PCA.fit(pca_vectors_train, y_train)

CPU times: user 45min 24s, sys: 5.85 s, total: 45min 30s
Wall time: 45min 30s




In [11]:
# Score the linear-kernel PCA SVM on the test vectors and print the
# per-class report
y_pred = svml_PCA.predict(pca_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.56      0.01      0.01      6000
           2       0.20      0.01      0.01      6000
           3       0.34      0.18      0.24      6000
           4       0.17      0.00      0.00      6000
           5       0.35      0.06      0.10      6000
           6       0.65      0.01      0.01      6000
           7       0.11      0.04      0.06      6000
           8       0.16      0.04      0.06      6000
           9       0.18      0.44      0.26      6000
          10       0.12      0.73      0.20      6000

    accuracy                           0.15     60000
   macro avg       0.28      0.15      0.09     60000
weighted avg       0.28      0.15      0.09     60000



In [12]:
%%time
# Fit the PCA SVM model with RBF kernel (svmrbf_PCA) on pca_vectors_train / y_train
svmrbf_PCA.fit(pca_vectors_train, y_train)

CPU times: user 1h 16min 51s, sys: 4.07 s, total: 1h 16min 55s
Wall time: 1h 16min 55s




In [13]:
# Score the RBF-kernel PCA SVM on the test vectors and print the
# per-class report
y_pred = svmrbf_PCA.predict(pca_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.12      0.47      0.19      6000
           2       0.32      0.08      0.13      6000
           3       0.32      0.11      0.16      6000
           4       0.17      0.03      0.06      6000
           5       0.41      0.12      0.18      6000
           6       0.35      0.08      0.13      6000
           7       0.14      0.04      0.06      6000
           8       0.24      0.08      0.12      6000
           9       0.18      0.16      0.17      6000
          10       0.12      0.39      0.18      6000

    accuracy                           0.16     60000
   macro avg       0.24      0.16      0.14     60000
weighted avg       0.24      0.16      0.14     60000



In [14]:
from sklearn.neural_network import MLPClassifier

# Settings shared by all three MLPs: three hidden layers of 100 units,
# ReLU activation, Adam solver, at most 200 iterations, fixed random seed
mlp_params = dict(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=200,
    activation='relu',
    solver='adam',
    random_state=1337,
)

# One independent classifier per feature set
mlp_DBOW = MLPClassifier(**mlp_params)
mlp_MeanPool = MLPClassifier(**mlp_params)
mlp_PCA = MLPClassifier(**mlp_params)

In [12]:
%%time
# Fit the DBOW MLP model (mlp_DBOW) on sum_vectors_train / y_train.
# %%time reports how long the whole cell takes.
mlp_DBOW.fit(sum_vectors_train, y_train)

CPU times: user 6h 55min 57s, sys: 14h 17min 1s, total: 21h 12min 58s
Wall time: 1h 7min 9s




In [13]:
# Score the DBOW MLP on the test vectors and print the per-class report
y_pred = mlp_DBOW.predict(sum_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.64      0.56      0.59      6000
           2       0.72      0.77      0.74      6000
           3       0.76      0.80      0.78      6000
           4       0.58      0.51      0.54      6000
           5       0.83      0.86      0.84      6000
           6       0.86      0.87      0.86      6000
           7       0.62      0.50      0.55      6000
           8       0.69      0.70      0.69      6000
           9       0.69      0.79      0.74      6000
          10       0.73      0.80      0.76      6000

    accuracy                           0.72     60000
   macro avg       0.71      0.72      0.71     60000
weighted avg       0.71      0.72      0.71     60000



In [6]:
%%time
# Fit the MeanPool MLP model (mlp_MeanPool) on mean_vectors_train / y_train.
# %%time reports how long the whole cell takes.
mlp_MeanPool.fit(mean_vectors_train, y_train)

CPU times: user 9h 18min 5s, sys: 22h 35s, total: 1d 7h 18min 41s
Wall time: 1h 39min 7s




In [7]:
# Score the MeanPool MLP on the test vectors and print the per-class report
y_pred = mlp_MeanPool.predict(mean_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.63      0.55      0.59      6000
           2       0.70      0.76      0.73      6000
           3       0.75      0.79      0.77      6000
           4       0.57      0.51      0.53      6000
           5       0.81      0.86      0.84      6000
           6       0.87      0.85      0.86      6000
           7       0.62      0.48      0.54      6000
           8       0.68      0.69      0.69      6000
           9       0.67      0.79      0.73      6000
          10       0.73      0.79      0.75      6000

    accuracy                           0.71     60000
   macro avg       0.70      0.71      0.70     60000
weighted avg       0.70      0.71      0.70     60000



In [15]:
%time
# Fit the PCA MLP model
mlp_PCA.fit(pca_vectors_train, y_train)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs




In [16]:
# Score the PCA MLP on the test vectors and print the per-class report
y_pred = mlp_PCA.predict(pca_vectors_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=1,
    )
)

              precision    recall  f1-score   support

           1       0.60      0.52      0.56      6000
           2       0.70      0.72      0.71      6000
           3       0.73      0.78      0.76      6000
           4       0.54      0.48      0.51      6000
           5       0.81      0.84      0.82      6000
           6       0.78      0.82      0.80      6000
           7       0.59      0.47      0.52      6000
           8       0.62      0.60      0.61      6000
           9       0.66      0.77      0.71      6000
          10       0.69      0.76      0.72      6000

    accuracy                           0.68     60000
   macro avg       0.67      0.68      0.67     60000
weighted avg       0.67      0.68      0.67     60000

