In [1]:
#cd notebooks/YahooEmbeddings

In [1]:
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from models import DistributedBagOfWords

In [2]:
# Setup the paths
cwd = Path.cwd()
data_path = cwd / 'data'
# Read the data
classes = pd.read_csv(data_path / 'classes.txt')
train = pd.read_csv(data_path / 'train.csv', header=None)
train.rename(columns={0: 'label', 1: 'question_title', 2: 'question_content', 3: 'best_answer'}, inplace=True)
train.fillna('', inplace=True)
test = pd.read_csv(data_path / 'test.csv', header=None)
test.fillna('', inplace=True)
test.rename(columns={0: 'label', 1: 'question_title', 2: 'question_content', 3: 'best_answer'}, inplace=True)
# Make the data into X and y
X_train = train.drop('label', axis=1)
y_train = train['label']
X_test = test.drop('label', axis=1)
y_test = test['label']


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

# Setup Dummy Pipeline
dummy_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)), 
                ('mlp', DummyClassifier(strategy="stratified"))])

# Fit the model
dummy_pipeline.fit(X_train, y_train)

y_pred = dummy_pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

KeyboardInterrupt: 

In [4]:
# Setup Dummy Pipeline
dummy_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)), 
                ('mlp', DummyClassifier(strategy="stratified"))])

# Fit the model
dummy_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = dummy_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500]))

              precision    recall  f1-score   support

           1       0.19      0.12      0.15        34
           2       0.15      0.26      0.19        58
           3       0.08      0.07      0.07        29
           4       0.04      0.08      0.05        37
           5       0.14      0.13      0.14        78
           6       0.11      0.19      0.14        36
           7       0.31      0.12      0.18       129
           8       0.08      0.12      0.10        34
           9       0.04      0.03      0.04        31
          10       0.09      0.06      0.07        34

    accuracy                           0.13       500
   macro avg       0.12      0.12      0.11       500
weighted avg       0.16      0.13      0.13       500



In [5]:
from sklearn.linear_model import LogisticRegression

# Setup logistic regression pipeline
logistic_regression_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)),
                ('mlp', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000))])

# Fit the model
logistic_regression_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = logistic_regression_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500]))


              precision    recall  f1-score   support

           1       0.60      0.26      0.37        34
           2       0.44      0.67      0.53        58
           3       0.48      0.34      0.40        29
           4       0.20      0.49      0.29        37
           5       0.77      0.74      0.76        78
           6       0.74      0.47      0.58        36
           7       0.47      0.36      0.41       129
           8       0.46      0.50      0.48        34
           9       0.57      0.68      0.62        31
          10       0.41      0.21      0.27        34

    accuracy                           0.48       500
   macro avg       0.51      0.47      0.47       500
weighted avg       0.52      0.48      0.49       500



In [8]:
# Setup logistic regression pipeline
logistic_regression_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)),
                ('mlp', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000))])

# Fit the model
logistic_regression_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = logistic_regression_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.34      0.81      0.48        58
           3       0.00      0.00      0.00        29
           4       0.17      0.51      0.25        37
           5       0.70      0.82      0.76        78
           6       0.67      0.50      0.57        36
           7       0.54      0.41      0.46       129
           8       0.67      0.53      0.59        34
           9       1.00      0.10      0.18        31
          10       0.00      0.00      0.00        34

    accuracy                           0.44       500
   macro avg       0.41      0.37      0.33       500
weighted avg       0.45      0.44      0.40       500



In [9]:
from sklearn.svm import SVC
# setup SVM pipeline
svm_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)),
                ('mlp', SVC(kernel='linear'))])

# Fit the model
svm_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = svm_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

# setup svm pipeline
svm_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)),
                ('mlp', SVC(kernel='linear'))])

# Fit the model
svm_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = svm_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.30      0.18      0.22        34
           2       0.39      0.71      0.50        58
           3       0.50      0.31      0.38        29
           4       0.18      0.41      0.25        37
           5       0.71      0.76      0.73        78
           6       0.74      0.39      0.51        36
           7       0.43      0.27      0.33       129
           8       0.55      0.50      0.52        34
           9       0.53      0.65      0.58        31
          10       0.32      0.21      0.25        34

    accuracy                           0.45       500
   macro avg       0.46      0.44      0.43       500
weighted avg       0.47      0.45      0.44       500

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.33      0.83      0.48        58
           3       0.00      0.00      0.00        29
           4       0.17 

In [10]:
from sklearn.svm import SVC
# setup SVM pipeline
svm_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)),
                ('mlp', SVC(kernel='rbf', gamma='scale'))])

# Fit the model
svm_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = svm_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

# setup svm pipeline
svm_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)),
                ('mlp', SVC(kernel='rbf', gamma='scale'))])

# Fit the model
svm_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = svm_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.37      0.45      0.41        58
           3       1.00      0.03      0.07        29
           4       0.09      0.78      0.16        37
           5       0.80      0.41      0.54        78
           6       0.71      0.14      0.23        36
           7       0.49      0.16      0.24       129
           8       1.00      0.06      0.11        34
           9       0.33      0.06      0.11        31
          10       0.50      0.03      0.06        34

    accuracy                           0.24       500
   macro avg       0.53      0.21      0.19       500
weighted avg       0.53      0.24      0.24       500

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.29      0.83      0.43        58
           3       0.88      0.24      0.38        29
           4       0.24 

In [12]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# setup Gaussian Process pipeline
kernel = 1.0 * RBF(1.0)
gaussian_process_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)),
                ('mlp', GaussianProcessClassifier(kernel))])

# Fit the model
gaussian_process_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = gaussian_process_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.50      0.26      0.35        34
           2       0.42      0.69      0.52        58
           3       0.48      0.45      0.46        29
           4       0.17      0.57      0.26        37
           5       0.66      0.81      0.72        78
           6       1.00      0.06      0.11        36
           7       0.49      0.31      0.38       129
           8       0.00      0.00      0.00        34
           9       0.59      0.61      0.60        31
          10       0.27      0.18      0.21        34

    accuracy                           0.43       500
   macro avg       0.46      0.39      0.36       500
weighted avg       0.48      0.43      0.40       500



In [13]:
# setup Gaussian Process pipeline
kernel = 1.0 * RBF(1.0)
gaussian_process_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)),
                ('mlp', GaussianProcessClassifier(kernel))])

# Fit the model
gaussian_process_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = gaussian_process_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        34
           2       0.41      0.71      0.52        58
           3       0.38      0.48      0.42        29
           4       0.24      0.41      0.30        37
           5       0.69      0.85      0.76        78
           6       0.62      0.64      0.63        36
           7       0.55      0.43      0.48       129
           8       0.56      0.59      0.57        34
           9       0.57      0.55      0.56        31
          10       0.00      0.00      0.00        34

    accuracy                           0.50       500
   macro avg       0.40      0.47      0.42       500
weighted avg       0.45      0.50      0.47       500



In [14]:
from sklearn.neural_network import MLPClassifier
# setup mlp pipeline
mlp_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=False)),
                ('mlp', MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, activation='relu', solver='adam', random_state=1337))])    

# Fit the model
mlp_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = mlp_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))

              precision    recall  f1-score   support

           1       0.45      0.26      0.33        34
           2       0.39      0.60      0.48        58
           3       0.56      0.34      0.43        29
           4       0.24      0.46      0.31        37
           5       0.70      0.79      0.74        78
           6       0.74      0.56      0.63        36
           7       0.52      0.37      0.43       129
           8       0.52      0.47      0.49        34
           9       0.42      0.58      0.49        31
          10       0.35      0.21      0.26        34

    accuracy                           0.48       500
   macro avg       0.49      0.47      0.46       500
weighted avg       0.51      0.48      0.48       500



In [15]:
# setup mlp pipeline
mlp_pipeline = Pipeline([('vectorizer', DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True, use_mean=True)),
                ('mlp', MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, activation='relu', solver='adam', random_state=1337))])

# Fit the model
mlp_pipeline.fit(X_train[0:500], y_train[0:500])

y_pred = mlp_pipeline.predict(X_test[0:500])

print(classification_report(y_test[0:500], y_pred[0:500], zero_division=0))


              precision    recall  f1-score   support

           1       0.32      0.21      0.25        34
           2       0.45      0.57      0.50        58
           3       0.43      0.55      0.48        29
           4       0.23      0.32      0.27        37
           5       0.72      0.79      0.76        78
           6       0.77      0.56      0.65        36
           7       0.53      0.37      0.44       129
           8       0.36      0.47      0.41        34
           9       0.40      0.52      0.45        31
          10       0.64      0.53      0.58        34

    accuracy                           0.50       500
   macro avg       0.49      0.49      0.48       500
weighted avg       0.51      0.50      0.50       500

