In [1]:
#cd notebooks/YahooEmbeddings

In [1]:
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from models import DistributedBagOfWords,MeanPooling,PCA_Projection
from sklearn.metrics import classification_report

In [2]:
# Locate the dataset directory relative to the current working directory
cwd = Path.cwd()
data_path = cwd / 'data'

# The CSV files ship without a header row: map column positions to descriptive names
column_names = {0: 'label', 1: 'question_title', 2: 'question_content', 3: 'best_answer'}

# Class names, one per row
classes = pd.read_csv(data_path / 'classes.txt', header=None)

# Train / test splits; missing text fields are replaced by empty strings
train = pd.read_csv(data_path / 'train.csv', header=None).rename(columns=column_names).fillna('')
test = pd.read_csv(data_path / 'test.csv', header=None).fillna('').rename(columns=column_names)

# Separate the text fields (X) from the target label (y)
X_train, y_train = train.drop(columns='label'), train['label']
X_test, y_test = test.drop(columns='label'), test['label']

print(classes)

                        0
0       Society & Culture
1   Science & Mathematics
2                  Health
3   Education & Reference
4    Computers & Internet
5                  Sports
6      Business & Finance
7   Entertainment & Music
8  Family & Relationships
9   Politics & Government


In [3]:
# Concatenate the data to do a summary table of median words per class and data field
# Stack train and test back together so the summary covers the whole corpus
X = pd.concat([X_train, X_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)
data = pd.concat([X, y], axis=1)

# Whitespace-token count of every text field, one Series per output column.
# NOTE: the columns are prefixed `avg_` but the aggregation below is the median.
word_counts = {
    'avg_title': data['question_title'].str.split().str.len(),
    'avg_content': data['question_content'].str.split().str.len(),
    'avg_answer': data['best_answer'].str.split().str.len(),
}

# Median word count per class label for each text field
summary_data = (
    data.assign(**word_counts)
    .groupby('label')
    .agg({column: 'median' for column in word_counts})
)
print(summary_data)

       avg_title  avg_content  avg_answer
label                                    
1           10.0         10.0        37.0
2            9.0          0.0        42.0
3           10.0          9.0        43.0
4            9.0          0.0        28.0
5           11.0          8.0        29.0
6           10.0          0.0        24.0
7           10.0          0.0        29.0
8            9.0          6.0        17.0
9           10.0         18.0        34.0
10          11.0          9.0        38.0


In [4]:
%%time
# Perform the preprocessing of the data once.
# Set PROCESS to True if processing need to be runa
PROCESS = False

if PROCESS:
    # Initialize the DBOW class and the MeanPool class that does preprocessing and vectorization
    DBOW = DistributedBagOfWords(lemmatize=True, lowercase=True, remove_stopwords=True)
    MeanPool = MeanPooling(lemmatize=True, lowercase=True, remove_stopwords=True)
    PCA = PCA_Projection(lemmatize=True, lowercase=True, remove_stopwords=True)

    # Run the DBOW on all the data and store it as numpy arrays
    sum_vectors_train = DBOW.transform(X_train)
    sum_vectors_test = DBOW.transform(X_test)
    np.save('sum_vectors_train.npy', sum_vectors_train)
    np.save('sum_vectors_test.npy', sum_vectors_test)

    # Run the MeanPooling on all data and store it as numpy arrays
    mean_vectors_train = MeanPool.transform(X_train)
    mean_vectors_test = MeanPool.transform(X_test)
    np.save('mean_vectors_train.npy', mean_vectors_train)
    np.save('mean_vectors_test.npy', mean_vectors_test)

    # Run the PCA_Projection on all the training data and store it as a numpy array
    pca_vectors_train = PCA.transform(X_train)
    pca_vectors_test = PCA.transform(X_test)
    np.save('pca_vectors_train.npy', pca_vectors_train)
    np.save('pca_vectors_test.npy', pca_vectors_test)
else:
    print('Loading the DBOW vectors')
    sum_vectors_train = np.load('sum_vectors_train.npy')
    sum_vectors_test = np.load('sum_vectors_test.npy')
    print('Loading the MeanPool vectors')
    mean_vectors_train = np.load('mean_vectors_train.npy')
    mean_vectors_test = np.load('mean_vectors_test.npy')
    print('Loading the PCA projected vectors')
    pca_vectors_train = np.load('pca_vectors_train.npy')
    pca_vectors_test = np.load('pca_vectors_test.npy')

Loading the DBOW vectors
Loading the MeanPool vectors


In [5]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters shared by every logistic regression baseline.
# NOTE(review): the recorded DBOW fit stopped at the iteration limit (see the
# warning in its output) - consider a larger max_iter or scaling the features.
lr_params = dict(max_iter=200, random_state=1337)

# One independent, identically configured classifier per embedding type
lr_DBOW = LogisticRegression(**lr_params)
lr_MeanPool = LogisticRegression(**lr_params)
lr_PCA = LogisticRegression(**lr_params)

CPU times: user 16.6 ms, sys: 1.52 ms, total: 18.1 ms
Wall time: 28 ms


In [7]:
%%time
# Fit the DBOW model
lr_DBOW.fit(sum_vectors, y_train)

CPU times: user 1h 9min 59s, sys: 18min 27s, total: 1h 28min 26s
Wall time: 4min 54s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
%%time
# Fit the MeanPool model
lr_MeanPool.fit(mean_vectors, y_train)

In [None]:
%%time
# Fit the PCA model
lr_PCA.fit(pca_vectors, y_train)

In [None]:
from sklearn.svm import SVC

# Settings shared by the models of each kernel family
linear_svm_params = dict(kernel='linear', random_state=1337)
rbf_svm_params = dict(kernel='rbf', gamma='scale', random_state=1337)

# Linear-kernel SVMs, one per embedding type
svml_DBOW = SVC(**linear_svm_params)
svml_MeanPool = SVC(**linear_svm_params)
svml_PCA = SVC(**linear_svm_params)

# Radial basis function (RBF) kernel SVMs, one per embedding type
svmrbf_DBOW = SVC(**rbf_svm_params)
svmrbf_MeanPool = SVC(**rbf_svm_params)
svmrbf_PCA = SVC(**rbf_svm_params)

In [None]:
%%time
# Fit the DBOW SVM model with linear kernel
svml_DBOW.fit(sum_vectors, y_train)

In [None]:
%%time
# Fit the DBOW SVM model with linear RBF kernel
svmrbf_DBOW.fit(sum_vectors, y_train)

In [None]:
%%time
# Fit the MeanPool SVM model with linear kernel
svml_MeanPool.fit(mean_vectors, y_train)

In [None]:
%%time
# Fit the MeanPool SVM model with RBF kernel
svmrbf_MeanPool.fit(mean_vectors, y_train)

In [None]:
%%time
# Fit the PCA SVM model with linear kernel
svml_PCA.fit(pca_vectors, y_train)

In [None]:
%%time
svmrbf_PCA.fit(pca_vectors, y_train)

In [None]:
from sklearn.neural_network import MLPClassifier

# Architecture and optimizer settings shared by all three MLPs:
# three hidden layers of 100 units, ReLU activations, Adam solver.
mlp_params = dict(
    hidden_layer_sizes=(100, 100, 100),
    max_iter=200,
    activation='relu',
    solver='adam',
    random_state=1337,
)

# One independent, identically configured network per embedding type
mlp_DBOW = MLPClassifier(**mlp_params)
mlp_MeanPool = MLPClassifier(**mlp_params)
mlp_PCA = MLPClassifier(**mlp_params)

In [None]:
%%time
# Fit the DBOW MLP model
mlp_DBOW.fit(sum_vectors, y_train)

In [None]:
%%time
# Fit the MeanPool MLP model
mlp_MeanPool.fit(mean_vectors, y_train)

In [None]:
%time
# Fit the PCA MLP model
mlp_PCA.fit(pca_vectors, y_train)