## Imports & Constants

In [1]:
from sklearn.linear_model import SGDClassifier

from dataset_types import ReviewDataSet
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer
from evaluation import BinaryClassifierEvaluator, ClassifierComparator, FeatureSetComparator

# Relative paths to the two review directories; both are handed to
# ReviewDataSet when the dataset is loaded.
POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to /home/sowell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sowell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sowell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sowell/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data

In [2]:
# Collect both review directories and load them through ReviewDataSet.
review_dirs = [POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]
dataset = ReviewDataSet(review_dirs).load()

In [3]:
def make_preprocessed_generator(reviews):
    """Return a FeatureSetGenerator over `reviews` with the shared clean-up applied.

    Every feature set in this cell starts from the same chain
    (stem -> remove punctuation -> remove stopwords); only the final
    n-gram step differs between them.
    """
    generator = FeatureSetGenerator(reviews)
    return generator.stem().remove_punctuation().remove_stopwords()


# Sets A and B are built identically (create_n_grams(1)) and only diverge in
# the normalisation applied below; set C uses create_everygrams(2) instead.
feature_set_a = make_preprocessed_generator(dataset).create_n_grams(1)
feature_set_b = make_preprocessed_generator(dataset).create_n_grams(1)
feature_set_c = make_preprocessed_generator(dataset).create_everygrams(2)

# A goes through perform_tf_norm; B and C go through perform_tf_idf_norm.
norm_feature_set_a = FeatureSetNormalizer(feature_set_a).perform_tf_norm()
norm_feature_set_b = FeatureSetNormalizer(feature_set_b).perform_tf_idf_norm()
norm_feature_set_c = FeatureSetNormalizer(feature_set_c).perform_tf_idf_norm()

# The un-normalised feature sets are no longer needed; drop them to free memory.
del feature_set_a, feature_set_b, feature_set_c

In [4]:
# All three normalised sets are split with the same arguments.
# NOTE(review): "polarity" presumably names the label column and 0.3 the
# held-out fraction shared by dev and test — confirm against FeatureSet's API.
split_args = ("polarity", 0.3)

(X_train_a, y_train_a,
 X_dev_a, y_dev_a,
 X_test_a, y_test_a) = norm_feature_set_a.split_into_train_dev_test_arrays(*split_args)

(X_train_b, y_train_b,
 X_dev_b, y_dev_b,
 X_test_b, y_test_b) = norm_feature_set_b.split_into_train_dev_test_arrays(*split_args)

(X_train_c, y_train_c,
 X_dev_c, y_dev_c,
 X_test_c, y_test_c) = norm_feature_set_c.split_into_train_dev_test_arrays(*split_args)

# Freeing some memory: only the split arrays are used from here on.
del norm_feature_set_a, norm_feature_set_b, norm_feature_set_c

## Logistic Regression

[Tuning Logistic Regression Hyperparameters](https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69)

In [5]:
# Baseline SGDClassifier settings for the logistic-regression runs:
# log loss plus a fixed seed so repeated runs give the same model.
lr_default_hyperparams = dict(loss="log_loss", random_state=42)

### Comparing performance across feature sets

In [6]:
# Compare the three feature sets using SGDClassifier with the
# logistic-regression defaults. The comparator is built from the training
# splits and compare() is given the matching dev splits (A, B, C order).
lr_train_features = [X_train_a, X_train_b, X_train_c]
lr_train_labels = [y_train_a, y_train_b, y_train_c]
lr_dev_features = [X_dev_a, X_dev_b, X_dev_c]
lr_dev_labels = [y_dev_a, y_dev_b, y_dev_c]

comparator = FeatureSetComparator(lr_train_features, lr_train_labels)
log_regression_performance = comparator.compare(
    SGDClassifier, lr_dev_features, lr_dev_labels, lr_default_hyperparams
)

log_regression_performance

Unnamed: 0,accuracy,precision,recall,f1
SGDClassifier - Set A,0.75,0.715517,0.83,0.768519
SGDClassifier - Set B,0.821667,0.88755,0.736667,0.8051
SGDClassifier - Set C,0.846667,0.798851,0.926667,0.858025


### Evaluating the model on the test split of the best feature set

In [7]:
# Train on the Set C training split, then score the model on the Set C test split.
classifier = SGDClassifier(**lr_default_hyperparams).fit(X_train_c, y_train_c)
test_predictions = classifier.predict(X_test_c)

evaluation = BinaryClassifierEvaluator(y_test_c, test_predictions)
test_performance = evaluation.get_summary().as_df()
# Label the single result row so it is identifiable when displayed.
test_performance.index = ["LogisticRegression - Test Set C"]

test_performance

Unnamed: 0,accuracy,precision,recall,f1
LogisticRegression - Test Set C,0.841667,0.797101,0.916667,0.852713


## Support Vector Machines

[Sklearn SVMs](https://scikit-learn.org/stable/modules/svm.html)
- When the number of features is much greater than the number of samples, the kernel function and regularisation term must be chosen carefully to avoid overfitting
    - Our feature sets are much larger than the number of samples we have (e.g., 40,000 features per sample, with 4,000 samples)

In [8]:
# Baseline SGDClassifier settings for the SVM runs:
# hinge loss plus a fixed seed so repeated runs give the same model.
svm_default_hyperparams = dict(loss="hinge", random_state=42)

### Comparing performance across feature sets

In [9]:
# Compare the three feature sets using SGDClassifier with the SVM (hinge
# loss) defaults. The comparator is built from the training splits and
# compare() is given the matching dev splits (A, B, C order).
svm_train_features = [X_train_a, X_train_b, X_train_c]
svm_train_labels = [y_train_a, y_train_b, y_train_c]
svm_dev_features = [X_dev_a, X_dev_b, X_dev_c]
svm_dev_labels = [y_dev_a, y_dev_b, y_dev_c]

comparator = FeatureSetComparator(svm_train_features, svm_train_labels)
svm_performance = comparator.compare(
    SGDClassifier, svm_dev_features, svm_dev_labels, svm_default_hyperparams
)

svm_performance

Unnamed: 0,accuracy,precision,recall,f1
SGDClassifier - Set A,0.745,0.688946,0.893333,0.777939
SGDClassifier - Set B,0.865,0.830816,0.916667,0.871632
SGDClassifier - Set C,0.86,0.823353,0.916667,0.867508


### Evaluating the model on the test split of the best feature set

In [11]:
# Train on the Set C training split, then score the model on the Set C test split.
# NOTE(review): in the dev comparison for this model, Set B scored marginally
# higher than Set C on accuracy and F1; Set C is what is evaluated here —
# confirm this choice is intentional.
classifier = SGDClassifier(**svm_default_hyperparams).fit(X_train_c, y_train_c)
test_predictions = classifier.predict(X_test_c)

evaluation = BinaryClassifierEvaluator(y_test_c, test_predictions)
test_performance = evaluation.get_summary().as_df()
# Label the single result row so it is identifiable when displayed.
test_performance.index = ["SVM - Test Set C"]

test_performance

Unnamed: 0,accuracy,precision,recall,f1
SVM - Test Set C,0.86,0.821429,0.92,0.867925


# Hyperparameter Optimisation

[GridSearchCV](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-plot-grid-search-text-feature-extraction-py

In [20]:
from sklearn.model_selection import RandomizedSearchCV

## Logistic Regression

In [21]:
# Search space for RandomizedSearchCV, written as a list of dicts (one dict is
# picked per sampled candidate) so that only combinations SGDClassifier
# accepts can be drawn:
#   * learning_rate="optimal" requires alpha > 0 and does not use eta0;
#   * learning_rate="invscaling" requires eta0 > 0, while alpha=0 is allowed.
# A single flat dict would also draw alpha=0 with "optimal" and eta0=0 with
# "invscaling"; those candidates raise at fit time and waste search iterations.
shared_search_space = {
    "penalty": ["l2", "l1", "elasticnet", None],
    "max_iter": [500, 1000, 1500],
    "early_stopping": [False, True],
    "n_iter_no_change": [10],
}

param_grid = [
    {
        **shared_search_space,
        "learning_rate": ["optimal"],
        "alpha": [1e-5, 1e-4, 1e-3],
    },
    {
        **shared_search_space,
        "learning_rate": ["invscaling"],
        "alpha": [0., 1e-5, 1e-4, 1e-3],
        "eta0": [1e-4, 1e-2],
    },
]

In [24]:
# Randomised hyperparameter search for the logistic-regression SGDClassifier:
# 20 candidates are sampled from `param_grid`, scored by F1 using the default
# cross-validation splitter, with 4 parallel workers. random_state fixes which
# candidates are sampled so the search is repeatable.
grid_search = RandomizedSearchCV(
    estimator=SGDClassifier(**lr_default_hyperparams),
    param_distributions=param_grid,
    scoring="f1",
    n_iter=20,
    n_jobs=4,
    verbose=1,  # the comma here was missing, which made the whole cell a SyntaxError
    random_state=42,
)

print("Performing grid search...")
print(f"Hyperparameters to be evaluated:\n{param_grid}")
# Runs on Set C's training split only.
grid_search.fit(X_train_c, y_train_c)

Performing grid search...
Hyperparameters to be evaluated:
{'penalty': ['l2', 'l1', 'elasticnet', None], 'alpha': [0.0, 1e-05, 0.0001, 0.001], 'max_iter': [500, 1000, 1500, 2000], 'early_stopping': [False, True], 'n_iter_no_change': [10]}
Fitting 5 folds for each of 20 candidates, totalling 100 fits


KeyboardInterrupt: 

### Evaluating the model on a test split using the best hyperparameters

## SVM

### Evaluating the model on a test split using the best hyperparameters