## Imports & Constants

In [38]:
import pandas as pd

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV

from dataset_types import ReviewDataSet
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer
from evaluation import BinaryClassifierEvaluator, ClassifierComparator, FeatureSetComparator

# Relative paths to the two review directories; both are handed to
# ReviewDataSet when the dataset is loaded.
POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

## Data

In [39]:
# Build the review dataset from the positive and negative directories.
# ReviewDataSet.load() is project code (presumably reads the files from disk — confirm).
dataset = ReviewDataSet([POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR]).load()

In [40]:
# Three candidate feature sets, all stemmed with punctuation and stopwords
# removed. A and B go through the same create_n_grams(1) pipeline and differ
# only in the normalisation applied further down; C uses create_everygrams(2).
feature_set_a = (
    FeatureSetGenerator(dataset)
    .stem()
    .remove_punctuation()
    .remove_stopwords()
    .create_n_grams(1)
)

feature_set_b = (
    FeatureSetGenerator(dataset)
    .stem()
    .remove_punctuation()
    .remove_stopwords()
    .create_n_grams(1)
)

feature_set_c = (
    FeatureSetGenerator(dataset)
    .stem()
    .remove_punctuation()
    .remove_stopwords()
    .create_everygrams(2)
)

# Set A is TF-normalised; sets B and C are TF-IDF-normalised.
norm_feature_set_a = FeatureSetNormalizer(feature_set_a).perform_tf_norm()
norm_feature_set_b = FeatureSetNormalizer(feature_set_b).perform_tf_idf_norm()
norm_feature_set_c = FeatureSetNormalizer(feature_set_c).perform_tf_idf_norm()

# The un-normalised feature sets are not used again; drop the names to free memory.
del feature_set_a, feature_set_b, feature_set_c

In [41]:
# Split each normalised feature set into train / dev / test arrays.
# "polarity" and 0.3 are forwarded unchanged to the project helper
# (presumably the label column and the held-out fraction — confirm).
(
    X_train_a, y_train_a,
    X_dev_a, y_dev_a,
    X_test_a, y_test_a,
) = norm_feature_set_a.split_into_train_dev_test_arrays("polarity", 0.3)
(
    X_train_b, y_train_b,
    X_dev_b, y_dev_b,
    X_test_b, y_test_b,
) = norm_feature_set_b.split_into_train_dev_test_arrays("polarity", 0.3)
(
    X_train_c, y_train_c,
    X_dev_c, y_dev_c,
    X_test_c, y_test_c,
) = norm_feature_set_c.split_into_train_dev_test_arrays("polarity", 0.3)

# Only the split arrays are needed from here on; free the normalised sets.
del norm_feature_set_a, norm_feature_set_b, norm_feature_set_c

## Logistic Regression

[Tuning Logistic Regression Hyperparameters](https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69)

In [5]:
# Baseline settings for the logistic-regression runs: with loss="log_loss"
# SGDClassifier fits a logistic-regression model; the fixed seed keeps the
# SGD shuffling reproducible.
lr_default_hyperparams = dict(loss="log_loss", random_state=42)

### Comparing performance across feature sets

In [6]:
# The comparator receives the training features and labels of sets A, B and C
# as two parallel lists.
comparator = FeatureSetComparator(
    [X_train_a, X_train_b, X_train_c],
    [y_train_a, y_train_b, y_train_c]
)

# compare() is given the estimator class, the matching dev features/labels and
# the constructor kwargs; the saved output shows one metrics row per feature set.
log_regression_performance = comparator.compare(
    SGDClassifier,
    [X_dev_a, X_dev_b, X_dev_c],
    [y_dev_a, y_dev_b, y_dev_c],
    lr_default_hyperparams
)

# Bare last expression so the notebook renders the comparison table.
log_regression_performance

Unnamed: 0,accuracy,precision,recall,f1
SGDClassifier - Set A,0.75,0.715517,0.83,0.768519
SGDClassifier - Set B,0.821667,0.88755,0.736667,0.8051
SGDClassifier - Set C,0.846667,0.798851,0.926667,0.858025


### Evaluating the model on the test split of the best feature set

In [7]:
# Fit the baseline logistic-regression model on feature set C and score it
# on that set's held-out test split.
classifier = SGDClassifier(**lr_default_hyperparams).fit(X_train_c, y_train_c)

test_predictions = classifier.predict(X_test_c)
test_performance = (
    BinaryClassifierEvaluator(y_test_c, test_predictions)
    .get_summary()
    .as_df()
)

# Label the single result row so it is identifiable when displayed.
test_performance.index = ["LogisticRegression - Test Set C"]
test_performance

Unnamed: 0,accuracy,precision,recall,f1
LogisticRegression - Test Set C,0.841667,0.797101,0.916667,0.852713


## Support Vector Machines

[Sklearn SVMs](https://scikit-learn.org/stable/modules/svm.html)
- If the number of features is much greater than the number of samples, take care to avoid overfitting when choosing the kernel function and the regularisation term
    - Our feature sets have far more features than we have samples (e.g., 40,000 features per sample, with 4,000 samples)

In [6]:
# Baseline settings for the SVM runs: with loss="hinge" SGDClassifier fits a
# linear SVM; the fixed seed keeps the SGD shuffling reproducible.
svm_default_hyperparams = dict(loss="hinge", random_state=42)

### Comparing performance across feature sets

In [9]:
# The comparator receives the training features and labels of sets A, B and C
# as two parallel lists.
comparator = FeatureSetComparator(
    [X_train_a, X_train_b, X_train_c],
    [y_train_a, y_train_b, y_train_c]
)

# Same comparison as for logistic regression, but with the hinge-loss (SVM)
# constructor kwargs; evaluated against the dev splits.
svm_performance = comparator.compare(
    SGDClassifier,
    [X_dev_a, X_dev_b, X_dev_c],
    [y_dev_a, y_dev_b, y_dev_c],
    svm_default_hyperparams
)

# Bare last expression so the notebook renders the comparison table.
svm_performance

Unnamed: 0,accuracy,precision,recall,f1
SGDClassifier - Set A,0.745,0.688946,0.893333,0.777939
SGDClassifier - Set B,0.865,0.830816,0.916667,0.871632
SGDClassifier - Set C,0.86,0.823353,0.916667,0.867508


### Evaluating the model on the test split of the best feature set

In [43]:
# Fit the baseline SVM on feature set B and score it on that set's held-out
# test split.
classifier = SGDClassifier(**svm_default_hyperparams)
classifier.fit(X_train_b, y_train_b)

test_predictions = classifier.predict(X_test_b)
test_performance = BinaryClassifierEvaluator(y_test_b, test_predictions).get_summary().as_df()

# The model is trained and evaluated on feature set B, so the row label must
# say "B" (it previously read "Test Set C", mislabelling the result).
test_performance.index = ["SVM - Test Set B"]
test_performance

Unnamed: 0,accuracy,precision,recall,f1
SVM - Test Set C,0.845,0.848485,0.84,0.844221


# Hyperparameter Optimisation

[Tuning Hyperparameters - Sklearn](https://scikit-learn.org/stable/modules/grid_search.html#grid-search)  
- [Example tuning pipeline](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-plot-grid-search-text-feature-extraction-py)

In [27]:
# Search space shared by the logistic-regression and SVM tuning runs.
# Every value is a list of discrete candidates to sample from.
# Note: per the scikit-learn docs, eta0 is not used by the "optimal"
# learning-rate schedule, so it only matters for the "adaptive" candidates.
param_grid = dict(
    penalty=["l2", "l1", "elasticnet", None],
    alpha=[1e-5, 1e-4, 1e-3],
    max_iter=[750, 1000, 1250, 1500],
    learning_rate=["optimal", "adaptive"],
    eta0=[1e-3, 1e-2],
    early_stopping=[False, True],
    n_iter_no_change=[10],
)

## Logistic Regression

Performing hyperparameter tuning using feature set C.

In [28]:
# Randomised hyperparameter search for the logistic-regression SGD model:
# 15 candidates are sampled from `param_grid` and ranked by F1. `cv` is left at
# scikit-learn's default, and the fixed random_state makes the sampled
# candidates reproducible.
lr_random_search = RandomizedSearchCV(
    estimator=SGDClassifier(**lr_default_hyperparams),
    param_distributions=param_grid,
    scoring="f1",
    n_iter=15,
    n_jobs=8,
    verbose=1,
    random_state=42,
)

# This is a randomised search (n_iter samples), not an exhaustive grid search,
# so the progress message says so.
print("Performing randomized search...")
print(f"Hyperparameters to be evaluated:\n{param_grid}")
lr_random_search.fit(X_train_c, y_train_c)

Performing grid search...
Hyperparameters to be evaluated:
{'penalty': ['l2', 'l1', 'elasticnet', None], 'alpha': [1e-05, 0.0001, 0.001], 'max_iter': [750, 1000, 1250, 1500], 'learning_rate': ['optimal', 'adaptive'], 'eta0': [0.001, 0.01], 'early_stopping': [False, True], 'n_iter_no_change': [10]}
Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [29]:
# get_params() returns every constructor parameter of the refitted best
# estimator; only the ones that were part of the search space are printed.
print("Best parameters combination found:")
lr_best_parameters = lr_random_search.best_estimator_.get_params()
for param_name in sorted(param_grid):
    tuned_value = lr_best_parameters[param_name]
    print(f"{param_name}: {tuned_value}")

Best parameters combination found:
alpha: 0.0001
early_stopping: True
eta0: 0.001
learning_rate: optimal
max_iter: 750
n_iter_no_change: 10
penalty: None


In [30]:
# Per-candidate cross-validation results (fit/score times, per-split and mean
# test scores, rank) as a DataFrame for inspection.
lr_cv_results = pd.DataFrame(lr_random_search.cv_results_)
lr_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_n_iter_no_change,param_max_iter,param_learning_rate,param_eta0,param_early_stopping,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,58.726838,8.593135,0.344675,0.028491,l2,10,1500,optimal,0.001,False,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.669856,0.0,0.482385,0.041958,0.721354,0.383111,0.306479,13
1,431.463642,39.075963,0.278482,0.031253,elasticnet,10,1250,adaptive,0.01,True,0.0001,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.836625,0.84,0.841739,0.82821,0.836667,0.836648,0.004655,8
2,49.164221,13.361773,0.353077,0.018224,,10,750,optimal,0.01,True,0.001,"{'penalty': None, 'n_iter_no_change': 10, 'max...",0.328358,0.840764,0.843854,0.700665,0.719072,0.686543,0.188725,11
3,41.140368,3.317014,0.339886,0.019162,l2,10,1500,optimal,0.001,True,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.021201,0.0,0.472826,0.852792,0.0,0.269364,0.34307,14
4,204.800999,14.649867,0.323597,0.038082,l2,10,1250,adaptive,0.01,False,1e-05,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.844523,0.841155,0.849741,0.84492,0.823315,0.840731,0.009128,7
5,232.351914,21.213332,0.296727,0.01663,l2,10,1000,adaptive,0.01,True,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.842857,0.837638,0.852792,0.841155,0.834188,0.841726,0.006286,4
6,274.472172,12.788324,0.255004,0.039421,l1,10,750,optimal,0.01,False,1e-05,"{'penalty': 'l1', 'n_iter_no_change': 10, 'max...",0.84803,0.859107,0.865832,0.851789,0.782435,0.841438,0.03013,6
7,799.085549,73.635267,0.352981,0.053091,elasticnet,10,750,adaptive,0.01,True,1e-05,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.838129,0.842491,0.85567,0.840787,0.83391,0.842198,0.007332,2
8,188.285124,20.008912,0.265301,0.031584,,10,1000,adaptive,0.01,True,0.001,"{'penalty': None, 'n_iter_no_change': 10, 'max...",0.83964,0.84,0.854237,0.845324,0.829268,0.841694,0.008149,5
9,81.693666,19.97324,0.296929,0.022065,elasticnet,10,1500,optimal,0.01,True,0.0001,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.028169,0.804185,0.175896,0.412256,0.61165,0.406431,0.281633,12


### Evaluating the model on a test split using the best hyperparameters

In [42]:
# Refit logistic regression on feature set C using the full parameter dict
# taken from the best search estimator, then score it on the test split.
classifier = SGDClassifier(**lr_best_parameters).fit(X_train_c, y_train_c)

test_predictions = classifier.predict(X_test_c)
test_performance = (
    BinaryClassifierEvaluator(y_test_c, test_predictions)
    .get_summary()
    .as_df()
)

# Label the single result row so it is identifiable when displayed.
test_performance.index = ["LogisticRegression - Test Set C"]
test_performance

Unnamed: 0,accuracy,precision,recall,f1
LogisticRegression - Test Set C,0.825,0.81759,0.836667,0.827018


## SVM

Performing hyperparameter tuning using feature set B.

In [31]:
# Randomised hyperparameter search for the hinge-loss (SVM) SGD model:
# 15 candidates are sampled from `param_grid` and ranked by F1. `cv` is left at
# scikit-learn's default, and the fixed random_state makes the sampled
# candidates reproducible.
svm_random_search = RandomizedSearchCV(
    estimator=SGDClassifier(**svm_default_hyperparams),
    param_distributions=param_grid,
    scoring="f1",
    n_iter=15,
    n_jobs=8,
    verbose=1,
    random_state=42,
)

# This is a randomised search (n_iter samples), not an exhaustive grid search,
# so the progress message says so.
print("Performing randomized search...")
print(f"Hyperparameters to be evaluated:\n{param_grid}")
svm_random_search.fit(X_train_b, y_train_b)

Performing grid search...
Hyperparameters to be evaluated:
{'penalty': ['l2', 'l1', 'elasticnet', None], 'alpha': [1e-05, 0.0001, 0.001], 'max_iter': [750, 1000, 1250, 1500], 'learning_rate': ['optimal', 'adaptive'], 'eta0': [0.001, 0.01], 'early_stopping': [False, True], 'n_iter_no_change': [10]}
Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [32]:
# get_params() returns every constructor parameter of the refitted best
# estimator; only the ones that were part of the search space are printed.
print("Best parameters combination found:")
svm_best_parameters = svm_random_search.best_estimator_.get_params()
for param_name in sorted(param_grid):
    tuned_value = svm_best_parameters[param_name]
    print(f"{param_name}: {tuned_value}")

Best parameters combination found:
alpha: 1e-05
early_stopping: False
eta0: 0.01
learning_rate: adaptive
max_iter: 1250
n_iter_no_change: 10
penalty: l2


In [33]:
# Per-candidate cross-validation results (fit/score times, per-split and mean
# test scores, rank) as a DataFrame for inspection.
svm_cv_results = pd.DataFrame(svm_random_search.cv_results_)
svm_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_n_iter_no_change,param_max_iter,param_learning_rate,param_eta0,param_early_stopping,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.987504,0.287605,0.020133,0.003259,l2,10,1500,optimal,0.001,False,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.758333,0.797619,0.711563,0.803519,0.76386,0.766979,0.032953,13
1,14.297116,0.245966,0.019821,0.002847,elasticnet,10,1250,adaptive,0.01,True,0.0001,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.815385,0.818533,0.847584,0.818868,0.800745,0.820223,0.015206,10
2,3.018904,0.633154,0.019374,0.002328,,10,750,optimal,0.01,True,0.001,"{'penalty': None, 'n_iter_no_change': 10, 'max...",0.849408,0.845361,0.844371,0.80695,0.820976,0.833413,0.016575,4
3,2.100173,0.256342,0.017657,0.003562,l2,10,1500,optimal,0.001,True,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.686499,0.0,0.007117,0.007117,0.028169,0.145781,0.270524,15
4,45.898607,1.583842,0.017752,0.001213,l2,10,1250,adaptive,0.01,False,1e-05,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.864376,0.85192,0.858144,0.846018,0.84063,0.852217,0.008435,1
5,8.708376,0.199134,0.021268,0.002243,l2,10,1000,adaptive,0.01,True,0.001,"{'penalty': 'l2', 'n_iter_no_change': 10, 'max...",0.819923,0.824663,0.839187,0.823748,0.809701,0.823444,0.009494,6
6,7.286571,2.288648,0.021885,0.001448,l1,10,750,optimal,0.01,False,1e-05,"{'penalty': 'l1', 'n_iter_no_change': 10, 'max...",0.792899,0.852518,0.847636,0.840787,0.828423,0.832453,0.02137,5
7,14.343902,0.561908,0.018949,0.001974,elasticnet,10,750,adaptive,0.01,True,1e-05,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.819923,0.824663,0.840741,0.823748,0.807477,0.82331,0.010658,8
8,9.057599,0.184997,0.017138,0.002055,,10,1000,adaptive,0.01,True,0.001,"{'penalty': None, 'n_iter_no_change': 10, 'max...",0.819923,0.824663,0.839187,0.823748,0.809701,0.823444,0.009494,6
9,3.296074,0.417956,0.015997,0.002816,elasticnet,10,1500,optimal,0.01,True,0.0001,"{'penalty': 'elasticnet', 'n_iter_no_change': ...",0.823708,0.847291,0.807867,0.668213,0.83705,0.796826,0.065651,12


### Evaluating the model on a test split using the best hyperparameters

In [44]:
# Refit the SVM on feature set B using the full parameter dict taken from the
# best search estimator, then score it on that set's held-out test split.
classifier = SGDClassifier(**svm_best_parameters)
classifier.fit(X_train_b, y_train_b)

test_predictions = classifier.predict(X_test_b)
test_performance = BinaryClassifierEvaluator(y_test_b, test_predictions).get_summary().as_df()

# The model is trained and evaluated on feature set B, so the row label must
# say "B" (it previously read "Test Set C", mislabelling the result).
test_performance.index = ["SVM - Test Set B"]
test_performance

Unnamed: 0,accuracy,precision,recall,f1
SVM - Test Set C,0.843333,0.832258,0.86,0.845902


# Freeing memory

In [34]:
# Drop every train / dev / test array, one feature set per statement, so the
# kernel can reclaim the memory. Cells that use these arrays must run first.
del X_train_a, y_train_a, X_dev_a, y_dev_a, X_test_a, y_test_a

del X_train_b, y_train_b, X_dev_b, y_dev_b, X_test_b, y_test_b

del X_train_c, y_train_c, X_dev_c, y_dev_c, X_test_c, y_test_c