## Imports & Constants

In [7]:
import pandas as pd

from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.linear_model import LogisticRegression

from dataset_types import ReviewDataSet
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer
from evaluation import BinaryClassifierEvaluator, ClassifiersComparator

# Relative paths to the two review directories; both are handed to
# ReviewDataSet when the data is loaded.
POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

## Data

In [3]:
# Load the reviews from both directories (positive listed first, then negative).
dataset = ReviewDataSet(
    [POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR],
).load()

In [4]:
# Build the feature set as a single parenthesised method chain: stopword
# removal, punctuation removal, lemmatization, then n-gram creation.
# NOTE(review): create_n_grams(1) presumably yields unigrams — confirm.
feature_set = (
    FeatureSetGenerator(dataset)
    .remove_stopwords()
    .remove_punctuation()
    .lemmatize()
    .create_n_grams(1)
)

In [5]:
# Wrap the generated feature set in a normalizer and apply its fast TF-IDF step.
# NOTE(review): whether perform_fast_tf_idf() mutates feature_set or returns a
# fresh object is not visible here — confirm before re-running cells out of order.
normalizer = FeatureSetNormalizer(feature_set)
normalized_feature_set = normalizer.perform_fast_tf_idf()

In [6]:
# Split the normalised features into train/dev/test parts using the
# "polarity" column as the label.
# NOTE(review): 0.3 is presumably the held-out fraction — confirm against
# split_into_train_dev_test_sets.
X_train, y_train, X_dev, y_dev, X_test, y_test = (
    normalized_feature_set.split_into_train_dev_test_sets("polarity", 0.3)
)

## Logistic Regression

[Tuning Logistic Regression Hyperparameters](https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69)

In [6]:
# Logistic regression with no penalty term, fitted on the training split.
# fit() returns the estimator itself, so constructing and fitting in two
# statements leaves log_regression bound to the same fitted model.
log_regression = LogisticRegression(penalty=None, random_state=42)
log_regression.fit(X_train, y_train)

# Predict on the dev split and show the evaluator's summary as the cell output.
lr_dev_set_predictions = log_regression.predict(X_dev)
BinaryClassifierEvaluator(y_dev, lr_dev_set_predictions).get_summary()


        Accuracy:  83.8333%
        Precision: 82.8479%
        Recall:    85.3333%
        F1:        84.0722%
        

## Support Vector Machines

[Sklearn SVMs](https://scikit-learn.org/stable/modules/svm.html)
- If the number of features is much greater than the number of samples, it is crucial to avoid overfitting when choosing the kernel function and the regularisation term
    - The number of features in our feature sets is much greater than the number of samples we have (e.g., 40,000 features per sample, with 4,000 samples)

In [8]:
# Compare the three scikit-learn SVM classifier classes with the project's
# comparator, passing the train split to the constructor and the dev split
# to compare().
# NOTE(review): each split is wrapped in a one-element list — presumably the
# comparator accepts several feature sets at once (cf. the "Set 0" suffix in
# the results table); confirm.
svm_classes = [SVC, NuSVC, LinearSVC]
svm_performance = ClassifiersComparator(
    [X_train], [y_train]
).compare(svm_classes, [X_dev], [y_dev])

In [10]:
# Order the comparison results by the "f1" column. sort_values sorts in
# ascending order by default, so the highest-F1 classifier is the last row.
svm_performance.sort_values(by="f1")

Unnamed: 0,accuracy,precision,recall,f1
SVC - Set 0,0.815,0.78209,0.873333,0.825197
NuSVC - Set 0,0.823333,0.797546,0.866667,0.830671
LinearSVC - Set 0,0.841667,0.833876,0.853333,0.843493


## Hyperparameter Optimisation