## Imports & Constants

In [1]:
from sklearn.linear_model import LogisticRegression

from dataset_types import ReviewDataSet
from evaluation import BinaryClassifierEvaluator
from feature_generation import FeatureSetGenerator
from feature_normalization import FeatureSetNormalizer

# Relative paths to the two review directories; both are handed to
# ReviewDataSet when the dataset is loaded.
# NOTE(review): presumably one sub-directory per polarity class — confirm
# against the on-disk layout.
POSITIVE_REVIEWS_DIR = "./data/pos/"
NEGATIVE_REVIEWS_DIR = "./data/neg/"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\wij21\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data

In [2]:
# Point ReviewDataSet at both review directories and load it; every later
# cell works from this `dataset` object.
dataset = ReviewDataSet(
    [POSITIVE_REVIEWS_DIR, NEGATIVE_REVIEWS_DIR],
).load()

In [3]:
# Preprocessing chain, applied in this order: stopword removal, punctuation
# removal, lemmatization, then n-gram creation with n=1
# (presumably unigrams — confirm against FeatureSetGenerator).
feature_set = (
    FeatureSetGenerator(dataset)
    .remove_stopwords()
    .remove_punctuation()
    .lemmatize()
    .create_n_grams(1)
)

In [4]:
# Wrap the generated feature set in a normalizer and run its
# `perform_fast_tf_idf` step; the result is what gets split into
# train/dev/test below.
# NOTE(review): presumably TF-IDF weighting, going by the method name —
# confirm in feature_normalization.
normalizer = FeatureSetNormalizer(feature_set)
normalized_feature_set = normalizer.perform_fast_tf_idf()

In [22]:
# Unpack the six arrays returned by the split helper: features and labels for
# the train, dev and test partitions, in that order.
# NOTE(review): "polarity" is presumably the label column and 0.3 a hold-out
# fraction — confirm against split_into_train_dev_test_sets.
(
    X_train, y_train,
    X_dev, y_dev,
    X_test, y_test,
) = normalized_feature_set.split_into_train_dev_test_sets("polarity", 0.3)

## Logistic Regression

[Tuning Logistic Regression Hyperparameters](https://medium.com/codex/do-i-need-to-tune-logistic-regression-hyperparameters-1cb2b81fca69)

In [35]:
# Baseline model: logistic regression with no regularization penalty,
# trained on the training partition only.
log_regression = LogisticRegression(penalty=None, random_state=42)
log_regression.fit(X_train, y_train)

# Predict on the dev partition and summarize the predictions against the
# dev labels; the summary call stays last so it remains the cell's result.
lr_dev_set_predictions = log_regression.predict(X_dev)

BinaryClassifierEvaluator(
    y_dev,
    lr_dev_set_predictions,
).get_summary()


        Accuracy:  83.8333%
        Precision: 82.8479%
        Recall:    85.3333%
        F1:        84.0722%
        

## Support Vector Machines