In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from utils import config, utils

In [2]:
# Load the two frames shared by every pipeline in this notebook.
# NOTE(review): drop_neutral=False presumably keeps neutral-labelled rows —
# confirm against utils.Utils.load_data.
df, df_cv = utils.Utils.load_data(
    drop_neutral=False,
)

## Random Forest

In [3]:
# Random-forest experiment 1.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words, lemmatize and
# remove_urls all disabled) -> TF-IDF over 1- to 3-grams.
nltk_ngram_rf_1 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=False, lemmatize=False, remove_urls=False)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        # Random forest, to match this section and the "rf" model name
        # (an SVC here would duplicate the Support Vector Machine experiments).
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE))
    ],
    model_name="nltk_n_gram_rf_1"
)

In [4]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_rf_1.fit(False)
# nltk_ngram_rf_1.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_rf_1.fit(True)
nltk_ngram_rf_1.evaluate(True)

In [6]:
# Random-forest experiment 2.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words=True,
# lemmatize=False, remove_urls=True) -> TF-IDF over 1- to 3-grams.
nltk_ngram_rf_2 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=True, lemmatize=False, remove_urls=True)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        # Random forest, to match this section and the "rf" model name
        # (an SVC here would duplicate the Support Vector Machine experiments).
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE))
    ],
    model_name="nltk_n_gram_rf_2"
)

In [7]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_rf_2.fit(False)
# nltk_ngram_rf_2.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_rf_2.fit(True)
nltk_ngram_rf_2.evaluate(True)

In [12]:
# Random-forest experiment 3.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words=False,
# lemmatize=True, remove_urls=True) -> TF-IDF over 1- to 3-grams.
nltk_ngram_rf_3 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=False, lemmatize=True, remove_urls=True)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        # Random forest, to match this section and the "rf" model name
        # (an SVC here would duplicate the Support Vector Machine experiments).
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE))
    ],
    model_name="nltk_n_gram_rf_3"
)

In [10]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_rf_3.fit(False)
# nltk_ngram_rf_3.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_rf_3.fit(True)
nltk_ngram_rf_3.evaluate(True)

## Support Vector Machine

In [None]:
# SVM experiment 1.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words, lemmatize and
# remove_urls all disabled) -> TF-IDF over 1- to 3-grams, feeding an SVC.
nltk_ngram_svm_1 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=False, lemmatize=False, remove_urls=False)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        ("classifier", SVC(random_state=config.RANDOM_STATE))
    ],
    # "svm" rather than "rf": the name must differ from random-forest experiment 1.
    # NOTE(review): presumably model_name labels or persists results, so a
    # duplicate name would collide — confirm in utils.CustomPipeline.
    model_name="nltk_n_gram_svm_1"
)

In [None]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_svm_1.fit(False)
# nltk_ngram_svm_1.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_svm_1.fit(True)
nltk_ngram_svm_1.evaluate(True)

In [None]:
# SVM experiment 2.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words=True,
# lemmatize=False, remove_urls=True) -> TF-IDF over 1- to 3-grams, feeding an SVC.
nltk_ngram_svm_2 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=True, lemmatize=False, remove_urls=True)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        ("classifier", SVC(random_state=config.RANDOM_STATE))
    ],
    # "svm" rather than "rf": the name must differ from random-forest experiment 2.
    # NOTE(review): presumably model_name labels or persists results, so a
    # duplicate name would collide — confirm in utils.CustomPipeline.
    model_name="nltk_n_gram_svm_2"
)

In [None]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_svm_2.fit(False)
# nltk_ngram_svm_2.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_svm_2.fit(True)
nltk_ngram_svm_2.evaluate(True)

In [None]:
# SVM experiment 3.
# Each column in config.FEATURES gets its own text sub-pipeline:
# compound-word splitting -> NLTK tokenizer (extra_stop_words=False,
# lemmatize=True, remove_urls=True) -> TF-IDF over 1- to 3-grams, feeding an SVC.
nltk_ngram_svm_3 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(
            transformers=[
                # One (name, sub-pipeline, column) entry per feature column.
                (f'nltk_{feature}', Pipeline([
                    ('compound_splitter', utils.CompoundWordSplitter()),
                    ('tokenizer', utils.NLTKTokenizer(extra_stop_words=False, lemmatize=True, remove_urls=True)),
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ]), feature) for feature in config.FEATURES]
        )),
        ("classifier", SVC(random_state=config.RANDOM_STATE))
    ],
    # "svm" rather than "rf": the name must differ from random-forest experiment 3.
    # NOTE(review): presumably model_name labels or persists results, so a
    # duplicate name would collide — confirm in utils.CustomPipeline.
    model_name="nltk_n_gram_svm_3"
)

In [None]:
# Disabled alternative run: the same fit/evaluate calls with the flag set to False.
# NOTE(review): the flag's meaning is defined in utils.CustomPipeline (not visible
# here) — confirm before re-enabling.
# nltk_ngram_svm_3.fit(False)
# nltk_ngram_svm_3.evaluate(False)

In [None]:
# Train, then evaluate, passing True to both calls.
# NOTE(review): presumably the flag switches on use of the df_cv data given to
# the constructor — confirm against utils.CustomPipeline.
nltk_ngram_svm_3.fit(True)
nltk_ngram_svm_3.evaluate(True)