In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from utils import utils, config

In [2]:
# Load the two frames used throughout the notebook: `df` is the first positional
# argument of every CustomPipeline below and `df_cv` is passed as `df_cv=`.
# NOTE(review): drop_neutral=False presumably keeps neutral-labelled rows —
# confirm against utils.Utils.load_data.
(df, df_cv) = utils.Utils.load_data(
    drop_neutral=False,
)

## Random Forest

In [3]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 1: all three tokenizer flags are False.
nltk_w2v_rf_1_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=False,
                lemmatize=False,
                remove_urls=False,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Random forest classifier on top of the per-column branches; seeded from
# config.RANDOM_STATE.
nltk_w2v_rf_1 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_rf_1_text_branches)),
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_rf_1",
)

In [4]:
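# NOTE(review): disabled run with the flag set to False. The SVM cells further
# down execute this variant, so confirm whether it should be re-enabled or removed.
# nltk_w2v_rf_1.fit(False)
# nltk_w2v_rf_1.evaluate(False)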

In [None]:
# Fit nltk_w2v_rf_1, then evaluate it, passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_rf_1.fit(True)
nltk_w2v_rf_1.evaluate(True)

In [6]:
# Call dump() on nltk_w2v_rf_1 with config.MODEL_DIR — presumably this writes
# the fitted pipeline to that directory; confirm in utils.CustomPipeline.
# NOTE(review): this is the only model in the visible cells that is dumped;
# verify that leaving the other variants unsaved is intentional.
nltk_w2v_rf_1.dump(config.MODEL_DIR)

In [7]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 2: extra_stop_words and remove_urls are True, lemmatize is False.
nltk_w2v_rf_2_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=True,
                lemmatize=False,
                remove_urls=True,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Random forest classifier on top of the per-column branches; seeded from
# config.RANDOM_STATE.
nltk_w2v_rf_2 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_rf_2_text_branches)),
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_rf_2",
)

In [8]:
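# NOTE(review): disabled run with the flag set to False. The SVM cells further
# down execute this variant, so confirm whether it should be re-enabled or removed.
# nltk_w2v_rf_2.fit(False)
# nltk_w2v_rf_2.evaluate(False)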

In [None]:
# Fit nltk_w2v_rf_2, then evaluate it, passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_rf_2.fit(True)
nltk_w2v_rf_2.evaluate(True)

In [10]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 3: lemmatize and remove_urls are True, extra_stop_words is False.
nltk_w2v_rf_3_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=False,
                lemmatize=True,
                remove_urls=True,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Random forest classifier on top of the per-column branches; seeded from
# config.RANDOM_STATE.
nltk_w2v_rf_3 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_rf_3_text_branches)),
        ("classifier", RandomForestClassifier(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_rf_3",
)

In [11]:
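# NOTE(review): disabled run with the flag set to False. The SVM cells further
# down execute this variant, so confirm whether it should be re-enabled or removed.
# nltk_w2v_rf_3.fit(False)
# nltk_w2v_rf_3.evaluate(False)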

In [None]:
# Fit nltk_w2v_rf_3, then evaluate it, passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_rf_3.fit(True)
nltk_w2v_rf_3.evaluate(True)

## Support Vector Machine

In [None]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 1: all three tokenizer flags are False (same flags as nltk_w2v_rf_1).
nltk_w2v_svm_1_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=False,
                lemmatize=False,
                remove_urls=False,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Support vector classifier with otherwise default settings.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters when
# probability=True — confirm whether the seed is needed here.
nltk_w2v_svm_1 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_svm_1_text_branches)),
        ("classifier", SVC(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_svm_1",
)

In [None]:
# Fit nltk_w2v_svm_1, then evaluate it, passing False to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably False uses the main
# data rather than df_cv; confirm.
nltk_w2v_svm_1.fit(False)
nltk_w2v_svm_1.evaluate(False)

In [None]:
# Fit nltk_w2v_svm_1 again, then evaluate it, this time passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_svm_1.fit(True)
nltk_w2v_svm_1.evaluate(True)

In [None]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 2: extra_stop_words and remove_urls are True, lemmatize is False
# (same flags as nltk_w2v_rf_2).
nltk_w2v_svm_2_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=True,
                lemmatize=False,
                remove_urls=True,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Support vector classifier with otherwise default settings.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters when
# probability=True — confirm whether the seed is needed here.
nltk_w2v_svm_2 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_svm_2_text_branches)),
        ("classifier", SVC(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_svm_2",
)

In [None]:
# Fit nltk_w2v_svm_2, then evaluate it, passing False to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably False uses the main
# data rather than df_cv; confirm.
nltk_w2v_svm_2.fit(False)
nltk_w2v_svm_2.evaluate(False)

In [None]:
# Fit nltk_w2v_svm_2 again, then evaluate it, this time passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_svm_2.fit(True)
nltk_w2v_svm_2.evaluate(True)

In [None]:
# One ColumnTransformer branch per entry in config.FEATURES. Each branch is
# named 'nltk_<feature>', selects that column, and chains the compound
# splitter, the NLTK tokenizer and the Word2Vec transformer from utils.
# Variant 3: lemmatize and remove_urls are True, extra_stop_words is False
# (same flags as nltk_w2v_rf_3).
nltk_w2v_svm_3_text_branches = [
    (
        f'nltk_{feature}',
        Pipeline([
            ('compound_splitter', utils.CompoundWordSplitter()),
            ('tokenizer', utils.NLTKTokenizer(
                extra_stop_words=False,
                lemmatize=True,
                remove_urls=True,
            )),
            ('word2vec', utils.Word2VecTransformer()),
        ]),
        feature,
    )
    for feature in config.FEATURES
]

# Support vector classifier with otherwise default settings.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters when
# probability=True — confirm whether the seed is needed here.
nltk_w2v_svm_3 = utils.CustomPipeline(
    df,
    config.FEATURES,
    config.TARGET,
    df_cv=df_cv,
    steps=[
        ("preprocessor", ColumnTransformer(transformers=nltk_w2v_svm_3_text_branches)),
        ("classifier", SVC(random_state=config.RANDOM_STATE)),
    ],
    model_name="nltk_w2v_svm_3",
)

In [None]:
# Fit nltk_w2v_svm_3, then evaluate it, passing False to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably False uses the main
# data rather than df_cv; confirm.
nltk_w2v_svm_3.fit(False)
nltk_w2v_svm_3.evaluate(False)

In [None]:
# Fit nltk_w2v_svm_3 again, then evaluate it, this time passing True to both calls.
# NOTE(review): the meaning of the positional flag is defined in
# utils.CustomPipeline (not visible here) — presumably it selects the df_cv
# data; confirm.
nltk_w2v_svm_3.fit(True)
nltk_w2v_svm_3.evaluate(True)