In [32]:
# Import necessary packages
import copy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn. compose import ColumnTransformer 
from sklearn.metrics import *
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier,GradientBoostingClassifier, StackingClassifier
import xgboost as xgb
import stacking as sk

RANDOM_STATE : int = 42
TARGET_NAME : str = "target"

### candidates models
| Vectorizer |stop_words | ngram_range  |  min_df | max_features  |  max_df | Model| penalty | C 
|---|---|---|---|---|---|---|---|---|
| CountVectorizer |english |  (1, 2) | 2  |  3500 | 0.5  |  RandomForestClassifier(max_depth=20, n_estimators=100)| NaN|NaN |
| TfidfVectorizer/CountVectorizer |english |  (1, 1) | 2  |  3500 | 0.7  | RandomForestClassifier(max_depth=20, n_estimators=50)| NaN|NaN |
| TfidfVectorizer |english |  (1, 1)|  1 |  2000 |  0.7 | LogisticRegression(C=0.1, solver='saga')|	L2|  0.1|
| TfidfVectorizer/CountVectorizer |english |  (1, 2)|  2 |  3000 |  0.7 | GradientBoostingClassifier(max_depth=20, n_estimators=250)|NaN|NaN |

In [36]:
# Setting up the CountVectorizer for Classifiers (Estimators)
text_preprocessing_a = Pipeline([('Vect', CountVectorizer(stop_words='english', max_features=3500, min_df=2, max_df=0.5, ngram_range=(1,2)))])
text_preprocessing_b = Pipeline([('Vect', CountVectorizer(stop_words='english',max_features=3500, min_df=2, max_df=0.7, ngram_range=(1,1)))])
text_preprocessing_c = Pipeline([('Vect', CountVectorizer(stop_words='english',max_features=2000, min_df=1, max_df=0.7, ngram_range=(1,1)))])
text_preprocessing_d = Pipeline([('Vect', CountVectorizer(stop_words='english',max_features=3000, min_df=2, max_df=0.7, ngram_range=(1,2)))])

# Setting up the TfidfVectorizer for Classifiers (Estimators)
text_preprocessing_2 = Pipeline([('Vect', TfidfVectorizer(stop_words='english',max_features=3500, min_df=2, max_df=0.7, ngram_range=(1,1)))])
text_preprocessing_3 = Pipeline([('Vect', TfidfVectorizer(stop_words='english',max_features=2000, min_df=1, max_df=0.7, ngram_range=(1,1)))])
text_preprocessing_4 = Pipeline([('Vect', TfidfVectorizer(stop_words='english',max_features=3000, min_df=2, max_df=0.7, ngram_range=(1,2)))])

def pre(proc):
    return ColumnTransformer(
                            [('text_preprocessing', proc, 'lem_text')]
                            )

# Moodels with CountVectorizer                      
Model_C1 = Pipeline([
    ('pre', pre( text_preprocessing_a )),
    ('classifier', RandomForestClassifier(max_depth=20, n_estimators=100))
])

Model_C2 = Pipeline([
    ('pre', pre( text_preprocessing_b )),
    ('classifier', RandomForestClassifier(max_depth=20, n_estimators=50))
])

Model_C3 = Pipeline([
    ('pre', pre( text_preprocessing_d )),
    ('classifier', GradientBoostingClassifier(max_depth=20, n_estimators=250))
])

# Moodels with TfidfVectorizer
Model_T1 = Pipeline([
    ('pre', pre( text_preprocessing_2 )),
    ('classifier', RandomForestClassifier(max_depth=20, n_estimators=50))
])

Model_T2 = Pipeline([
    ('pre', pre( text_preprocessing_3 )),
    ('classifier', LogisticRegression(C=0.1, solver='saga'))
])

Model_T3 = Pipeline([
    ('pre', pre( text_preprocessing_4 )),
    ('classifier', GradientBoostingClassifier(max_depth=20, n_estimators=250))
])

In [6]:
df = pd.read_csv('data_nlp.csv')
df = df[['lem_text','y']]
y = df['y']
X = df.drop('y', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Final Model 3: Stacking based on candidates models

![](output/Screenshot%202022-11-24%20at%201.25.04%20AM.png)

In [37]:
level_1_classifiers = dict()
level_1_classifiers["C1"] = Model_C1
level_1_classifiers["C2"] = Model_C2
level_1_classifiers["C3"] = Model_C3
level_1_classifiers["T1"] = Model_T1
level_1_classifiers["T2"] = Model_T2
level_1_classifiers["T3"] = Model_T3

# meta classifier
level_2_classifier = LogisticRegression(random_state=RANDOM_STATE)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
stacking_model = StackingClassifier(estimators=list(level_1_classifiers.items()), final_estimator=level_2_classifier, passthrough=True, cv=kfold, stack_method="predict_proba")

level_1_columns = [f"{name}_prediction" for name in level_1_classifiers.keys()]
pd.DataFrame(stacking_model.fit_transform(X_train, y_train), columns=level_1_columns + list(X_train.columns))


ValueError: could not convert string to float: 'hear forever stamp thinking secure investment utility appreciate backed reputable institution hit forever stamp figured degens would fun poking hole thesis forever stamp released may stamp forever stamp today may stamp year stamp appreciation mail letter mean come may attractive investment yield great return really resell marketplace blackmarket hot dog grandma interested dumping net worth forever stamp ampx bwhat thought'

In [None]:
y_test_pred0 = stacking_model.predict(X_test)
y_test_pred0

In [None]:

level_1 = sk.Level1Stacker(copy.deepcopy(level_1_classifiers), passthrough=True, save_x=True)
level_2 = sk.Level2Stacker(LogisticRegression(random_state=RANDOM_STATE))

final_stacking_model = Pipeline([
                            ('level_1', level_1), 
                            ('level_2', level_2) 
                            ])

final_stacking_model.fit(X_train, y_train)
#level_1.X

In [None]:
y_test_pred = final_stacking_model.predict(X_test)
y_test_pred

In [None]:
print(f"Accuracy of scikit-learn stacking classifier: {accuracy_score(y_test, y_test_pred)}")

for name, classifier in level_1_classifiers.items():
    classifier_ = copy.deepcopy(classifier)
    classifier_.fit(X_train, y_train)

    print(f"Accuracy of standalone {name} classifier: {accuracy_score(y_test, classifier_.predict(X_test))}")
