In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from regex import compile
from multiprocessing import Pool

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Convert text to numerical representation
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(train_df['text'])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, train_df['label'], test_size=0.2, random_state=42)

# Define models
lgbm_model = lgb.LGBMClassifier()
rf_model = RandomForestClassifier()
xgb_model = xgb.XGBClassifier()
catboost_model = CatBoostClassifier()

# Create ensemble model
ensemble_model = Pipeline([
    ('features', FeatureUnion([
        ('count_lgbm', CountVectorizer()),
        ('tfidf_lgbm', TfidfTransformer()),
        ('count_rf', CountVectorizer()),
        ('tfidf_rf', TfidfTransformer()),
        ('count_xgb', CountVectorizer()),
        ('tfidf_xgb', TfidfTransformer()),
        ('count_cat', CountVectorizer()),
        ('tfidf_cat', TfidfTransformer())
    ])),
    ('clf_lgbm', lgbm_model),
    ('clf_rf', rf_model),
    ('clf_xgb', xgb_model),
    ('clf_cat', catboost_model)
])

# Fit ensemble model on train data
ensemble_model.fit(X_train, y_train)

# Evaluate model on validation set
y_pred = ensemble_model.predict(X_val)
f1score = f1_score(y_val, y_pred, average='weighted')
print(f1score)

# Predict labels for test data
X_test_counts = count_vect.transform(test_df['text'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
y_test_pred = ensemble_model.predict(X_test_tfidf)

# Save predictions to submission.csv
submission_df = pd.DataFrame({'id': test_df['id'], 'label': y_test_pred})
submission_df.to_csv('submission.csv', index=False)


TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'LGBMClassifier()' (type <class 'lightgbm.sklearn.LGBMClassifier'>) doesn't

In [None]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
STOPWORDS