In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from mlxtend.classifier import EnsembleVoteClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# Read the dataset
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    #X_party1 = vectorizer.transform(df['first_party'])
    #X_party2 = vectorizer.transform(df['second_party'])

    #X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return X_facts

X_train = get_vector(vectorizer, train, True)
y_train = train["first_party_winner"]
X_test = get_vector(vectorizer, test, False)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [25]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [26]:
# Define the base models
base_models = [
    ('rf', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('lgbm', LGBMClassifier())
]

# Initialize the EnsembleVoteClassifier
ensemble = EnsembleVoteClassifier(clfs=[model for _, model in base_models], voting='soft')

# Train the base models
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"{name} Validation Accuracy: {accuracy}")

rf Validation Accuracy: 0.6270161290322581
lr Validation Accuracy: 0.5826612903225806
lgbm Validation Accuracy: 0.6169354838709677


In [27]:
import warnings
# Ignore warning messages
warnings.filterwarnings("ignore")

In [28]:
# Train the ensemble model
ensemble.fit(X_train, y_train)
y_pred_val_ensemble = ensemble.predict(X_val)
accuracy_ensemble = accuracy_score(y_val, y_pred_val_ensemble)
print(f"Ensemble Validation Accuracy: {accuracy_ensemble}")


Ensemble Validation Accuracy: 0.6129032258064516


In [29]:
# Predict on the test set using the ensemble model
y_pred_test = ensemble.predict(X_test)

# Prepare submission DataFrame
submission = pd.DataFrame({'ID': test['ID'], 'first_party_winner': y_pred_test})

# Save the submission file
submission.to_csv('ensemble_submission_3.csv', index=False)
print("Submission file saved.")


Submission file saved.


In [16]:
# Access the base models in the ensemble
for name, model in base_models:
    print(f"Base Model: {name}")
    print(f"Hyperparameters: {model.get_params()}")
    print("------------------------")


Base Model: rf
Hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
------------------------
Base Model: lr
Hyperparameters: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
------------------------
Base Model: lgbm
Hyperparameters: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight