In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from mlxtend.classifier import EnsembleVoteClassifier
from lightgbm import LGBMClassifier

# Load the train and test splits from the local dataset directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

# Single TF-IDF vectorizer instance shared by every text column.
vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    """Build one dense feature matrix from the text columns of ``df``.

    The vectorizer is fitted on ``df['facts']`` only (when ``train_mode`` is
    true); the two party columns are always encoded with ``transform``, i.e.
    with whatever the vectorizer learned from the facts text.

    Args:
        vectorizer: Object exposing ``fit_transform`` and ``transform`` that
            return sparse matrices (a ``TfidfVectorizer`` in this notebook).
        df: DataFrame with ``facts``, ``first_party`` and ``second_party``
            columns.
        train_mode: If true, fit the vectorizer on ``facts`` before
            transforming; otherwise reuse the already-fitted vectorizer.

    Returns:
        A 2-D ``numpy.ndarray`` whose columns are, in order, the
        ``first_party``, ``second_party`` and ``facts`` encodings.
    """
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # ``toarray()`` yields a plain ndarray. ``todense()`` would yield the
    # deprecated ``np.matrix`` type, which recent scikit-learn releases
    # reject as estimator input.
    X = np.concatenate(
        [X_party1.toarray(), X_party2.toarray(), X_facts.toarray()],
        axis=1,
    )
    return X

# Target labels, then feature matrices. The vectorizer is fitted on the
# training frame first and reused unchanged for the test frame.
y_train = train["first_party_winner"]
X_train = get_vector(vectorizer, train, True)
X_test = get_vector(vectorizer, test, False)

# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Define the base models.
# Seed the estimators that accept a random_state so the validation scores
# printed by the loop below are reproducible across kernel restarts; 42
# matches the seed used for the train/validation split.
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression()),
    ('lgbm', LGBMClassifier(random_state=42))
]

# Initialize the soft-voting ensemble over the same estimator instances.
# NOTE(review): these instances are also fitted individually in the loop
# below; confirm whether EnsembleVoteClassifier clones or refits them in place.
ensemble = EnsembleVoteClassifier(clfs=[model for _, model in base_models], voting='soft')

# Train each base model on its own and report its validation accuracy.
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"{name} Validation Accuracy: {accuracy}")

rf Validation Accuracy: 0.6512096774193549
lr Validation Accuracy: 0.6532258064516129
lgbm Validation Accuracy: 0.6108870967741935


In [5]:
import warnings
# Ignore warning messages
warnings.filterwarnings("ignore")

In [7]:
# Fit the voting ensemble, then score it on the held-out validation rows.
y_pred_val_ensemble = ensemble.fit(X_train, y_train).predict(X_val)
accuracy_ensemble = accuracy_score(y_val, y_pred_val_ensemble)
print(f"Ensemble Validation Accuracy: {accuracy_ensemble}")


Ensemble Validation Accuracy: 0.6411290322580645


In [11]:
# Ensemble predictions for the test feature matrix.
y_pred_test = ensemble.predict(X_test)

# Submission frame: the test IDs plus the predicted label column.
submission = test[['ID']].assign(first_party_winner=y_pred_test)

# Write the CSV without the index column and confirm.
submission.to_csv('ensemble_submission.csv', index=False)
print("Submission file saved.")


Submission file saved.
