In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load both CSV files in order: the training rows first, then the test rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

# A single TF-IDF vectorizer instance with default settings, shared by every
# feature-building call that follows.
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Build a dense feature matrix from the three text columns of ``df``.

    The vectorizer is (re)fitted on the ``facts`` column only, and only when
    ``train_mode`` is true. The ``first_party`` and ``second_party`` columns
    are always encoded with ``transform``, i.e. with whatever the vectorizer
    learned from ``facts``.

    Parameters
    ----------
    vectorizer : object
        Must provide ``fit_transform(column)`` and ``transform(column)``,
        each returning an object with a ``toarray()`` method (e.g. a SciPy
        sparse matrix).
    df : pandas.DataFrame
        Must contain the columns ``'facts'``, ``'first_party'`` and
        ``'second_party'``.
    train_mode : bool
        If true, fit the vectorizer on ``df['facts']`` before encoding;
        otherwise reuse the already-fitted vectorizer.

    Returns
    -------
    numpy.ndarray
        One row per row of ``df``; columns are the ``first_party`` features,
        then the ``second_party`` features, then the ``facts`` features.
    """
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # toarray() gives plain ndarrays. The previous todense() produced
    # np.matrix objects, a class NumPy no longer recommends and that recent
    # scikit-learn versions refuse as estimator input.
    X = np.concatenate(
        [X_party1.toarray(), X_party2.toarray(), X_facts.toarray()],
        axis=1,
    )
    return X

# Encode the training rows first: this call fits the vectorizer, which the
# test-set call below then reuses without refitting.
X_train = get_vector(vectorizer, train, train_mode=True)
y_train = train["first_party_winner"]
X_test = get_vector(vectorizer, test, train_mode=False)

# Hold out 20% of the training rows for validation. The fixed seed keeps the
# split reproducible. X_train / y_train are rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Fit the three base classifiers on the training split. fit() returns the
# estimator itself, so construction and training are chained.
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
logistic_regression_model = LogisticRegression().fit(X_train, y_train)
lgbm_model = LGBMClassifier().fit(X_train, y_train)

# Per-model class predictions for the held-out validation split.
y_pred_val_rf = random_forest_model.predict(X_val)
y_pred_val_lr = logistic_regression_model.predict(X_val)
y_pred_val_lgbm = lgbm_model.predict(X_val)

# Majority vote: add up the three votes and round their mean.
# NOTE(review): this is a majority vote only if the labels are 0/1 - confirm.
y_pred_val_combined = np.round(
    sum([y_pred_val_rf, y_pred_val_lr, y_pred_val_lgbm]) / 3
)

# Score the voted predictions against the validation labels.
accuracy = accuracy_score(y_val, y_pred_val_combined)
print("Validation Accuracy:", accuracy)





Validation Accuracy: 0.6451612903225806


In [5]:
# Predict on the test set with each of the three fitted classifiers.
y_pred_test_rf = random_forest_model.predict(X_test)
y_pred_test_lr = logistic_regression_model.predict(X_test)
y_pred_test_lgbm = lgbm_model.predict(X_test)

# Combine predictions using majority voting: round the mean of the three votes.
# NOTE(review): this is a majority vote only if the labels are 0/1 - confirm.
# np.round returns floats, so cast back to int; otherwise the submission column
# would be written as 0.0 / 1.0 instead of integer class labels.
y_pred_test_combined = np.round(
    (y_pred_test_rf + y_pred_test_lr + y_pred_test_lgbm) / 3
).astype(int)

# Prepare submission DataFrame: one row per test ID with its voted label.
submission = pd.DataFrame({'ID': test['ID'], 'first_party_winner': y_pred_test_combined})

# Save the submission file (without the DataFrame index column).
submission.to_csv('logistic_randomforest_lgbm_ensemble_submission.csv', index=False)
print("Submission file saved.")



Submission file saved.


