In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from mlxtend.classifier import EnsembleVoteClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Read the dataset
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

def tokenize(text):
    tokens = word_tokenize(text)
    return tokens

def get_vector(df, train_mode):
    if train_mode:
        df['tokens'] = df['facts'].apply(tokenize)
    else:
        df['tokens'] = df['facts'].apply(tokenize)
    
    tokenized_text = df['tokens'].apply(lambda x: ' '.join(x))
    X_party1 = df['first_party']
    X_party2 = df['second_party']

    X = np.concatenate([X_party1.values.reshape(-1, 1), X_party2.values.reshape(-1, 1), tokenized_text.values.reshape(-1, 1)], axis=1)
    return X

X_train = get_vector(train, True)
y_train = train["first_party_winner"]
X_test = get_vector(test, False)

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [13]:
# Tokenize text using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train[:, 2])
X_test_text = vectorizer.transform(X_test[:, 2])

ct = ColumnTransformer([('encoder', OneHotEncoder(), [0, 1])], remainder='passthrough')
X_train_categorical = ct.fit_transform(X_train[:, :2])
X_test_categorical = ct.fit_transform(X_test[:, :2])


In [14]:
# Combine the encoded categorical features with the text features
X_train_encoded = np.concatenate([X_train_categorical, X_train_text.toarray()], axis=1)
X_test_encoded = np.concatenate([X_test_categorical, X_test_text.toarray()], axis=1)

# Split the data into train and validation sets
X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(X_train_encoded, y_train, test_size=0.2, random_state=42)


ValueError: zero-dimensional arrays cannot be concatenated

In [5]:
# Define the base models
base_models = [
    ('rf', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('lgbm', LGBMClassifier()),
    ('xgb', XGBClassifier())
]

# Initialize the EnsembleVoteClassifier
ensemble = EnsembleVoteClassifier(clfs=[model for _, model in base_models], voting='soft')

# Train the base models
for name, model in base_models:
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    print(f"{name} Validation Accuracy: {accuracy}")

ValueError: could not convert string to float: 'United States'