In [3]:
import pandas as pd
from lightgbm import LGBMClassifier
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the train and test CSVs from the local dataset folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

# One TF-IDF vectorizer instance, passed to every get_vector() call below
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Vectorize the three text columns of ``df`` into a single feature matrix.

    The vectorizer is fitted on the ``facts`` column only, and only when
    ``train_mode`` is true. The ``first_party`` and ``second_party`` columns
    are always encoded with ``transform``, i.e. with whatever the vectorizer
    learned from ``facts``.

    Parameters
    ----------
    vectorizer : object
        Anything exposing ``fit_transform(column)`` and ``transform(column)``
        (the notebook passes a ``TfidfVectorizer``).
    df : mapping of column name -> text column
        Must provide ``'facts'``, ``'first_party'`` and ``'second_party'``.
    train_mode : bool
        True  -> fit the vectorizer on ``df['facts']`` before encoding.
        False -> reuse an already fitted vectorizer.

    Returns
    -------
    scipy.sparse matrix in CSR format
        Column blocks ordered ``[first_party | second_party | facts]``,
        one row per row of ``df``.
    """
    # The facts column must be handled first: in train mode this is the call
    # that fits the vectorizer used by the two transforms below.
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # Stack the blocks without densifying them. The earlier dense version
    # relied on an un-imported ``np`` and materialised every zero entry.
    return hstack([X_party1, X_party2, X_facts], format='csr')

# Target column for the training frame
y_train = train["first_party_winner"]

# Encode train first (train_mode=True fits the vectorizer), then encode test
# with the already fitted vectorizer.
X_train = get_vector(vectorizer, train, train_mode=True)
X_test = get_vector(vectorizer, test, train_mode=False)

# Hold out 20% of the training rows for validation; X_train / y_train are
# rebound to the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [5]:
import warnings

# Suppress every warning raised from this point on
warnings.filterwarnings("ignore")

# Hyperparameter grid for this search round.
# Candidate values for the other hyperparameters, currently disabled:
#   n_estimators:      [50, 100, 200, 300]
#   learning_rate:     [0.1, 0.05, 0.01]
#   num_leaves:        [30, 50, 100]
#   max_depth:         [3, 5, 7]
#   min_child_samples: [10, 20, 30]
#   subsample:         [0.8, 1.0]
param_grid = {
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5],
}

# Base LightGBM classifier with default settings
lgbm_model = LGBMClassifier()

# Exhaustive search over param_grid with 3-fold cross-validation
grid_search = GridSearchCV(lgbm_model, param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best Hyperparameters:", grid_search.best_params_)

In [14]:
from lightgbm import LGBMClassifier

# Hyperparameters hard-coded for the final model
tuned_params = dict(
    learning_rate=0.01,
    n_estimators=50,
    num_leaves=30,
    max_depth=3,
    min_child_samples=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.0,
)

# Train on the training split, then label the held-out validation rows
lgbm_model = LGBMClassifier(**tuned_params)
lgbm_model.fit(X_train, y_train)

y_pred_val_lgbm = lgbm_model.predict(X_val)

Best Hyperparameters: {'learning_rate': 0.01, 'n_estimators': 50, 'num_leaves': 30}
Best Hyperparameters: {'max_depth': 3, 'min_child_samples': 10, 'subsample': 0.8}
Best Hyperparameters: {'colsample_bytree': 0.8, 'reg_alpha': 0.5, 'reg_lambda': 0.0}

In [15]:

# Score the validation predictions against the held-out labels
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val_lgbm)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.6612903225806451


In [16]:
# Predict on the test set using the best model
y_pred_test = lgbm_model.predict(X_test)

# Prepare submission DataFrame
submission = pd.DataFrame({'ID': test['ID'], 'first_party_winner': y_pred_test})

# Save the submission file
submission.to_csv('lgbm_hyped_submission_2.csv', index=False)
print("Submission file saved.")

Submission file saved.
