In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score

# Load dataset
dataset = pd.read_csv('Breast Cancer Detection Classification.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create Dataset for training and test data (important for using the train method)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters for the model
params = {
    'objective': 'multiclass',  # Multiclass classification
    'num_class': 3,             # Number of classes
    'learning_rate': 0.1,
    'max_depth': 6,
    'metric': 'multi_logloss'
}

# Train the model using the train method
model = lgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])

# Predict on the test set
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# Since the model returns probabilities for each class, we select the class with the highest probability
y_pred_class = [list(probabilities).index(max(probabilities)) for probabilities in y_pred]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_class)
print(f"Accuracy using train method with Dataset: {accuracy * 100:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4111
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 31
[LightGBM] [Info] Start training from score -0.468999
[LightGBM] [Info] Start training from score -0.982506
[LightGBM] [Info] Start training from score -34.538776
Accuracy using train method with Dataset: 95.32%
