# categorical

In [6]:
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for classification
params = {
    'objective': 'multiclass', # Use multiclass objective for multi-class classification
    'num_class': 3, # Number of classes
    'metric': 'multi_error', # Use multi_error for multi-class classification
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbosity': -1, # Set verbosity to -1 to suppress output
    'seed': 42
}

# Train the model
num_round = 100
clf = lgb.train(params, train_data, num_round,
                valid_sets=[test_data],
                callbacks=[lgb.early_stopping(stopping_rounds=10)]) # Use early stopping callback

# Predict on the test set
y_pred = clf.predict(X_test)
y_pred = [list(x).index(max(x)) for x in y_pred]

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's multi_error: 0
Accuracy: 1.0


# Regression

In [9]:
import lightgbm as lgb
from sklearn.datasets import fetch_california_housing # Use California housing dataset instead
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load dataset
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.2, random_state=42)

# Create LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for regression
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'metric': 'rmse',
    'verbosity': -1,
    'seed': 42
}

# Train the model
num_round = 100
reg = lgb.train(params, train_data, num_round, valid_sets=[test_data],
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

# Predict on the test set
y_pred = reg.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 0.484058
RMSE: 0.484057560212197


# Ranking

In [10]:
import lightgbm as lgb
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split

# Load dataset (query, features, relevance)
X_train, y_train = load_svmlight_file('train.txt', query_id=True)
X_test, y_test = load_svmlight_file('test.txt', query_id=True)

# Create LightGBM dataset for ranking
train_data = lgb.Dataset(X_train, label=y_train, group=[X_train.queryid])
test_data = lgb.Dataset(X_test, label=y_test, group=[X_test.queryid])

# Set parameters for ranking
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'verbosity': -1,
    'seed': 42
}

# Train the model
num_round = 100
rank = lgb.train(params, train_data, num_round, valid_sets=[test_data], early_stopping_rounds=10)

FileNotFoundError: [Errno 2] No such file or directory: 'train.txt'