In [23]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , GridSearchCV
import pandas as pd

In [24]:
df = pd.read_csv("/input/nlp-getting-started/train.csv")

In [25]:
x = df['text']
y = df['target']

In [26]:
tfidf_vectorizer = TfidfVectorizer()

In [27]:
x_tfidf = tfidf_vectorizer.fit_transform(x)

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x_tfidf,
                                                    y, 
                                                    test_size=0.2,
                                                    random_state=42 ,
                                                    shuffle = True)

In [29]:
lgbm_model = LGBMClassifier()

In [30]:
# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'num_leaves': [20, 30, 40]
}

In [31]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=5, scoring='accuracy')

In [32]:
grid_search.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 2097, number of negative: 2775
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11703
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 545
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430419 -> initscore=-0.280143
[LightGBM] [Info] Start training from score -0.280143
[LightGBM] [Info] Number of positive: 2097, number of negative: 2775
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11677
[LightGBM] [Info] Number of data points in the train set: 4872, number of used features: 552
[LightGBM] [Info] [b

In [33]:
# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

In [34]:
y_pred = best_estimator.predict(x_test)

In [35]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of LightGBM Classifier after hyperparameter tuning:", accuracy)
print("Best Parameters:", best_params)

Accuracy of LightGBM Classifier after hyperparameter tuning: 0.788575180564675
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 30}
