In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

# Read the labelled training examples from disk
train_df = pd.read_csv('train.csv')

# English stop words, kept in a set so membership tests are constant-time
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Preprocess the text data
def preprocess_text(text):
    """Lowercase *text*, tokenize it and drop English stop words.

    Args:
        text: Raw document. Missing values (``None``, float NaN, ``pd.NA``)
            are treated as an empty document; any other non-string value
            is converted with ``str()`` before processing.

    Returns:
        The surviving tokens joined by single spaces. An empty string is
        returned for missing input.
    """
    if not isinstance(text, str):
        # An empty CSV cell is loaded by pandas as float NaN, which has no
        # .lower(); without this guard one blank row aborts the whole apply().
        if pd.isna(text):
            return ''
        text = str(text)

    # Normalize case, then split the text into word tokens
    tokens = word_tokenize(text.lower())

    # Drop stop words and rebuild a single space-separated string
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean every training document with the shared preprocessing routine
train_df['text'] = train_df['text'].apply(preprocess_text)

# Number of folds used for k-fold cross-validation
num_folds = 5

# Target labels
y = train_df['label']

# Fit TF-IDF on the cleaned training text; X becomes the float64 feature matrix
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(train_df['text'])
X = tfidf_features.astype('float64')

# Define the LightGBM model
lgb_model = LGBMClassifier(learning_rate=0.1, max_depth=7, n_estimators=100)

# Define the cross-validation method (shuffled with a fixed seed so the
# fold assignment is the same on every run)
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Weighted F1 score of each fold, in fold order
f1_scores = []

# NOTE(review): X was built by fitting TF-IDF on all training rows before
# this split, so the vocabulary/IDF weights have already seen each
# validation fold. The scores below are therefore slightly optimistic;
# fit the vectorizer inside the loop if an unbiased estimate is needed.

# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(kf.split(X)):
    print('Fold', fold+1)

    # Split the feature matrix into training and validation rows
    X_train, X_val = X[train_indices], X[val_indices]

    # KFold yields positional indices, so select labels by position with
    # .iloc. Plain y[...] is label-based and only matches while y has the
    # default 0..n-1 index; after any filtering/shuffling of train_df it
    # would pick the wrong rows or raise KeyError.
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

    # Train the LightGBM model on the training split (fit() starts afresh)
    lgb_model.fit(X_train, y_train)

    # Make predictions on the validation split
    y_pred = lgb_model.predict(X_val)

    # Weighted F1: per-class F1 averaged by class support
    f1score = f1_score(y_val, y_pred, average='weighted')
    f1_scores.append(f1score)

    print('F1 Score on the validation set:', f1score)

# Print the average f1 score across all folds
print('Average F1 Score:', np.mean(f1_scores))

# Read the held-out test set
test_df = pd.read_csv('test.csv')

# Clean the test text with the same routine used for the training text
cleaned_test_text = test_df['text'].apply(preprocess_text)
test_df['text'] = cleaned_test_text

# Project the test text into the TF-IDF space learned from the training data
X_test = vectorizer.transform(cleaned_test_text).astype('float64')

# Refit the model on every training row
lgb_model.fit(X, y)

# Predict a label for each test row and store it next to the ids
test_predictions = lgb_model.predict(X_test)
test_df['label'] = test_predictions

# Write the id/label pairs out as the submission file
submission = test_df[['id', 'label']]
submission.to_csv('submission.csv', index=False)


Fold 1
F1 Score on the validation set: 0.8862357081047154
Fold 2
F1 Score on the validation set: 0.8936032831256876
Fold 3
F1 Score on the validation set: 0.8882945025419718
Fold 4
F1 Score on the validation set: 0.8930732506541553
Fold 5
F1 Score on the validation set: 0.8897505065316854
Average F1 Score: 0.8901914501916431
