# In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# Load the training data (the 'text' and 'label' columns are used below)
train_df = pd.read_csv('train.csv')

# Build the English stop-word set once; a set keeps the per-token membership
# test in preprocess_text cheap.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # NLTK raises LookupError when the 'stopwords' corpus has not been
    # installed yet (e.g. a fresh environment). Fetch it once and retry
    # instead of failing before any work is done.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Preprocess the text data
def preprocess_text(text):
    """Lowercase *text*, tokenize it, and drop stop words.

    Args:
        text: Raw document. Missing values (``None`` or NaN, which is how
            pandas represents an empty CSV cell) are treated as an empty
            document; any other non-string value is converted with ``str``.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` for a
        missing value.
    """
    if not isinstance(text, str):
        # An empty cell arrives as float NaN and would crash on ``.lower()``;
        # map it to an empty document so one bad row cannot abort the run.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase before tokenizing so the comparison against
    # stop_words is done on lowercased tokens.
    words = word_tokenize(text.lower())

    # Remove stop words
    filtered_words = [word for word in words if word not in stop_words]

    # Join the words back into a string
    return ' '.join(filtered_words)

# Clean every training document with preprocess_text
train_df['text'] = train_df['text'].map(preprocess_text)

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training split
# only and then reused unchanged for the validation documents.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)

# Fit a multinomial Naive Bayes classifier on the token counts
nb = MultinomialNB().fit(X_train, y_train)

# Score the held-out split with the weighted F1 metric
y_pred = nb.predict(X_val)
f1 = f1_score(y_val, y_pred, average='weighted')
print('F1 score:', f1)

# Run the test set through the same cleaning step and the already-fitted
# vectorizer, then predict its labels.
test_df = pd.read_csv('test.csv')
test_df['text'] = test_df['text'].map(preprocess_text)
X_test = vectorizer.transform(test_df['text'])
y_test_pred = nb.predict(X_test)

# Write one predicted label per test id
submission_df = pd.DataFrame(dict(id=test_df['id'], label=y_test_pred))
submission_df.to_csv('submission.csv', index=False)


# Output: F1 score: 0.8179277235184726
