In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Split data into features and labels
X_train = train_df["text"]
y_train = train_df["label"]
X_test = test_df["text"]

# Vectorize features using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Split training data into training and validation sets
train_features, val_features, train_label, val_label = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_features, train_label)

# Make predictions on validation set
val_pred = rf.predict(val_features)

# Evaluate accuracy on validation set
val_accuracy = accuracy_score(val_label, val_pred)
print("Validation accuracy:", val_accuracy)

# Make predictions on test data
test_pred = rf.predict(X_test)

# Save predictions to submission file
submission_df = pd.DataFrame({"id": test_df["id"], "label": test_pred})
submission_df.to_csv("submission.csv", index=False)


Validation accuracy: 0.8354430379746836
