In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [36]:
# Load the two competition files from the current working directory:
# train.csv supplies the rows used for fitting, test.csv the rows to predict.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [38]:
# Hold out 20% of the training rows for validation. The split is stratified
# on the 'Subject' label and seeded, so it is identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["Text"],
    train_df["Subject"],
    stratify=train_df["Subject"],
    test_size=0.2,
    random_state=42,
)

In [40]:
# Text-classification pipeline: TF-IDF features feeding a linear SVM.
#   tfidf - drops English stop words, keeps at most 5000 terms, unigrams only.
#   clf   - LinearSVC with class weights balanced by class frequency and a
#           2000-iteration cap.
# random_state pins the solver's pseudo-random data shuffling (it matters when
# the dual problem is solved) so that refitting is repeatable, in line with
# the seeded train/validation split.
model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 1))),
    ('clf', LinearSVC(class_weight='balanced', max_iter=2000, random_state=42)),
])


In [42]:
# Fit the pipeline on the training split only; the validation rows stay
# unseen so they can be used to score the model afterwards.
model.fit(X=X_train, y=y_train)


In [44]:
# Score the held-out split. Macro averaging takes the unweighted mean of the
# per-class F1 scores, so every class counts equally regardless of its size.
y_val_pred = model.predict(X=X_val)
val_f1 = f1_score(y_true=y_val, y_pred=y_val_pred, average="macro")
print(f"Validation Macro F1 Score: {val_f1:.4f}")

Validation Macro F1 Score: 0.8436


In [46]:
# Refit the same pipeline on every training row (train + validation splits)
# before predicting the test set.
# NOTE: this overwrites the fitted state of `model` in place. Re-running the
# validation cell after this one would score rows the model has now trained
# on, so the reported validation F1 only describes the earlier fit.
model.fit(X=train_df["Text"], y=train_df["Subject"])

In [48]:
# Predict a label for every row of the test file with the current fit of
# the pipeline.
test_predictions = model.predict(X=test_df["Text"])

In [50]:
# Build the two-column submission table: the test 'ID' column alongside the
# predicted 'Subject' for each row, in the same row order as test_df.
submission_df = test_df[["ID"]].assign(Subject=test_predictions)

In [52]:
# Write the submission to disk without the DataFrame index, so the file
# contains only the 'ID' and 'Subject' columns, then confirm on stdout.
submission_df.to_csv(path_or_buf="submission.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
