In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# --- Configuration ----------------------------------------------------------
# Tunable values are collected here instead of being buried as magic numbers.
DATA_PATH = "topic-analysis-dataset.csv"  # Make sure the CSV is in the same directory
TEST_SIZE = 0.2       # fraction of rows held out for evaluation
RANDOM_STATE = 42     # one seed shared by the split and the classifier
MAX_FEATURES = 10000  # upper bound on the TF-IDF vocabulary size

# Load the dataset; the 'text' column is used as input and 'topic' as label.
df = pd.read_csv(DATA_PATH)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['topic'], test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Convert text to TF-IDF vectors. The vectorizer is fitted on the training
# texts only; the test texts are transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train the SVM classifier. Per the scikit-learn docs, LinearSVC shuffles the
# data with a pseudo-random generator when the dual solver is used, so it is
# seeded here to make a Restart & Run All reproduce the same model. The seed
# has no effect when the primal solver is selected.
svm = LinearSVC(random_state=RANDOM_STATE)
svm.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = svm.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)


Accuracy: 0.9877
Classification Report:
              precision    recall  f1-score   support

        book       0.98      0.99      0.98      1025
       movie       1.00      0.99      1.00       979
      sports       0.99      0.97      0.98       933

    accuracy                           0.99      2937
   macro avg       0.99      0.99      0.99      2937
weighted avg       0.99      0.99      0.99      2937



In [9]:
# Evaluate the already-fitted model on a separate labelled test file.
# This cell relies on `vectorizer` and `svm` created by the training cell,
# so that cell must have been executed first.
#
# The data is bound to its own names (`external_df`, `X_external_tfidf`)
# rather than reusing `df` / `X_test_tfidf`, so the training frame and the
# held-out TF-IDF matrix from the training cell are not silently replaced.
external_df = pd.read_csv("../test-datasets/sentiment-topic-test.tsv", sep="\t")
sentences = external_df["sentence"].tolist()
true_labels = external_df["topic"].tolist()

# Only transform here (no re-fitting): the sentences are mapped onto the
# vocabulary that the vectorizer learned from the training texts.
X_external_tfidf = vectorizer.transform(sentences)
predicted_labels = svm.predict(X_external_tfidf)

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))

Classification Report:
              precision    recall  f1-score   support

        book       0.42      0.83      0.56         6
       movie       1.00      0.50      0.67         6
      sports       0.67      0.33      0.44         6

    accuracy                           0.56        18
   macro avg       0.69      0.56      0.56        18
weighted avg       0.69      0.56      0.56        18

