In [3]:
import tensorflow_datasets as tfds
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split


data = tfds.load('imdb_reviews')
train_data, test_data = data['train'], data['test']

In [4]:
def extract_text_and_labels(dataset):
    texts = []
    labels = []
    for example in tfds.as_numpy(dataset):
        texts.append(example['text'].decode('utf-8'))
        labels.append(example['label'])
    return texts, labels


x_train_text, y_train = extract_text_and_labels(train_data)
x_test_text, y_test = extract_text_and_labels(test_data)


vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True, max_features=5000)
x_train_tfidf = vectorizer.fit_transform(x_train_text)
x_test_tfidf = vectorizer.transform(x_test_text)


x_train_tfidf, x_val_tfidf, y_train, y_val = train_test_split(x_train_tfidf, y_train, test_size=0.2, random_state=42)

In [5]:
def evaluate_kernel(kernel):
    svm_model = make_pipeline(StandardScaler(with_mean=False), SVC(kernel=kernel))
    svm_model.fit(x_train_tfidf, y_train)

    y_val_pred = svm_model.predict(x_val_tfidf)
    accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)

    print(f"Kernel: {kernel}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(y_val, y_val_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred))
    print("\n")


kernels = ['linear', 'rbf', 'poly']
for kernel in kernels:
    evaluate_kernel(kernel)

Kernel: linear
Validation Accuracy: 0.8158
F1 Score: 0.8108
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      2576
           1       0.81      0.81      0.81      2424

    accuracy                           0.82      5000
   macro avg       0.82      0.82      0.82      5000
weighted avg       0.82      0.82      0.82      5000

Confusion Matrix:
[[2106  470]
 [ 451 1973]]


Kernel: rbf
Validation Accuracy: 0.8846
F1 Score: 0.8830
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      2576
           1       0.87      0.90      0.88      2424

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.89      0.88      0.88      5000

Confusion Matrix:
[[2246  330]
 [ 247 2177]]


Kernel: poly
Validation Accuracy: 0.7358
F1 Score: 0.6444
Classification Report:
        