# SVM Model Implementation

## Load the Data Set

In [1]:
import pandas as pd
import ast

# Load training data
train_data = pd.read_csv('train_data.csv')
X_train_raw = train_data['X_train'].tolist()
y_train_raw = train_data['y_train'].tolist()

# Load test data
test_data = pd.read_csv('test_data.csv')
X_test_raw = test_data['X_test'].tolist()
y_test_raw = test_data['y_test'].tolist()


# Subsets for faster training
X_train_raw = X_train_raw[:10000]
y_train_raw = y_train_raw[:10000]

X_test_raw = X_test_raw[:2000]
y_test_raw = y_test_raw[:2000]


# #Make it a list
# X_train_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_train_raw]
# X_test_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_test_raw]

# Subset
X_train_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_train_raw]
X_test_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_test_raw]



## Vectorize and Encode the labels

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer


# Join into one big string
corpus_train = [' '.join(item) for item in X_train_raw]
corpus_test = [' '.join(item) for item in X_test_raw]


# # Initialize a TF-IDF Vectorizer
# vectorizer = TfidfVectorizer(max_features=1000)
# X_train = vectorizer.fit_transform(corpus_train).toarray()
# X_test = vectorizer.fit_transform(corpus_test).toarray()


# Initialize the CountVectorizer, limiting max_features to the top 500 words.
vectorizer = CountVectorizer(max_features=500) 

# Fit the vectorizer on train and test data. Transform the data into BoW matrices.
X_train = vectorizer.fit_transform(corpus_train).toarray()
X_test = vectorizer.transform(corpus_test).toarray()

# Encode the taget variables
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train_raw)
y_test = encoder.fit_transform(y_test_raw)

## Default SVM Model

In [3]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


model = SVC(verbose=True)  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LibSVM]....*..*
optimization finished, #iter = 6451
obj = -2657.303234, rho = -0.091203
nSV = 4661, nBSV = 2659
...*..*
optimization finished, #iter = 5448
obj = -893.964746, rho = -0.539916
nSV = 2362, nBSV = 709
....*.*
optimization finished, #iter = 5696
obj = -1523.526058, rho = -0.214224
nSV = 3063, nBSV = 1291
...*..*
optimization finished, #iter = 5197
obj = -1344.704600, rho = -0.444261
nSV = 2819, nBSV = 1120
..*.*
optimization finished, #iter = 3642
obj = -470.351657, rho = -0.769216
nSV = 1724, nBSV = 341
...*..*
optimization finished, #iter = 5483
obj = -1098.370813, rho = -0.606489
nSV = 2617, nBSV = 874
....*..*
optimization finished, #iter = 6740
obj = -1424.994203, rho = -0.195477
nSV = 3165, nBSV = 1145
....*..*
optimization finished, #iter = 6236
obj = -1296.808625, rho = -0.418512
nSV = 2938, nBSV = 1062
..*.*
optimization finished, #iter = 3954
obj = -492.800885, rho = -0.792483
nSV = 1837, nBSV = 344
..*
optimization finished, #iter = 2734
obj = -784.306349, rho =

## Grid-Search 

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, make_scorer, f1_score, accuracy_score

# Define SVM model
model = SVC()

# Define the recall scorer
recall_scorer = make_scorer(recall_score)

# Define parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'polynomial'],
    'gamma': ['scale', 'auto'],
    'class_weight' : ['balanced', None]
}

# Initialize Grid Search with recall scorer
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=accuracy_score, cv=3, verbose=1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Best parameters and best recall
print("Best parameters:", grid_search.best_params_)
print("Best Accuracy obtained: {:.2f}".format(grid_search.best_score_))


Fitting 3 folds for each of 32 candidates, totalling 96 fits


Traceback (most recent call last):
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 191, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/inspect.py", line 3062, in bind
    return self._bind(args, kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/inspect.py", line 2988, in _bind
    raise TypeError(
TypeError: too many positional arguments

Traceback (most recent call last):
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scor

## Optimized SVM Model

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Optimized Hyperparameter 
model = SVC(class_weight='balanced', C=1, gamma='scale', kernel='linear')  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.602

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.61      0.66       594
           1       0.69      0.65      0.67       710
           2       0.46      0.62      0.53       176
           3       0.64      0.57      0.60       254
           4       0.43      0.49      0.46       200
           5       0.22      0.45      0.29        66

    accuracy                           0.60      2000
   macro avg       0.53      0.57      0.53      2000
weighted avg       0.63      0.60      0.61      2000

