# SVM Model Implementation

## Load the Data Set

In [10]:
import pandas as pd
import ast

# Load training data
train_data = pd.read_csv('train_data.csv')
X_train_raw = train_data['X_train'].tolist()
y_train_raw = train_data['y_train'].tolist()

# Load test data
test_data = pd.read_csv('test_data.csv')
X_test_raw = test_data['X_test'].tolist()
y_test_raw = test_data['y_test'].tolist()


# Subsets for faster training
X_train_raw = X_train_raw[:10000]
y_train_raw = y_train_raw[:10000]

X_test_raw = X_test_raw[:2000]
y_test_raw = y_test_raw[:2000]


# #Make it a list
# X_train_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_train_raw]
# X_test_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_test_raw]

# Subset
X_train_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_train_raw]
X_test_raw = [ast.literal_eval(item) if isinstance(item, str) else item for item in X_test_raw]



## Vectorize and Encode the labels

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer


# Join into one big string
corpus_train = [' '.join(item) for item in X_train_raw]
corpus_test = [' '.join(item) for item in X_test_raw]


# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(corpus_train).toarray()
X_test = vectorizer.fit_transform(corpus_test).toarray()


# Initialize the CountVectorizer, limiting max_features to the top 500 words.
vectorizer = CountVectorizer(max_features=500) 

# # Fit the vectorizer on train and test data. Transform the data into BoW matrices.
# X_train = vectorizer.fit_transform(corpus_train).toarray()
# X_test = vectorizer.transform(corpus_test).toarray()

# Encode the taget variables
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train_raw)
y_test = encoder.fit_transform(y_test_raw)

## Default SVM Model

In [12]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


model = SVC(verbose=True)  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[LibSVM].....*..*
optimization finished, #iter = 7228
obj = -1441.262514, rho = -0.128321
nSV = 4250, nBSV = 1032
..*..*
optimization finished, #iter = 4023
obj = -565.708923, rho = -0.331989
nSV = 2304, nBSV = 368
...*.*
optimization finished, #iter = 4993
obj = -961.865079, rho = -0.379527
nSV = 2925, nBSV = 665
...*.*
optimization finished, #iter = 4604
obj = -800.832494, rho = -0.915949
nSV = 2684, nBSV = 540
..*.*
optimization finished, #iter = 3077
obj = -267.970242, rho = -0.700839
nSV = 1792, nBSV = 173
..*..*
optimization finished, #iter = 4141
obj = -779.364556, rho = -0.552685
nSV = 2540, nBSV = 601
...*..*
optimization finished, #iter = 5432
obj = -887.323859, rho = -0.190359
nSV = 3073, nBSV = 567
...*..*
optimization finished, #iter = 5025
obj = -767.827042, rho = -0.750995
nSV = 2848, nBSV = 480
..*.*
optimization finished, #iter = 3150
obj = -288.844989, rho = -0.767872
nSV = 1874, nBSV = 195
.*.*
optimization finished, #iter = 2663
obj = -462.171943, rho = 0.064447
nSV

## Grid-Search 

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, make_scorer, f1_score, accuracy_score

# Define SVM model
model = SVC()

# Define the recall scorer
recall_scorer = make_scorer(recall_score)

# Define parameter grid
param_grid = {
    'C': [0.0001, 0.001, 0.1, 1],
    'kernel': ['linear'],
    'gamma': ['scale']
}

# Initialize Grid Search with recall scorer
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=accuracy_score, cv=3, verbose=1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Best parameters and best recall
print("Best parameters:", grid_search.best_params_)
print("Best Accuracy obtained: {:.2f}".format(grid_search.best_score_))


Fitting 3 folds for each of 16 candidates, totalling 48 fits


Traceback (most recent call last):
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 191, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/inspect.py", line 3062, in bind
    return self._bind(args, kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/inspect.py", line 2988, in _bind
    raise TypeError(
TypeError: too many positional arguments

Traceback (most recent call last):
  File "/Users/carlosrabat/Desktop/AI-Term-Project/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scor

Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best Accuracy obtained: nan


## Optimized SVM Model

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Optimized Hyperparameter 
model = SVC(class_weight='balanced', C=0.1, gamma='scale', kernel='linear')  
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.294

Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.37      0.36       594
           1       0.36      0.37      0.37       710
           2       0.19      0.16      0.17       176
           3       0.21      0.19      0.20       254
           4       0.12      0.12      0.12       200
           5       0.08      0.08      0.08        66

    accuracy                           0.29      2000
   macro avg       0.22      0.21      0.22      2000
weighted avg       0.29      0.29      0.29      2000

