## SVM

In [None]:
from sklearn.svm import SVC

# Baseline model: TF-IDF over unigrams and bigrams feeding a linear-kernel SVM.
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            preprocessor=preprocessText,
            ngram_range=(1, 2),
            max_features=6600,
        ),
    ),
    ('clf', SVC(kernel='linear')),
])

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() hands back the pipeline itself, so training and prediction are chained.
y_pred_svm = pipeline.fit(X_train, y_train).predict(X_test)

# Evaluate the SVM model on the held-out split.
print(
    "SVM Model Classification Report:",
    classification_report(y_test, y_pred_svm),
    sep="\n",
)


In [None]:
def run_grid_searchSVM(param_grid, X=None, y=None):
  """Grid-search a TF-IDF + SVC pipeline with 5-fold CV and print the winner.

  Args:
    param_grid: Parameter grid handed unchanged to ``GridSearchCV``. Keys
      address the pipeline steps through the ``tfidf__`` / ``clf__`` prefixes.
    X: Texts to search on. Defaults to ``dataset['Review']``.
    y: Labels aligned with ``X``. Defaults to ``dataset['Liked']``.

  Returns:
    The fitted ``GridSearchCV`` object, so ``best_params_``, ``best_score_``
    and ``cv_results_`` stay available to the caller instead of only being
    printed.

  Raises:
    ValueError: If only one of ``X`` / ``y`` is supplied.
  """
  if (X is None) != (y is None):
    raise ValueError("X and y must be provided together")

  # With no explicit data, fall back to the notebook-level ``dataset`` so the
  # original one-argument call keeps working. Passing a training split instead
  # keeps the held-out rows out of hyper-parameter selection.
  if X is None:
    X = dataset['Review']
    y = dataset['Liked']

  # NOTE(review): the vectorizer here uses its default preprocessing, while the
  # pipelines trained elsewhere in this notebook pass
  # ``preprocessor=preprocessText`` - confirm the tuned values transfer.
  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer()),
      ('clf', SVC())
  ])

  grid_search = GridSearchCV(
      pipeline,
      param_grid,
      cv=5,
      scoring='f1_weighted',    # Optimizing by f1
      n_jobs=-1,
      verbose=1
  )

  grid_search.fit(X, y)

  print("Best params:", grid_search.best_params_)
  print("Best F1-score:", grid_search.best_score_)

  return grid_search

In [None]:
# Hyper-parameter tuning log. Every round rebinds ``param_grid_svm``; the
# search calls are left commented out and the recorded output of each run is
# kept right below its grid.

# Round 1: coarse sweep over both kernels and a wide C / gamma range.
param_grid_svm = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': [1, 0.1, 0.01, 0.001],
    'clf__kernel': ['rbf', 'linear'],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_features': [1000, 5000],
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 100, 'clf__gamma': 0.1, 'clf__kernel': 'rbf', 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8209770675991559

# Round 2: unigrams + bigrams only, tighter C / gamma, more vocabulary sizes.
param_grid_svm = {
    'clf__C': [50, 70, 100],
    'clf__gamma': [0.5, 0.1, 0.05],
    'clf__kernel': ['rbf', 'linear'],
    'tfidf__ngram_range': [(1, 2)],
    'tfidf__max_features': [3000, 5000, 7000],
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 50, 'clf__gamma': 0.5, 'clf__kernel': 'rbf', 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.827973134694221

# Round 3: RBF kernel only, lower C values and gamma centred on 0.5.
param_grid_svm = {
    'clf__C': [30, 50, 75],
    'clf__gamma': [0.8, 0.5, 0.3],
    'clf__kernel': ['rbf'],
    'tfidf__ngram_range': [(1, 2)],
    'tfidf__max_features': [5000, 7000, 9000],
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 30, 'clf__gamma': 0.5, 'clf__kernel': 'rbf', 'tfidf__max_features': 7000, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.827973134694221

# Round 4: fine-grained ranges around the round-3 winner.
param_grid_svm = {
    'clf__C': range(10, 30, 5),
    'clf__gamma': np.arange(0.3, 0.8, 0.1),
    'clf__kernel': ['rbf'],
    'tfidf__ngram_range': [(1, 2)],
    'tfidf__max_features': range(6000, 8000, 100),
}
# run_grid_searchSVM(param_grid_svm)
# Best params: {'clf__C': 10, 'clf__gamma': np.float64(0.5), 'clf__kernel': 'rbf', 'tfidf__max_features': 6600, 'tfidf__ngram_range': (1, 2)}
# Best F1-score: 0.8289823188556905

In [None]:
# Final model: RBF-kernel SVM on TF-IDF unigram + bigram features.
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            preprocessor=preprocessText,
            ngram_range=(1, 2),
            max_features=6600,
        ),
    ),
    ('clf', SVC(C=10, gamma=0.5, kernel="rbf")),
])

# Reload the reviews straight from the TSV file so the pipeline receives
# "pure" data (presumably text not preprocessed elsewhere in the notebook -
# TODO confirm).
dataset = pd.read_csv(
    "/root/.cache/kagglehub/datasets/hj5992/restaurantreviews/versions/1/Restaurant_Reviews.tsv",
    sep="\t",
)
X, y = dataset['Review'], dataset['Liked']

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() hands back the pipeline itself, so training and prediction are chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Target file for the serialized pipeline (relative to the working directory).
filename = 'restaurant_review_pipelineSVM.joblib'

# Persist the pipeline object to that file with joblib.
joblib.dump(value=pipeline, filename=filename)

# Report where the pipeline was written.
print(f"Pipeline saved to {filename}")