In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC 
from sklearn.pipeline import Pipeline
import time

In [3]:
# Read the training split into a DataFrame; later cells use its "text" and
# "Analysis" columns as features and labels respectively.
df = pd.read_csv(filepath_or_buffer='../../../data/train_data.csv')


In [4]:
# Text-classification pipeline: TF-IDF features feeding a linear-kernel SVM.
# Keeping both stages in a single Pipeline means the vectorizer vocabulary is
# learned only from whatever data is passed to fit().
svm_pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            stop_words='english',   # drop scikit-learn's built-in English stop words
            ngram_range=(1, 3),     # unigrams, bigrams and trigrams
            min_df=2,               # ignore terms seen in fewer than 2 documents
            sublinear_tf=True,      # sublinear (logarithmic) term-frequency scaling
        ),
    ),
    (
        'svm',
        SVC(
            kernel='linear',        # linear decision boundary
            C=1.0,                  # regularization strength
            probability=True,       # enable probability estimates (slower fit per scikit-learn docs)
            random_state=42,        # fixed seed for reproducibility
        ),
    ),
])



In [5]:
# Measure *training* time with a monotonic, high-resolution clock:
# time.perf_counter() is unaffected by system clock adjustments, unlike
# time.time().
start = time.perf_counter()

# Fit the TF-IDF + SVM pipeline on the training split.
# NOTE(review): astype(str) presumably turns missing text into the literal
# string "nan" -- confirm the "text" column has no nulls.
svm_pipeline.fit(df["text"].astype(str),  # convert to string
                 df["Analysis"])

# Stop the clock immediately after fit() so only training is measured.
end = time.perf_counter()
training_time = end - start
print("Pipeline training complete.")

# Save computation time to CSV (single row, single column, in seconds).
time_df = pd.DataFrame({"svm_TrainingTime (s)": [training_time]})
time_df.to_csv("computation_time_svm.csv", index=False)

print("Training time: ", training_time, "seconds")
print("Training time is saved")

Pipeline training complete.
Training time:  55.86406183242798 seconds
Training time is saved


In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Read the validation split and separate labels from the raw text.
val_df = pd.read_csv('../../../data/val_data.csv')
val_labels = val_df["Analysis"]
val_text = val_df["text"].astype(str)  # same string cast as applied at training time

# Predict with the already-fitted pipeline (vectorizer + classifier).
val_preds = svm_pipeline.predict(val_text)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Validation Accuracy:", accuracy_score(y_true=val_labels, y_pred=val_preds))
print(classification_report(y_true=val_labels, y_pred=val_preds))


Validation Accuracy: 0.7423851316468766
              precision    recall  f1-score   support

    Negative       0.75      0.48      0.58       460
     Neutral       0.73      0.68      0.70       427
    Positive       0.75      0.88      0.81      1050

    accuracy                           0.74      1937
   macro avg       0.74      0.68      0.70      1937
weighted avg       0.74      0.74      0.73      1937



In [7]:
# Persist the fitted pipeline (vectorizer and classifier together) to disk.
# The call is left as the cell's last expression, so the notebook displays
# the list of written file names.
import joblib
joblib.dump(value=svm_pipeline, filename='svm.joblib')

['svm.joblib']