# Prepare data

In [1]:
from pyarrow.feather import read_feather
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from utilpy import sv, ld
from IPython.utils.io import capture_output

import numpy as np
import os

# Project root on the author's machine; making it the current working
# directory lets relative paths resolve against it.
# NOTE(review): this absolute path is machine-specific — adjust before running elsewhere.
workdir = '/home/yu/OneDrive/NewsReason/local-dev'
os.chdir(workdir)

In [6]:
# read data
import warnings

# Build the path from `workdir` instead of repeating the absolute prefix;
# the resulting path is identical to the previously hard-coded one.
dataset = read_feather(os.path.join(
    workdir,
    'data/annotation/batch-5/2-annotated/annotated_agreed_full_batch3_4_5.feather'))

# encode string labels to integers
# Keep the fitted encoder so integer predictions can be mapped back to the
# original label strings with `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()
dataset['intlabel'] = label_encoder.fit_transform(dataset['first_reason_type'])

# split dataset into train and test
# The split is positional (no shuffling): the head of the frame is used for
# training and the tail for testing.
train_size = 1500  # first X samples
test_size = 200  # last X samples

# If the frame is shorter than train_size + test_size, the head and tail
# slices share rows and the reported test accuracy is inflated by leakage.
if len(dataset) < train_size + test_size:
    warnings.warn(
        f'dataset has only {len(dataset)} rows: the first {train_size} and '
        f'last {test_size} rows overlap, so train and test sets share samples')

x_train = dataset['text'][:train_size]
x_test = dataset['text'][-test_size:]
t_train = dataset['intlabel'][:train_size]
t_test = dataset['intlabel'][-test_size:]

print(f'{x_train.shape=}, {x_test.shape=}, {t_train.shape=}, {t_test.shape=}')

# train model
def svc_pipleline():
    """Build an unfitted TF-IDF + RBF-kernel SVM text-classification pipeline.

    Returns:
        Pipeline: two steps, ``'tfidf_vector'`` (a ``TfidfVectorizer``)
        followed by ``'clf'`` (an ``SVC``). Call ``fit`` on raw text and
        labels before predicting.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 1),  # unigrams only
        analyzer='word',
        # The documented options are 'filename', 'file' and 'content'.
        # The previous value 'array' is not one of them and is rejected by
        # scikit-learn releases that validate parameters; 'content' means
        # the items passed to fit/transform are the texts themselves.
        input='content',
        norm='l2',
        max_features=None,
        min_df=1,
        sublinear_tf=True,
        stop_words='english',
    )
    classifier = SVC(
        C=10,
        kernel="rbf",
        gamma=0.1,
        # Enables predict_proba on the fitted classifier; per the SVC
        # documentation this slows down fit.
        probability=True,
        class_weight=None,
    )
    return Pipeline(
        [
            ('tfidf_vector', vectorizer),
            ('clf', classifier),
        ]
    )
                    
def print_metrics(pred_test, y_test, pred_train, y_train):
    """Print test accuracy, train accuracy and the SVM metrics banner.

    Accuracy for each split is the mean of the element-wise
    ``predictions == labels`` comparison.

    Args:
        pred_test: Predicted labels for the test samples.
        y_test: True labels for the test samples.
        pred_train: Predicted labels for the training samples.
        y_train: True labels for the training samples.

    Returns:
        None. Results are written to standard output.
    """
    splits = (
        ("test accuracy", pred_test, y_test),
        ("train accuracy", pred_train, y_train),
    )
    for caption, predicted, expected in splits:
        accuracy = np.mean(predicted == expected)
        print(caption, str(accuracy))

    # NOTE: only the banner is emitted; this function does not print a
    # confusion matrix or classification report.
    print("\n Metrics and Confusion for SVM \n")

# Build and fit the pipeline, then predict on both splits. Output produced
# inside the `with` block is captured in `captured` instead of being shown
# in the cell output.
with capture_output() as captured:
    svc_pipe = svc_pipleline()
    svc_pipe.fit(x_train, t_train)
    pred_test = svc_pipe.predict(x_test)
    pred_train = svc_pipe.predict(x_train)

# Report accuracy on the test and training samples (printed outside the
# capture block so it is visible).
print_metrics(pred_test, t_test, pred_train, t_train)

x_train.shape=(1500,), x_test.shape=(200,), t_train.shape=(1500,), t_test.shape=(200,)
test accuracy 0.77
train accuracy 0.9993333333333333

 Metrics and Confusion for SVM 

