In [22]:
import numpy as np
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import f1_score

In [23]:
# Accuracy metric
def accuracy_score(true_labels, predicted_labels):
    """Return the fraction of predictions that equal the true labels.

    Args:
        true_labels: Array-like of ground-truth labels.
        predicted_labels: Array-like of predicted labels, same shape as
            ``true_labels``.

    Returns:
        The mean of the element-wise equality test, i.e. a value in [0, 1]
        (``nan`` for empty input, as produced by ``ndarray.mean``).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert up front: comparing two plain lists with `==` yields a single
    # bool, which has no `.mean()`.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)

    # Refuse mismatched shapes instead of letting broadcasting (e.g. (n,) vs
    # (n, 1)) silently compare every label against every prediction.
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            "true_labels and predicted_labels must have the same shape, got "
            f"{true_labels.shape} and {predicted_labels.shape}"
        )

    return (true_labels == predicted_labels).mean()

In [8]:
def normalize_data(train_data, test_data, type=None):
    """Scale the train and test feature matrices with the chosen method.

    Args:
        train_data: Training features; the transformer is fitted on these only.
        test_data: Test features; transformed with the transformer fitted on
            ``train_data``.
        type: ``'standard'`` selects ``preprocessing.StandardScaler``,
            ``'l1'`` / ``'l2'`` select ``preprocessing.Normalizer`` with that
            norm. Any other value (including the default ``None``) disables
            scaling.

    Returns:
        A ``(train, test)`` tuple. When no scaling is selected, a notice is
        printed and the inputs are returned unchanged.
    """
    if type == 'standard':
        transformer = preprocessing.StandardScaler()
    elif type == 'l1':
        transformer = preprocessing.Normalizer(norm='l1')
    elif type == 'l2':
        transformer = preprocessing.Normalizer(norm='l2')
    else:
        # Unrecognised or missing method: hand the raw data straight back.
        print("No scaling was performed. Raw data is returned.")
        return (train_data, test_data)

    # Fit on the training split only, then apply the same transform to both.
    transformer.fit(train_data)
    train_scaled = transformer.transform(train_data)
    test_scaled = transformer.transform(test_data)
    return (train_scaled, test_scaled)

In [9]:
# load data
# The sentence arrays are loaded with allow_pickle=True (each entry is a
# Python list of tokens stored in an object array); the label arrays are
# loaded without it.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code — only use it on .npy files from a trusted source.
training_data = np.load('data/training_sentences.npy', allow_pickle=True)
training_labels = np.load('data/training_labels.npy')

test_data = np.load('data/test_sentences.npy', allow_pickle=True)
test_labels = np.load('data/test_labels.npy')

In [10]:
# Inspect the raw training sentences: an object array in which every entry is
# a list of word tokens.
training_data

array([list(['Probably', 'not', 'still', 'going', 'over', 'some', 'stuff', 'here']),
       list(['I', 'HAVE', 'A', 'DATE', 'ON', 'SUNDAY', 'WITH', 'WILL']),
       list(['Thanks', '4', 'your', 'continued', 'support', 'Your', 'question', 'this', 'week', 'will', 'enter', 'u', 'in2', 'our', 'draw', '4', 'Â£100', 'cash', 'Name', 'the', 'NEW', 'US', 'President', 'txt', 'ans', 'to', '80082']),
       ...,
       list(['OH', 'FUCK', 'JUSWOKE', 'UP', 'IN', 'A', 'BED', 'ON', 'A', 'BOATIN', 'THE', 'DOCKS', 'SLEPT', 'WID', '25', 'YEAR', 'OLD', 'SPINOUT', 'GIV', 'U', 'DA', 'GOSSIP', 'L8R', 'XXX']),
       list(['NOT', 'MUCH', 'NO', 'FIGHTS', 'IT', 'WAS', 'A', 'GOOD', 'NITE']),
       list(['Did', 'he', 'just', 'say', 'somebody', 'is', 'named', 'tampa'])],
      dtype=object)

In [11]:
class Bag_of_words:
    """Minimal bag-of-words vectorizer for pre-tokenized documents.

    Attributes:
        vocabulary: Maps each known word to its column index; indices are
            assigned in order of first appearance.
        words: The known words ordered by column index (a list until
            ``build_vocabulary`` is first called, a NumPy array afterwards).
        vocabulary_length: Number of known words, i.e. the feature width.
    """

    def __init__(self):
        self.vocabulary = {}
        self.words = []
        self.vocabulary_length = 0

    def build_vocabulary(self, data):
        """Add every unseen word in ``data`` to the vocabulary.

        May be called more than once; later calls extend the existing
        vocabulary and keep previously assigned indices.

        Args:
            data: Iterable of documents, each an iterable of word tokens.
        """
        # After a previous call self.words is an ndarray, which has no
        # append(); accumulate in a list and convert once at the end.
        words = list(self.words)

        for document in data:
            for word in document:
                # Tokens are used exactly as given (no case folding).
                if word not in self.vocabulary:
                    self.vocabulary[word] = len(self.vocabulary)
                    words.append(word)

        self.vocabulary_length = len(self.vocabulary)
        self.words = np.array(words)

    def get_features(self, data):
        """Return the word-count matrix for ``data``.

        Args:
            data: Sequence of documents, each an iterable of word tokens.

        Returns:
            A float array of shape ``(len(data), vocabulary_length)`` where
            entry ``[i, j]`` counts occurrences of word ``j`` in document
            ``i``. Words missing from the vocabulary are ignored.
        """
        features = np.zeros((len(data), self.vocabulary_length))

        for document_idx, document in enumerate(data):
            for word in document:
                column = self.vocabulary.get(word)
                # Index 0 is a valid column, so test against None explicitly.
                if column is not None:
                    features[document_idx, column] += 1
        return features

In [12]:
# Build the vocabulary from the training sentences only, so words that occur
# solely in the test set get no feature column.
bow_model = Bag_of_words()
bow_model.build_vocabulary(training_data) 

In [17]:
# Number of distinct tokens found in the training sentences (tokens are not
# lower-cased, so differently cased spellings count separately).
print(len(bow_model.vocabulary))

9522


In [18]:
# Turn both splits into word-count matrices using the same training vocabulary.
train_features = bow_model.get_features(training_data)
test_features = bow_model.get_features(test_data) 

In [19]:
# Sanity check: rows are documents, and both matrices must have one column per
# vocabulary word.
print(train_features.shape)
print(test_features.shape)

(3734, 9522)
(1840, 9522)


In [20]:
# Apply L2 normalisation (preprocessing.Normalizer(norm='l2')) to both splits.
scaled_train_data, scaled_test_data = normalize_data(train_features, test_features, type='l2')

In [24]:
# Train a linear-kernel SVM (C=10) on the scaled training features and
# evaluate it on the scaled test features.
svm_model = svm.SVC(C=10, kernel='linear')
svm_model.fit(scaled_train_data, training_labels)
predicted_labels_svm = svm_model.predict(scaled_test_data)
# accuracy_score returns a fraction; it is multiplied by 100 when printed.
model_accuracy_svm = accuracy_score(np.asarray(test_labels), predicted_labels_svm)
# NOTE(review): f1_score is called with its default arguments — presumably a
# binary F1 for the positive label 1; confirm against the label encoding.
print('f1 score', f1_score(np.asarray(test_labels), predicted_labels_svm))
print("SVM model accuracy: ", model_accuracy_svm * 100)

f1 score 0.9598393574297189
SVM model accuracy:  98.91304347826086
