In [152]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np

In [133]:
def normalize_data(train_data, test_data, type=None):
    """Apply the same normalisation to the train and test feature matrices.

    Parameters
    ----------
    train_data, test_data :
        Feature matrices; they are handed unchanged to
        ``sklearn.preprocessing.normalize`` when a norm is requested.
    type : {None, 'l1', 'l2', 'max'}, optional
        Norm forwarded to ``normalize``. ``None`` (the default) returns both
        inputs untouched.

    Returns
    -------
    tuple
        ``(train, test)`` — the normalised matrices, or the original objects
        when ``type`` is ``None``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values. Previously an unknown
        value fell through and the function silently returned ``None``.
    """
    # Identity comparison: `== None` can be hijacked by objects overriding __eq__.
    if type is None:
        return (train_data, test_data)

    norme_acceptate = ('l1', 'l2', 'max')
    if type not in norme_acceptate:
        raise ValueError(
            "Unsupported normalization type %r; expected None or one of %r"
            % (type, norme_acceptate)
        )

    return (normalize(train_data, norm=type), normalize(test_data, norm=type))


In [126]:
class BagOfWords:
    """Bag-of-words vocabulary built from tokenised texts.

    Attributes
    ----------
    data :
        The texts passed to the constructor.
    dict_id_cuvinte : dict
        Maps each distinct (whitespace-stripped) word to its column index,
        assigned in order of first appearance.
    dict_frecv_cuvinte : dict
        Maps each distinct word to its total number of occurrences in ``data``.
    """

    def __init__(self, data):
        """Build the vocabulary from ``data``, an iterable of word sequences."""
        self.data = data
        self.dict_id_cuvinte = dict()
        self.dict_frecv_cuvinte = dict()
        for text in data:
            for cuvant in text:
                cuvant = cuvant.strip()
                if cuvant not in self.dict_id_cuvinte:
                    # The next free index is the current vocabulary size.
                    self.dict_id_cuvinte[cuvant] = len(self.dict_id_cuvinte)
                    self.dict_frecv_cuvinte[cuvant] = 0
                self.dict_frecv_cuvinte[cuvant] += 1

    def get_features(self, data):
        """Return one row of word counts per text in ``data``.

        Each row has one entry per vocabulary word, at the column given by
        ``dict_id_cuvinte``. Words not in the vocabulary are ignored.

        Returns
        -------
        list of list of int
        """
        nr_cuvinte = len(self.dict_id_cuvinte)
        matrice = []
        for text in data:
            linie = [0] * nr_cuvinte
            for cuvant in text:
                # Strip exactly as __init__ does; otherwise a token with
                # surrounding whitespace never matches its vocabulary entry.
                index = self.dict_id_cuvinte.get(cuvant.strip())
                if index is not None:
                    linie[index] += 1
            matrice.append(linie)
        return matrice
        

In [149]:
# Load the training and test sentences and labels from .npy files, resolved
# relative to the current working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only load files from a trusted source.
train_data = np.load("training_sentences.npy", allow_pickle=True)
train_labels = np.load("training_labels.npy", allow_pickle=True)
test_data = np.load("test_sentences.npy", allow_pickle=True)
test_labels = np.load("test_labels.npy", allow_pickle=True)

In [128]:
# Build the vocabulary from the training sentences only, then display how many
# distinct words it contains.
bag = BagOfWords(train_data)
len(bag.dict_id_cuvinte)

9522

In [129]:
# Turn both splits into word-count rows using the same `bag` vocabulary, so the
# train and test matrices share one column layout.
bow_train_data = bag.get_features(train_data)
bow_test_data = bag.get_features(test_data)

In [132]:
# Apply L2 normalisation to the train and test count matrices.
norm_bow_train_data, norm_bow_test_data = normalize_data(bow_train_data, bow_test_data, type ='l2')

In [158]:
# Regularisation parameter for the linear SVM. It is passed to the model by
# name so that editing this value actually changes the fitted classifier
# (previously the constructor hard-coded 1 and ignored this variable).
C = 1
model = SVC(C=C, kernel='linear')
model.fit(norm_bow_train_data, train_labels)

In [159]:
# Predict a label for every row of the normalised test features.
predictions = model.predict(norm_bow_test_data)

In [160]:
# Accuracy of the predictions against the test labels; the bare last line
# displays the value as the cell output.
score = accuracy_score(test_labels, predictions)
score

0.9842391304347826

In [172]:
# F1 score of the predictions against the test labels.
# NOTE(review): the recorded output for this cell is
# "TypeError: 'numpy.float64' object is not callable". The only call here is
# f1_score(...), so the name f1_score was presumably rebound to a float earlier
# in that kernel session. Restart the kernel and run all cells to restore the
# imported function — TODO confirm.
f1_scor = f1_score(test_labels, predictions)
f1_scor

TypeError: 'numpy.float64' object is not callable

In [186]:
# Pair each coefficient of the linear SVM with its vocabulary word and sort the
# pairs by coefficient, ascending, then show the ten smallest.
# NOTE(review): max_coef and index_coef are assigned here but not read anywhere
# in the visible notebook — candidates for removal if no other cell uses them.
max_coef = -1000
index_coef = -1
# First row of the coefficient matrix, converted to a plain Python list.
lista_1 = model.coef_.tolist()[0]
# Vocabulary words in dict insertion order, i.e. in order of their assigned ids.
lista_2 = list(bag.dict_id_cuvinte)
# zip stops at the shorter input, so a length mismatch would pass silently.
feature_zip = zip(lista_1, lista_2)
# Sort on the coefficient only; ties keep their vocabulary order (stable sort).
feature_zip = sorted(feature_zip, key= lambda x : x[0])
feature_zip[:10]

[(-1.5205677595408342, '&lt#&gt'),
 (-1.1548610245925364, 'me'),
 (-1.0480471686435435, 'i'),
 (-0.9129849947306246, 'Going'),
 (-0.9103059947230119, 'him'),
 (-0.9081877355610026, 'Ok'),
 (-0.9012561474618785, 'I'),
 (-0.8841189959109963, 'Ill'),
 (-0.8469436292175161, 'my'),
 (-0.8446178462312, 'Im')]

In [187]:
# Last ten entries of the list sorted ascending by coefficient, i.e. the ten
# largest coefficients.
feature_zip[-10:]

[(1.6860166576147766, 'Text'),
 (1.719852017272695, 'To'),
 (1.822243704854643, 'mobile'),
 (1.8359081155999426, 'CALL'),
 (1.9378176049163187, 'FREE'),
 (2.0004656327762125, 'txt'),
 (2.0473772767542147, '&'),
 (2.282199698507226, 'Call'),
 (2.5104356003217836, 'Txt'),
 (2.5109795106236437, 'STOP')]