In [None]:
# Duomenų rinkinys

In [15]:
# ----- Dataset 1 ("dataset-spam.csv") (nemodifikuota) -----
import pandas as pd

df = pd.read_csv("./dataset-spam.csv", encoding="latin1")

# Žiūrime bendrą duomenų rinkinio statistiką
print("Bendra statistika", 'value', ":")
print(df['value'].value_counts())

# Tikriname tuščias reikšmes
print("\nTuščios reikšmės:")
print(df.isnull().sum())

# Duomenys:
# Viso: 5572
# Ham: 4825
# Spam: 747
# Reikalingas modelio efektyvumas: > 86.6%
# Komentaras: value ("ham" arba "spam")
# (tuščių reikšmių nėra)

Bendra statistika value :
value
ham     4825
spam     747
Name: count, dtype: int64

Tuščios reikšmės:
value      0
message    0
dtype: int64


In [None]:
# Papildomi duomenys ("length" ir "punct_count") geresniam modeliui

In [16]:
import re
punctuation_pattern = r'[!"#$%&\'()*+,-./:;<=>?@[\\\]^_`{|}~]'

df['length'] = df['message'].astype(str).apply(len)
df['punct_count'] = df['message'].apply(lambda x: len(re.findall(punctuation_pattern, x)))

# atkomentuoti kai prireiks išsaugoti duomenis į failą
# df.to_csv("./dataset-spam-updated.csv", index=False, encoding="latin1")

In [17]:
# Modelio treniravimas

In [18]:
from sklearn.model_selection import train_test_split

X = df[['message','length', 'punct_count']]
Y = df['value']

X_train, X_test, Y_train, Y_test = train_test_split(X,Y)

# print('Mokymo duomenys: ', X_train.shape)
# print('Testavimo duomenys: ', X_test.shape)

In [19]:
# "Atraminių vektorių klasifikatorius"

In [61]:
import scipy.sparse as sp
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
X_train_message = vectorizer.fit_transform(X_train['message'])
X_test_message = vectorizer.transform(X_test['message'])

X_train_combined = sp.hstack([X_train_message, sp.csr_matrix(X_train[['length', 'punct_count']].values)])
X_test_combined = sp.hstack([X_test_message, sp.csr_matrix(X_test[['length', 'punct_count']].values)])

svc_model = SVC(gamma="scale")
svc_model.fit(X_train_combined, Y_train)

predictions = svc_model.predict(X_test_combined)
accuracy = metrics.accuracy_score(Y_test, predictions)

print(f"Modelio tikslumas: {accuracy:.3f}%") #TODO ant tikslumo reikės padirbėti

Modelio tikslumas: 0.863%


In [62]:
# Spam ir Ham aptikimo funkcija

def find_ham_or_spam(message):
    
    message_vector = vectorizer.transform([message])
    message_length = len(message)
    message_punct_count = len(re.findall(punctuation_pattern, message))

    vectorized_message_values = sp.hstack([
        message_vector,
        sp.csr_matrix([[message_length, message_punct_count]])
    ])

    print(svc_model.predict(vectorized_message_values))

In [None]:
# Žinučių testavimas

In [63]:
text = "Free a..      84912#!@#!@#^ @3 i3$%n 2 a wkly co..,mp to win Fasd././1#!@#A Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's!@#!@#!@#######################!!!!!!!!!!!!!!!!!!!!!!@#!@#%%%%%%%%%%%%!@#$!@#$"

find_ham_or_spam(text)

['ham']
