# **IMPLEMENTASI PROGRAM**

## **Dataset**: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

## Load Dataset

In [None]:
import csv

import matplotlib.pyplot as plt             #visualisation
import numpy as np
import pandas as pd
import seaborn as sns                       #visualisation
from google.colab import drive
%matplotlib inline     
sns.set(color_codes=True)

# Mount Google Drive so the dataset file stored there is readable from Colab.
drive.mount('/content/drive')
file_url = '/content/drive/My Drive/Colab Notebooks/SMSSpamCollection.txt'

# The file is read as tab-separated text with no header row: the label goes
# into "category" and the message into "text".
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary
# characters. The SMS dump is raw text, not CSV-quoted; with the default
# quoting a message that opens with an unbalanced '"' is parsed as a quoted
# field and swallows the following lines into a single row.
messages = pd.read_csv(
    file_url,
    sep='\t',
    header=None,
    names=["category", "text"],
    quoting=csv.QUOTE_NONE,
)
messages.head()

Mounted at /content/drive


Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## PRE-PROCESSING

### Stopword Removal

In [None]:
import string
import nltk
# Fetch the NLTK resources used by the pre-processing step: the stopword
# corpus and the Punkt models that nltk.tokenize.word_tokenize() loads.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the 'punkt_tab' package
# rather than the pickled 'punkt' one; fetching both keeps word_tokenize()
# working on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt_tab')

stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

# Quick look at what will be filtered out during pre-processing.
print(stopwords[:5])
print(punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### Case Folding

In [None]:
def pre_process(text):
    """Turn a raw message into a list of lowercase, stopword-free tokens.

    Steps, in order: drop every character found in ``punctuation``,
    lowercase what is left, tokenize with NLTK's ``word_tokenize``, then
    discard tokens listed in ``stopwords``.

    Note: punctuation is stripped before the stopword filter runs, so a
    contraction such as "don't" reaches the filter as "dont".

    Args:
        text: The message as a string.

    Returns:
        A list of token strings.
    """
    # Character-level pass: punctuation removal plus case folding.
    kept_chars = []
    for char in text:
        if char in punctuation:
            continue
        kept_chars.append(char.lower())
    cleaned = "".join(kept_chars)

    tokens = nltk.tokenize.word_tokenize(cleaned)
    return [token for token in tokens if token not in stopwords]

# Add a column holding the pre-processed token list of every message.
messages['processed'] = messages['text'].apply(pre_process)

print(messages['processed'].head())

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: processed, dtype: object


In [None]:
def categorize_words():
    """Collect every processed token of the corpus, grouped by label.

    Reads the module-level ``messages`` frame and flattens the token lists in
    its ``processed`` column, once for rows labelled 'spam' and once for rows
    labelled 'ham'. Duplicates are kept, so each list holds one entry per
    token occurrence.

    Returns:
        A ``(spam_words, ham_words)`` tuple of flat token lists.
    """
    def _tokens_for(label):
        # Rows of the requested class, flattened in row order.
        selected = messages['processed'][messages['category'] == label]
        return [token for tokens in selected for token in tokens]

    return _tokens_for('spam'), _tokens_for('ham')

# Build the per-class token lists once; predict() reads them as globals.
spam_words, ham_words = categorize_words()

# Peek at the first few tokens of each class as a sanity check.
print(spam_words[:5])
print(ham_words[:5])

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


### Predict Function

In [None]:
def predict(text):
    """Print a spam/ham verdict for an already pre-processed message.

    Every token is looked up in the module-level ``spam_words`` and
    ``ham_words`` lists and its occurrences are summed per class. The class
    with the larger total wins; equal totals (including no match at all)
    give the "might be spam" message.

    The percentage printed is the winning class's share of all matched
    occurrences, rounded to an integer. It is not a model accuracy.

    Args:
        text: Iterable of tokens, e.g. the list returned by ``pre_process``.

    Returns:
        None. The verdict is only printed.
    """
    tokens = list(text)
    spam_hits = sum(spam_words.count(token) for token in tokens)
    ham_hits = sum(ham_words.count(token) for token in tokens)

    print('***HASIL***')

    # Tie: the message could equally be spam or ham.
    if ham_hits == spam_hits:
        print('Pesan mungkin merupakan spam')
        return

    if ham_hits > spam_hits:
        # Message is ham.
        template, winning_hits = 'Pesan bukan spam, dengan kepastian {}% ', ham_hits
    else:
        # Message is spam.
        template, winning_hits = 'Pesan adalah spam, dengan kepastian {}% ', spam_hits

    share = round(winning_hits / (ham_hits + spam_hits) * 100)
    print(template.format(share))

In [None]:
import re

In [None]:
# Slang / informal words to look for.
slang_words = ['wassup', 'fo shizzle', 'totes', 'yolo']

# Compile a single alternation pattern from the slang list.
#   * re.escape() keeps every entry literal, even if a future entry contains
#     a regex metacharacter.
#   * The \b anchors restrict matches to whole words, so 'totes' no longer
#     matches inside a longer word.
#   * re.IGNORECASE lets capitalised input ('Wassup') match, since the text
#     passed to the pattern is not lowercased first.
pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in slang_words) + r')\b',
    re.IGNORECASE,
)

# Berfungsi untuk mengklasifikasikan teks sebagai spam atau bukan spam
def classify_text(text):
    """Label a text as 'ham' when it contains a known slang word.

    The text is searched with the module-level ``pattern``.

    Args:
        text: The string to inspect.

    Returns:
        'ham' if the pattern matches anywhere in ``text``; otherwise None,
        meaning this rule makes no decision.
    """
    if pattern.search(text) is None:
        return None
    # Slang was found: label the text as ham.
    return 'ham'

In [None]:
# Smoke test: the sample text contains the slang word 'wassup'.
print(classify_text('Hey, wassup?'))

ham


### Testing

In [None]:
# Ask for a message to classify (blocks until the user submits text).
user_input = input("Masukkan teks yang akan di uji : \n")
# Pre-process the input before prediction, using the same pre_process()
# pipeline that built the 'processed' column.
processed_input = pre_process(user_input)

# Prints the verdict; predict() returns nothing.
predict(processed_input)