# Natural Language Processing ü§ñ

## Case 1: Text Mining

a. Aplikasi yang dibangun harus memanfaatkan sejumlah teknik pemrosesan teks, 
termasuk tokenisasi, penghapusan kata umum (stop words), stemming atau 
lemmatize, POS (Part-of-Speech) Tagging, NER (Named Entity Recognition), 
distribusi frekuensi, pengambilan korpora dari data NLTK atau situs web, 
pemanfaatan WordNet, ekstraksi fitur, klasifikasi menggunakan metode Na√Øve 
Bayes, serta kemampuan untuk menyimpan dan memuat model klasifikasi.
<br><br>
b. Model yang dihasilkan dari arsitektur Na√Øve Bayes harus mencapai tingkat akurasi 
minimal sebesar 80%.
<br><br>
c. Aplikasi juga diharapkan mampu menampilkan 5 most informative features dari 
dataset yang digunakan.

## 0. Import Library

In [1]:
import pandas as pd

import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk import ne_chunk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import spacy
import joblib
import contractions
import string
import re


## 1. Download Necessary NLTK Datasets

In [29]:
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')
# nltk.download('maxent_ne_chunker_tab')

## 2. Load Dataset

In [2]:
# Load dataset

df = pd.read_csv('SMS_train.csv', encoding='ISO-8859-1')
print(df.head())

   S. No.                                       Message_body     Label
0       1                         Rofl. Its true to its name  Non-Spam
1       2  The guy did some bitching but I acted like i'd...  Non-Spam
2       3  Pity, * was in mood for that. So...any other s...  Non-Spam
3       4               Will √º b going to esplanade fr home?  Non-Spam
4       5  This is the 2nd time we have tried 2 contact u...      Spam


In [3]:
# Cek apakah ada NaN atau None dalam kolom Message_body

df['Message_body'].isnull().sum()  # Akan memberi tahu jika ada nilai yang hilang


# Jika ada, bisa menghapus baris yang mengandung NaN

df = df.dropna(subset=['Message_body'])

## 3. Preprocessing Text (Stopwords, Stemming, and Lemmatizing)

In [4]:
# Function for Preprocess

def preprocess(text):

    # Tokenisasi

    tokens = word_tokenize(text.lower())  # Mengubah teks menjadi huruf kecil dan tokenisasi

    # Menghapus tanda baca

    tokens = [word for word in tokens if word.isalpha()]

    # Stopwords

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if word.isalpha()]

    # Stemming
    
    stemmer = PorterStemmer()
    s = [stemmer.stem(word) for word in tokens]

    # Lemmatizing

    lemmatizer = WordNetLemmatizer()

    l = [lemmatizer.lemmatize(word) for word in tokens]

    # return " ".join(tokens)
    return " ".join(s + l)


In [5]:
# Apply preprocessing

df['processed_text'] = df['Message_body'].apply(preprocess)

## 4. Naive Bayes Classification

In [6]:
# Feature extraction using Bag of Words model

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_text'])

# Target variable

y = df['Label'].apply(lambda x: 1 if x == 'Spam' else 0)  # 1 = Spam, 0 = Non-Spam


# Split dataset into training and testing sets


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Initialize and train Naive Bayes classifier


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [7]:
# Predict on test data

y_pred = nb_classifier.predict(X_test)

# Evaluate model

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 89.58%


## 5. Conclusion (5 Informative Features)

In [8]:
# Library

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Misalkan df['processed_text'] berisi teks yang sudah diproses dan df['Label'] berisi label (Spam/Non-Spam)


X = df['processed_text']
y = df['Label']


# Split data menjadi train dan test


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Vectorisasi data teks


vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)


# Membuat dan melatih model Naive Bayes


nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)


# Menghitung akurasi


y_pred = nb_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi: {accuracy*100:.2f}%\n")


# Menampilkan 5 fitur paling informatif


feature_names = vectorizer.get_feature_names_out()
sorted_coef_index = nb_classifier.feature_log_prob_[1].argsort()  # Menggunakan feature_log_prob_ untuk MultinomialNB

top_features = [(feature_names[i], nb_classifier.feature_log_prob_[1][i]) for i in sorted_coef_index[:5]]


# Printing


print("5 Fitur Paling Informatif:\n")

# print(top_features)

for feature in top_features:
    print(feature)


Akurasi: 94.10%

5 Fitur Paling Informatif:

('aah', -8.411388132519262)
('peach', -8.411388132519262)
('paul', -8.411388132519262)
('patrick', -8.411388132519262)
('patient', -8.411388132519262)


### <b>Explanation</b>

Akurasi = (Jumlah Prediksi Benar) / (Jumlah Total Prediksi)

#### Struktur output:

**[('aah', -8.411388132519262), ('peach', -8.411388132519262) ...]**:
- Setiap item dalam list tersebut adalah sebuah tuple yang berisi **(fitur, koefisien)**.
- **Fitur** adalah kata atau istilah yang ditemukan dalam teks, yang berperan dalam pembuatan prediksi. Dalam hal ini, fitur adalah kata-kata seperti `'aah'`, `'patrick'`, `'patient'`, dsb.
- **Koefisien** adalah nilai yang menunjukkan **seberapa besar kontribusi fitur tersebut** terhadap prediksi. Koefisien ini diperoleh dari model Naive Bayes.

Koefisien yang negatif menunjukkan bahwa kata tersebut lebih sering muncul dalam pesan-pesan Spam dibandingkan dengan pesan-pesan Non-Spam.

## 6. Save the Model

In [9]:
# Save the model to a file

joblib.dump(nb_classifier, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [10]:
# To load the model and vectorizer later

model = joblib.load('spam_classifier_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

In [11]:
print(df['processed_text'][:5])

0                        rofl true name rofl true name
1    guy bitch act like interest buy someth els nex...
2               piti mood suggest pity mood suggestion
3    √º b go esplanad fr home √º b going esplanade fr...
4    time tri contact u pound prize claim easi call...
Name: processed_text, dtype: object


## 7. Frequency Distribution

In [12]:
def preprocess_nlp(sentences):

    preprocessed_words = []
    preprocessed_sentences = []
    
    # Tokenizing
    # Membagi teks ke dalam suatu token (word, symbol)

    for sentence in sentences:
        
        sentence = contractions.fix(sentence.replace("\n", ". ").replace(". .", '. '))
        
        # Make all lowercase

        sentence = sentence.lower()
        
        list_sentences = sent_tokenize(sentence)
        list_word = word_tokenize(sentence)
        
        # Remove unneccessary symbol

        list_word = [re.sub(r"[^a-zA-Z0-9\s/.]", '', word) for word in list_word]
        list_word = list(filter(None, list_word))
        
        # Stop words

        eng_stopwords = stopwords.words("english")
        list_word = [word for word in list_word if word.lower() not in eng_stopwords]
        
        # Remove punctuation

        list_word = [word for word in list_word if word not in string.punctuation]
        

        # Stemming

        porter_stemmer = PorterStemmer()
        snowball_stemmer = SnowballStemmer("english")
        lancaster_stemmer = LancasterStemmer()
        
        list_word = [lancaster_stemmer.stem(word) for word in list_word]
        

        # Lemmatizing

        wnl = WordNetLemmatizer()
        # list_word = [wnl.lemmatize(word, pos='r') for word in list_word]
        # print(list_word)
        

        # POS Tagging

        tagged = pos_tag(list_word)

        # print(tagged)
        
        # Named Entity Recognition

        ner = ne_chunk(tagged)
            
        for w in list_word:
            preprocessed_words.append(w)
        
        for s in list_sentences:
            preprocessed_sentences.append(s)
    return preprocessed_sentences, preprocessed_words


#### Simple Notes

- PorterStemmer() - Algoritma lambat, akurasi bagus
- SnowballStemmer("english") - PorterStemmer V2 (dalam beberapa bahasa)
- LancasterStemmer() - Algoritmanya cepat, akurasi kurang bagus

In [13]:
df['processed_text']

0                          rofl true name rofl true name
1      guy bitch act like interest buy someth els nex...
2                 piti mood suggest pity mood suggestion
3      √º b go esplanad fr home √º b going esplanade fr...
4      time tri contact u pound prize claim easi call...
                             ...                        
952    how favourit person today r u workin hard coul...
953                     much got clean much got cleaning
954    sorri da gone mad mani pend work sorry da gone...
955                  wat time √º finish wat time √º finish
956                               glad talk glad talking
Name: processed_text, Length: 957, dtype: object

In [14]:
# Frequency Distribution

ps1, pw1 = preprocess_nlp(df['processed_text'][:10]) # Hanya mengambil 10 row, karena prosesnya agak lambat
print("Senteces and Words:")
print(ps1)
print(pw1)


Senteces and Words:
['rofl true name rofl true name', 'guy bitch act like interest buy someth els next week gave us free guy bitching acted like interested buying something else next week gave you free', 'piti mood suggest pity mood suggestion', '√º b go esplanad fr home √º b going esplanade fr home', 'time tri contact you pound prize claim easi call per minut time tried contact you pound prize claim easy call per minute', 'remind get pound free call credit detail great offer pl repli text valid name hous postcod reminder get pound free call credit detail great offer pls reply text valid name house postcode', 'huh lei huh lei', 'wait least wednesday see get wait least wednesday see get', 'ard like dat lor ard like dat lor', 'ok lor soni ericsson salesman ask shuhui say quit gd use consid ok lor sony ericsson salesman ask shuhui say quite gd use considering']
['rofl', 'tru', 'nam', 'rofl', 'tru', 'nam', 'guy', 'bitch', 'act', 'lik', 'interest', 'buy', 'some', 'el', 'next', 'week', 'gav'

In [15]:
fd1 = FreqDist(pw1)
print(f"Frequency Distribution (word, frequency):")
for word, count in fd1.most_common(10):
    print(f"{word}: {count}")

Frequency Distribution (word, frequency):
nam: 4
lik: 4
fre: 4
pound: 4
cal: 4
get: 4
lor: 4
us: 3
rofl: 2
tru: 2


## 8. NER, POS Tagging, and WordNet

In [16]:
# Load spaCy model for NER

nlp = spacy.load("en_core_web_sm")


# NER (Named Entity Recognition) menggunakan spaCy


def perform_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities


# POS Tagging menggunakan NLTK


def perform_pos_tagging(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens


# WordNet untuk sinonim dan antonim


def get_wordnet_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())  # Menambahkan sinonim
    return list(synonyms)


# Menampilkan hasil NER, POS Tagging dan WordNet untuk beberapa contoh kalimat

print("NER, POS Tagging, and WordNet Results:\n")

for sentence in df['processed_text'][20:25]:

    print(f"\nAnalyzing: {sentence}")
    
    # NER (Named Entity Recognition)

    entities = perform_ner(sentence)
    print("\nNamed Entities:", entities)
    print("")
    
    # POS Tagging

    pos_tags = perform_pos_tagging(sentence)
    print("POS Tags:", pos_tags)
    
    # WordNet (Synonyms)

    print("\nWORDNET\n")

    words = sentence.split()
    for word in words:
        synonyms = get_wordnet_synonyms(word)
        if synonyms:
            print(f"Synonyms for '{word}': {synonyms}")
    
    print("\n" + "-"*50 + "\n")

NER, POS Tagging, and WordNet Results:


Analyzing: wen get spiritu deep great wen get spiritual deep great

Named Entities: [('wen', 'PERSON')]

POS Tags: [('wen', 'NN'), ('get', 'VB'), ('spiritu', 'JJ'), ('deep', 'JJ'), ('great', 'JJ'), ('wen', 'JJ'), ('get', 'VB'), ('spiritual', 'JJ'), ('deep', 'JJ'), ('great', 'JJ')]

WORDNET

Synonyms for 'wen': ['steatocystoma', 'sebaceous_cyst', 'pilar_cyst', 'wen']
Synonyms for 'get': ['fetch', 'nonplus', 'dumbfound', 'induce', 'grow', 'sustain', 'suffer', 'mother', 'incur', 'buzz_off', 'flummox', 'convey', 'acquire', 'pay_off', 'have', 'pose', 'set_out', 'father', 'contract', 'scram', 'generate', 'stick', 'obtain', 'aim', 'stupefy', 'pay_back', 'beat', 'let', 'start', 'engender', 'develop', "get_under_one's_skin", 'beget', 'baffle', 'come', 'make', 'mystify', 'vex', 'catch', 'start_out', 'cause', 'bring', 'find', 'amaze', 'bring_forth', 'set_about', 'experience', 'puzzle', 'commence', 'take', 'fix', 'stimulate', 'capture', 'drive', 'perplex', 

### Case 1 Criteria

- 1#Classification with Naive Bayes: Accuracy already more than 80% (Done)
- 1#Frequency Distribution: Done
- 1#NER: Done
- 1#POS Tagging: Done
- 1#Stemming and Lemmatizing: Done
- 1#Text Preprocessing: Done
- 1#WordNet: Done

<code>Made by: NW25-1</code>