## Penjelasan Dataset yang digunakan

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Data Understanding

In [4]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Semester_7/NLP/spam.csv'
try:
    df = pd.read_csv(file_path, encoding='latin-1')
except FileNotFoundError:
    print("File tidak ditemukan. Pastikan path file sudah benar.")
    df = pd.read_csv('spam.csv', encoding='latin-1')
    print("File berhasil dibaca.")
df = df.iloc[:, [0, 1]]
df.columns = ['Label', 'Teks']
print("5 baris pertama data:")
print(df.head())
print("\nInformasi Data:")
df.info()
df['Label'] = df['Label'].map({'ham': 0, 'spam': 1})
print("\nDistribusi Label:")
print(df['Label'].value_counts())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File tidak ditemukan. Pastikan path file sudah benar.
File berhasil dibaca.
5 baris pertama data:
  Label                                               Teks
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Informasi Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Teks    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

Distribusi Label:
Label
0    4825
1     747
Name: count, dtype: int64


## Data Text Processing

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)
df['Teks_Bersih'] = df['Teks'].apply(preprocess_text)
print("\nContoh Teks Setelah Pra-pemrosesan:")
print(df[['Teks', 'Teks_Bersih', 'Label']].head())


Contoh Teks Setelah Pra-pemrosesan:
                                                Teks  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                         Teks_Bersih  Label  
0  go jurong point crazy available bugis n great ...      0  
1                            ok lar joking wif u oni      0  
2  free entry 2 wkly comp win fa cup final tkts 2...      1  
3                u dun say early hor u c already say      0  
4        nah dont think goes usf lives around though      0  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return " ".join(words)
df['Teks_Bersih'] = df['Teks'].apply(preprocess_text)
print("\nContoh Teks Setelah Pra-pemrosesan:")
print(df[['Teks', 'Teks_Bersih', 'Label']].head())


Contoh Teks Setelah Pra-pemrosesan:
                                                Teks  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                         Teks_Bersih  Label  
0  go jurong point crazy available bugis n great ...      0  
1                            ok lar joking wif u oni      0  
2  free entry 2 wkly comp win fa cup final tkts 2...      1  
3                u dun say early hor u c already say      0  
4        nah dont think goes usf lives around though      0  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Modeling

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = df['Teks_Bersih']
y = df['Label']
X_transformed = tfidf_vectorizer.fit_transform(X)
print("\nBentuk Matriks Fitur (Baris, Kolom):", X_transformed.shape)


Bentuk Matriks Fitur (Baris, Kolom): (5572, 5000)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42
)

model = MultinomialNB()
model.fit(X_train, y_train)

print("\nModel Naive Bayes berhasil dilatih.")


Model Naive Bayes berhasil dilatih.


## Data Evaluasi

In [10]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"\n✅ Akurasi Model: {accuracy*100:.2f}%")
print("\nMatriks Kebingungan (Confusion Matrix):")
print(conf_matrix)
print("\nLaporan Klasifikasi:")
print(class_report)


✅ Akurasi Model: 97.85%

Matriks Kebingungan (Confusion Matrix):
[[966   0]
 [ 24 125]]

Laporan Klasifikasi:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

