In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import itertools
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
pd.set_option('future.no_silent_downcasting', True)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jtadych/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
spam_dataset = pd.read_csv('spam.csv', encoding="ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'], skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])

In [8]:
def remove_punctation(text):
    return ''.join([word for word in text if word not in string.punctuation])
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_punctation(x))

def remove_punctation(text):
    return ''.join([word for word in text if word not in string.punctuation])
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_punctation(x))

def tokenize(text):

    # Usunięcie wielkich liter
    clean_text = text.lower()

    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text

spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

stopwords = nltk.corpus.stopwords.words("english")
def remove_stopwords(text):
    without_stopwords = [word for word in text if word not in stopwords]
    return without_stopwords
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))

stemmer = nltk.PorterStemmer()
def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))

lemmater = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmater.lemmatize(word) for word in text]
    return lemmatized_words
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: lemmatizing(x))

In [9]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x)))
print(X.shape)
y = spam_dataset['Spam']
print(y.shape)

(5572, 8843)
(5572,)


In [23]:
from sklearn.model_selection import train_test_split
# Podział na trening/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)
y_test = y_test.astype(int)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Convert continuous labels to discrete classes
y_train = [int(label) for label in y_train]

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.8660594695954996

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
y_pred = clf.predict(X_test)


In [26]:
# Metryki klasyfikacji
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Wyświetlenie metryk
print("Metryki klasyfikacji:")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1-score  : {f1:.4f}")


Metryki klasyfikacji:
Accuracy  : 0.8657
Precision : 0.0000
Recall    : 0.0000
F1-score  : 0.0000


In [27]:
# Macierz pomyłek
cm = confusion_matrix(y_test, y_pred)
print(" Macierz pomyłek: ")
print(cm)


🧩 Macierz pomyłek:
[[1592    0]
 [ 247    0]]
