In [2]:
import pandas as pd
import numpy as np
import string
import nltk
import itertools
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
pd.set_option('future.no_silent_downcasting', True)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jtadych/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jtadych/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
spam_dataset = pd.read_csv('spam.csv', encoding="ISO-8859-1", usecols=[0, 1], names=['Spam', 'Text'], skiprows=1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])


In [4]:
def remove_punctation(text):
    return ''.join([word for word in text if word not in string.punctuation])
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_punctation(x))

In [5]:
def tokenize(text):

    # Usunięcie wielkich liter
    clean_text = text.lower()

    # Tokenizacja
    tokenized_text = nltk.word_tokenize(clean_text)
    return tokenized_text
spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))

In [6]:
stopwords = nltk.corpus.stopwords.words("english")
def remove_stopwords(text):
    return [word for word in text if word not in stopwords]
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))


In [7]:
stemmer = nltk.PorterStemmer()
def stemming(text):
    stemmed_words = [stemmer.stem(word) for word in text]
    return stemmed_words
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: stemming(x))


In [8]:
lemmater = nltk.WordNetLemmatizer()
def lemmatizing(text):
    lemmatized_words = [lemmater.lemmatize(word) for word in text]
    return lemmatized_words
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lambda x: lemmatizing(x))

In [9]:
X = spam_dataset['Lemmatized_Text']
y = spam_dataset['Spam']
# Convert continuous labels to discrete classes
y = [int(label) for label in y]

In [10]:
# Funkcja: lista tokenów -> tekst
text_joiner = FunctionTransformer(lambda x: [' '.join(tokens) for tokens in x], validate=False)


In [11]:
# Pipeline bazowy
pipeline = Pipeline([
    ('join', text_joiner),
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

# Trenuj pierwszy model
pipeline.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [78]:
# Wyciągnięcie cech i ich ważności
vectorizer = pipeline.named_steps['tfidf']
rf = pipeline.named_steps['rf']
features = vectorizer.get_feature_names_out()
importances = rf.feature_importances_

importance_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values(by='importance', ascending=False)

In [79]:
# Selekcja cech
selected_features = importance_df[importance_df['importance'] > 0.001]['feature'].tolist()
print("Wybrane cechy:", selected_features)

Wybrane cechy: ['txt', 'call', 'free', 'claim', 'mobile', 'service', 'stop', 'prize', 'text', 'reply', 'urgent', 'tone', 'win', '500', 'customer', 'tc', '18', 'guaranteed', 'nokia', '1000', 'contact', 'voucher', '16', 'new', 'cash', 'ringtone', 'uk', 'rate', 'awarded', '100', 'landline', '150', 'code', 'collection', 'line', '800', '150ppm', '150p', 'per', 'mob', 'orange', 'dating', '150pmsg', 'video', 'offer', '86688', 'chat', 'draw', 'tscs', 'latest', 'message', 'min', '250', 'club', 'receive', 'private', '2000', 'apply', '5000', 'cost', 'collect', 'eg', 'winner', 'please', 'camera', 'weekly', 'send', 'content', 'sm', 'unsubscribe', 'chance', 'box', 'user', '0800', 'ur', 'identifier', 'pound', 'auction', 'await', '08000930705', 'ltd', 'delivery', 'pobox', 'credit', 'selected', 'attempt', 'po', 'holiday', 'bonus', 'operator', 'expires', 'std', 'opt', '750', 'poly', 'wap', 'wkly', '62468', 'mobileupd8', 'msg', 'award', 'sex', 'im', 'waiting', 'live', 'tried', '1st', 'quiz', 'land', 'awa

In [80]:
# Nowy pipeline z ograniczonym słownikiem TF-IDF
pipeline_selected = Pipeline([
    ('join', text_joiner),
    ('tfidf', TfidfVectorizer(vocabulary=selected_features)),
    ('rf', RandomForestClassifier(random_state=42))
])

In [83]:
# GridSearchCV
param_grid = {
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [None, 5],
    'rf__min_samples_split': [2, 4],
}

grid = GridSearchCV(pipeline_selected, param_grid, cv=2, scoring='accuracy')
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'rf__max_depth': [None, 5], 'rf__min_samples_split': [2, 4], 'rf__n_estimators': [50, 100]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,2
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,func,<function <la...t 0x177034860>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,4
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [84]:
# Ocena
print("Najlepsze parametry:", grid.best_params_)
y_pred = grid.predict(X_test)
print("Raport klasyfikacji:\n", classification_report(y_test, y_pred))

Najlepsze parametry: {'rf__max_depth': None, 'rf__min_samples_split': 4, 'rf__n_estimators': 100}
Raport klasyfikacji:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       966
           1       0.96      0.88      0.92       149

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

