In [1]:
import numpy as np
import pandas as pd
import string
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import itertools
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#Przygotowanie danych tekstowych#

In [2]:
spam_dataset = pd.read_csv('/content/spam.csv', encoding = 'ISO-8859-1', usecols = [0,1], names = ['Spam', 'Text'], skiprows = 1)
spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])
spam_dataset.head(5)

  spam_dataset['Spam'] = spam_dataset['Spam'].replace(['ham', 'spam'], [0, 1])


Unnamed: 0,Spam,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print(spam_dataset['Spam'].value_counts(normalize = True))

Spam
0    0.865937
1    0.134063
Name: proportion, dtype: float64


#Usuwanie znaków interpunkcyjnych#

In [4]:
def remove_puncation(text):
  cleaned = ''.join([word for word in text if word not in string.punctuation])
  return cleaned

spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(lambda x: remove_puncation(x))
spam_dataset.head(5)

Unnamed: 0,Spam,Text,Cleaned_Text
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


#Tokenizacja#

In [5]:
def tokenize(text):
  #usuniecie wielkich liter
  clean_text = text.lower()
  #Tokenizacja
  tokenized_text = nltk.word_tokenize(clean_text)
  return tokenized_text

spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(lambda x: tokenize(x))
spam_dataset.head(5)

Unnamed: 0,Spam,Text,Cleaned_Text,Tokenized_Text
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


#Usuwanie stopwords#

In [6]:
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
  without_stopwords = [word for word in text if word not in stopwords]
  return without_stopwords

spam_dataset['Without_Stopwords'] = spam_dataset['Tokenized_Text'].apply(lambda x: remove_stopwords(x))
spam_dataset.head(5)

Unnamed: 0,Spam,Text,Cleaned_Text,Tokenized_Text,Without_Stopwords
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


#Lemantyzacja#

In [7]:
lemmater = nltk.WordNetLemmatizer()

def lemmatizing(text):
  lemmatized_words = [lemmater.lemmatize(word) for word in text]
  return lemmatized_words

spam_dataset['Lemmatized_Text'] = spam_dataset['Without_Stopwords'].apply(lambda x: lemmatizing(x))
spam_dataset.head(5)

Unnamed: 0,Spam,Text,Cleaned_Text,Tokenized_Text,Without_Stopwords,Lemmatized_Text
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t...","[nah, dont, think, go, usf, life, around, though]"


#Wektoryzacja z najwazniejszymi cechami#

In [30]:
tidf = TfidfVectorizer(max_df = 0.5, min_df = 2)
X = tidf.fit_transform(spam_dataset['Lemmatized_Text'].apply(lambda x: ''.join(x)))
y = spam_dataset['Spam']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

In [35]:
clf = RandomForestClassifier(max_depth = 2, random_state = 0)
clf.fit(X_train, y_train)

In [38]:
print(clf.score(X_train, y_train))

0.865897435897436


In [39]:
print(clf.score(X_test, y_test))

0.8660287081339713


In [None]:
# Pobranie feature importance
importances = clf.feature_importances_

# Pobranie nazw cech
feature_names = tidf.get_feature_names_out()

# Tworzenie DataFrame z cechami i ich znaczeniem
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [26]:
selected_features = feature_importance_df[feature_importance_df["Importance"] > 0.001]["Feature"].tolist()
selected_features

['call',
 'txt',
 'free',
 'claim',
 'mobile',
 'service',
 'prize',
 'reply',
 'win',
 'text',
 'stop',
 'cash',
 'urgent',
 'new',
 'per',
 'ur',
 'contact',
 'send',
 'get',
 'show',
 'please',
 'message',
 'week',
 'min',
 'im',
 'day',
 'ill',
 'find',
 'go',
 'next',
 'phone',
 'later',
 'msg',
 'want',
 'today',
 'see',
 'got',
 'number',
 'year',
 'time',
 'every',
 'ltgt',
 'back',
 'dont',
 'like',
 'pls',
 'ok',
 'sorry',
 'hi',
 'know',
 'wan',
 'dear',
 'one',
 'take',
 'come',
 'give',
 'well',
 'love',
 'na',
 'sent',
 'friend',
 'tell',
 'pick',
 'care',
 'hey',
 'good',
 'around',
 'need',
 'night',
 'make',
 'money',
 'still',
 'also',
 'cant',
 'let',
 'think',
 'work',
 'great',
 'babe',
 'hope',
 'thing',
 'tomorrow',
 'going',
 'yes',
 'home',
 'ive',
 'sure',
 'miss',
 'wait',
 'way',
 'much',
 'place']

In [27]:
pipeline2 = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary = selected_features, min_df = 0.02, max_df = 0.9)),
    ('clf', RandomForestClassifier())
])

list(pipeline2.get_params().keys())

['memory',
 'steps',
 'transform_input',
 'verbose',
 'tfidf',
 'clf',
 'tfidf__analyzer',
 'tfidf__binary',
 'tfidf__decode_error',
 'tfidf__dtype',
 'tfidf__encoding',
 'tfidf__input',
 'tfidf__lowercase',
 'tfidf__max_df',
 'tfidf__max_features',
 'tfidf__min_df',
 'tfidf__ngram_range',
 'tfidf__norm',
 'tfidf__preprocessor',
 'tfidf__smooth_idf',
 'tfidf__stop_words',
 'tfidf__strip_accents',
 'tfidf__sublinear_tf',
 'tfidf__token_pattern',
 'tfidf__tokenizer',
 'tfidf__use_idf',
 'tfidf__vocabulary',
 'clf__bootstrap',
 'clf__ccp_alpha',
 'clf__class_weight',
 'clf__criterion',
 'clf__max_depth',
 'clf__max_features',
 'clf__max_leaf_nodes',
 'clf__max_samples',
 'clf__min_impurity_decrease',
 'clf__min_samples_leaf',
 'clf__min_samples_split',
 'clf__min_weight_fraction_leaf',
 'clf__monotonic_cst',
 'clf__n_estimators',
 'clf__n_jobs',
 'clf__oob_score',
 'clf__random_state',
 'clf__verbose',
 'clf__warm_start']

In [43]:
X = spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x))
y = spam_dataset['Spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

params = {
    'tfidf__max_df' : [0.5, 0.75, 1.0],
    'tfidf__min_df' : [1, 2, 3],
    'tfidf__ngram_range' : [(1,1), (1,2), (1,3)],
    'clf__max_depth' : [2, 3, 4],
    'clf__n_estimators' : [10, 50, 100],
    'clf__min_samples_leaf' : [2, 3, 4 , 5]
}

grid_search = GridSearchCV(pipeline_tfidf, params, cv=3, verbose=1, n_jobs=-1, error_score='raise')
grid_search.fit(X_train, y_train)

print('Wybrane hiperparametry: ', grid_search.best_params_)

Model1 = grid_search.best_estimator_

Fitting 3 folds for each of 972 candidates, totalling 2916 fits
Wybrane hiperparametry:  {'clf__max_depth': 4, 'clf__min_samples_leaf': 2, 'clf__n_estimators': 10, 'tfidf__max_df': 1.0, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1)}


In [45]:
print(classification_report(y_test, Model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94       966
           1       1.00      0.11      0.20       149

    accuracy                           0.88      1115
   macro avg       0.94      0.56      0.57      1115
weighted avg       0.90      0.88      0.84      1115

