# Подготовка датасета

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

n_features = 30
df = pd.read_csv('SMS.tsv', sep='\t')
y = df['class'].apply(lambda y: 1 if y == 'ham' else -1)
vectorizer = TfidfVectorizer(min_df=0.005, stop_words=nltk.corpus.stopwords.words('english'))
features = vectorizer.fit_transform(df.iloc[:, 1])
X = pd.DataFrame(features.todense(), columns=vectorizer.get_feature_names_out())

# Собственные реализации

## Встроенный метод выбора признаков

In [6]:
from feature_extraction import EmbeddedFeatureSelector

selector = EmbeddedFeatureSelector(lr=0.001, epochs=1000)
embedded_features = selector.select_features(X, y, n_features)
print(embedded_features)

['gt' 'lt' 'da' 'lor' '150p' 'later' 'uk' 'www' '500' 'wat' 'gonna'
 'claim' 'yeah' 'remember' '18' 'lol' 'co' 'ask' 'yup' '1000' 'anything'
 'said' 'told' 'lunch' 'sure' 'something' 'prize' 'morning' 'tone' 'amp']


## Метод-обёртка выбора признаков

In [7]:
from feature_extraction import WrapperFeatureSelector

selector = WrapperFeatureSelector()
wrapper_features = selector.select_features(X, y, n_features)
print(wrapper_features)

['txt', 'claim', 'mobile', 'www', 'service', '150p', '16', 'video', '50', 'landline', 'ringtone', 'prize', 'code', 'nokia', 'rate', 'uk', '500', 'lt', '18', 'problem', 'apply', 'cos', 'free', 'got', 'pay', 'line', 'min', 'done', 'quite', 'evening']


## Метод-фильтр выбора признаков

In [8]:
from feature_extraction import FilterFeatureSelector

selector = FilterFeatureSelector()
filter_features = selector.select_features(X, y, n_features)
print(filter_features)

['1st' '150ppm' 'network' 'rate' 'landline' 'texts' '16' 'apply' 'cs'
 'video' '1000' 'po' 'orange' '500' 'selected' 'guaranteed' 'awarded'
 'box' 'latest' 'camera' 'leh' 'kiss' 'easy' '2nd' 'everything' 'offer'
 'afternoon' '18' '100' 'code']


# Библиотечные реализации

## Встроенный метод выбора признаков

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

selector = SelectFromModel(estimator=DecisionTreeClassifier(), max_features=n_features)
selector.fit(X, y)
sklearn_embedded_features = selector.get_feature_names_out()
print(sklearn_embedded_features)

['150p' '18' '50' 'call' 'chat' 'claim' 'da' 'free' 'get' 'got' 'home'
 'lt' 'min' 'mobile' 'new' 'nokia' 'real' 'ringtone' 'service' 'sms'
 'stop' 'sure' 'tell' 'text' 'tv' 'txt' 'uk' 'win' 'work' 'www']


## Метод-обёртка выбора признаков

In [14]:
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

selector = RFE(estimator=LinearSVC(dual='auto'), n_features_to_select=n_features)
selector.fit(X, y)
sklearn_wrapper_features = selector.get_feature_names_out()
print(sklearn_wrapper_features)

['100' '1000' '150p' '16' '18' '50' '500' 'apply' 'awarded' 'chat' 'claim'
 'code' 'friends' 'gt' 'landline' 'latest' 'mobile' 'nokia' 'orange' 'per'
 'prize' 'rate' 'ringtone' 'service' 'tone' 'txt' 'uk' 'urgent' 'video'
 'www']


## Метод-фильтр выбора признаков

In [15]:
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(k=n_features)
selector.fit(X, y)
sklearn_filter_features = selector.get_feature_names_out()
print(sklearn_filter_features)

['100' '1000' '150p' '150ppm' '16' '18' '50' '500' 'awarded' 'call' 'cash'
 'claim' 'co' 'contact' 'cs' 'free' 'guaranteed' 'mobile' 'nokia' 'prize'
 'reply' 'service' 'stop' 'text' 'tone' 'txt' 'uk' 'urgent' 'win' 'www']


# Сравнение

In [20]:
embedded_intersection = set(embedded_features) & set(sklearn_embedded_features)
wrapper_intersection = set(wrapper_features) & set(sklearn_wrapper_features)
filter_intersection = set(filter_features) & set(sklearn_filter_features)
print(f'Embedded methods coincidence ({len(embedded_intersection)} matched):\n{embedded_intersection}\n')
print(f'Wrapper methods coincidence: ({len(wrapper_intersection)} matched):\n{wrapper_intersection}\n')
print(f'Filter methods coincidence ({len(filter_intersection)} matched):\n{filter_intersection}\n')

Embedded methods coincidence (8 matched):
{'da', 'lt', 'claim', 'www', 'uk', 'sure', '18', '150p'}

Wrapper methods coincidence: (19 matched):
{'code', '16', 'claim', 'www', 'rate', '500', 'landline', 'video', 'nokia', 'prize', 'txt', 'uk', '50', '18', 'mobile', 'apply', 'service', 'ringtone', '150p'}

Filter methods coincidence (9 matched):
{'guaranteed', '16', 'cs', '100', '1000', '500', '150ppm', 'awarded', '18'}


# Изменение качества работы классификаторов

In [36]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

train_size = 0.8
selected_features = {'embedded': embedded_features, 'wrapper': wrapper_features, 'filter': filter_features}
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)

## SGDClassifier

In [37]:
from sklearn.linear_model import SGDClassifier

classifier = SGDClassifier()
classifier.fit(X_train, y_train)
print('Before selection: ', f1_score(y_test, classifier.predict(X_test)))
for (method, features) in selected_features.items():
    X_subset_train = X_train[features]
    X_subset_test = X_test[features]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method} selection: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before selection:  0.9837611314824516
After embedded selection: 0.9530065689742294
After wrapper selection: 0.9750260145681582
After filter selection: 0.9496695475343162


## DecisionTreeClassifier

In [39]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
print('Before selection: ', f1_score(y_test, classifier.predict(X_test)))
for (method, features) in selected_features.items():
    X_subset_train = X_train[features]
    X_subset_test = X_test[features]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method} selection: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before selection:  0.9738493723849372
After embedded selection: 0.9549822964087
After wrapper selection: 0.9755080771235018
After filter selection: 0.9527198779867819


## KNeighborsClassifier

In [43]:
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_jobs=1)
classifier.fit(X_train, y_train)
print('Before selection: ', f1_score(y_test, classifier.predict(X_test)))
for (method, features) in selected_features.items():
    X_subset_train = X_train[features]
    X_subset_test = X_test[features]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method} selection: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before selection:  0.9661538461538461
After embedded selection: 0.9549822964087
After wrapper selection: 0.9734789391575663
After filter selection: 0.9515058703420113
