# Датасет

In [None]:
# Copy SMS.tsv from ./drive/MyDrive (presumably a mounted Google Drive - confirm)
# into the working directory, where the loading cell reads it as 'SMS.tsv'.
!cp ./drive/MyDrive/SMS.tsv ./SMS.tsv

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')

# Number of features each selector in this notebook is asked to keep.
n_features = 30

# Tab-separated dataset: the 'class' column holds the label; the column at
# position 1 is what gets vectorized (presumably the message text - confirm).
df = pd.read_csv('SMS.tsv', sep='\t')

# Encode labels as +1 for 'ham' and -1 for every other value.
y = (df['class'] == 'ham').map({True: 1, False: -1})

# TF-IDF over terms that occur in at least 0.5% of the documents, with NLTK's
# English stop words removed.
vectorizer = TfidfVectorizer(min_df=0.005, stop_words=nltk.corpus.stopwords.words('english'))
features = vectorizer.fit_transform(df.iloc[:, 1])

# toarray() yields a plain ndarray; todense() would return the legacy
# numpy.matrix type.
X = pd.DataFrame(features.toarray(), columns=vectorizer.get_feature_names_out())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Display the TF-IDF feature matrix: one row per row of `df`, one column per
# vocabulary term kept by the vectorizer.
X

Unnamed: 0,10,100,1000,150p,150ppm,16,18,1st,2nd,50,...,would,www,xxx,ya,yeah,year,yes,yet,yo,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363053,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




# Реализации

## Losses

In [None]:
import numpy as np


def get_loss(loss_name):
    """Return the loss function registered under ``loss_name``.

    Recognised names are 'linear', 'logarithmic', 'square' and 'sigmoid'.

    Raises:
        Exception: if ``loss_name`` is none of the recognised names.
    """
    registry = (
        ('linear', linear_loss),
        ('logarithmic', logarithmic_loss),
        ('square', square_loss),
        ('sigmoid', sigmoid_loss),
    )
    # Equality comparison (not hashing) keeps the lookup semantics of a
    # literal-pattern match.
    for known_name, loss_function in registry:
        if loss_name == known_name:
            return loss_function
    raise Exception('Unknown loss specified: '
                    'linear, logarithmic, square or sigmoid are available.')


def get_loss_gradient(loss_name):
    """Return the gradient function registered under ``loss_name``.

    Recognised names are 'linear', 'logarithmic', 'square' and 'sigmoid'.

    Raises:
        Exception: if ``loss_name`` is none of the recognised names.
    """
    registry = (
        ('linear', linear_loss_gradient),
        ('logarithmic', logarithmic_loss_gradient),
        ('square', square_loss_gradient),
        ('sigmoid', sigmoid_loss_gradient),
    )
    # Equality comparison (not hashing) keeps the lookup semantics of a
    # literal-pattern match.
    for known_name, gradient_function in registry:
        if loss_name == known_name:
            return gradient_function
    raise Exception('Unknown loss specified: '
                    'linear, logarithmic, square or sigmoid are available.')


def linear_loss(X, y, w):
    """Mean of ``max(0, -margin)`` over all samples.

    The margin of sample ``i`` is ``y[i] * dot(X[i], w)``; rows of ``X`` and
    entries of ``y`` are paired by position.

    Args:
        X: 2-D feature table (DataFrame or array) with at least one row.
        y: Labels, one per row of ``X``.
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The loss averaged over the rows of ``X``.
    """
    features = np.asarray(X, dtype=float)
    # One matrix-vector product replaces the per-row Python loop.
    margins = np.asarray(y, dtype=float) * (features @ np.asarray(w, dtype=float))
    return np.maximum(0.0, -margins).sum() / features.shape[0]


def logarithmic_loss(X, y, w):
    """Mean of ``log2(1 + exp(-margin))`` over all samples.

    The margin of sample ``i`` is ``y[i] * dot(X[i], w)``; rows of ``X`` and
    entries of ``y`` are paired by position.

    Args:
        X: 2-D feature table (DataFrame or array) with at least one row.
        y: Labels, one per row of ``X``.
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The loss averaged over the rows of ``X``.
    """
    features = np.asarray(X, dtype=float)
    margins = np.asarray(y, dtype=float) * (features @ np.asarray(w, dtype=float))
    # log2(1 + exp(-m)) == logaddexp(0, -m) / ln(2). logaddexp never forms
    # exp(-m) directly, so large negative margins give a finite loss instead
    # of overflowing to inf.
    per_sample = np.logaddexp(0.0, -margins) / np.log(2.0)
    return per_sample.sum() / features.shape[0]


def square_loss(X, y, w):
    """Mean of ``(1 - margin) ** 2`` over all samples.

    The margin of sample ``i`` is ``y[i] * dot(X[i], w)``; rows of ``X`` and
    entries of ``y`` are paired by position.

    Args:
        X: 2-D feature table (DataFrame or array) with at least one row.
        y: Labels, one per row of ``X``.
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The loss averaged over the rows of ``X``.
    """
    features = np.asarray(X, dtype=float)
    # One matrix-vector product replaces the per-row Python loop.
    margins = np.asarray(y, dtype=float) * (features @ np.asarray(w, dtype=float))
    return np.square(1.0 - margins).sum() / features.shape[0]


def sigmoid_loss(X, y, w):
    """Mean of ``2 / (1 + exp(margin))`` over all samples.

    The margin of sample ``i`` is ``y[i] * dot(X[i], w)``; rows of ``X`` and
    entries of ``y`` are paired by position.

    Args:
        X: 2-D feature table (DataFrame or array) with at least one row.
        y: Labels, one per row of ``X``.
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The loss averaged over the rows of ``X``.
    """
    features = np.asarray(X, dtype=float)
    margins = np.asarray(y, dtype=float) * (features @ np.asarray(w, dtype=float))
    # 1 / (1 + exp(m)) == exp(-logaddexp(0, m)); this form cannot overflow
    # for large positive margins.
    per_sample = 2.0 * np.exp(-np.logaddexp(0.0, margins))
    return per_sample.sum() / features.shape[0]


def linear_loss_gradient(X, y, w):
    """Subgradient of the summed ``max(0, -margin)`` loss with respect to ``w``.

    Every sample whose margin ``y[i] * dot(X[i], w)`` is <= 0 contributes
    ``-y[i] * X[i]``. A zero margin counts as a violation, so descent can
    leave the all-zero starting point.

    NOTE: contributions are summed, not divided by the number of samples.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        y: Labels, one per row of ``X`` (paired by position).
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The gradient: a Series labelled by ``X.columns`` when ``X`` is a
        DataFrame (also when no sample violates), otherwise an ndarray.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)
    margins = labels * (features @ np.asarray(w, dtype=float))
    violating = margins <= 0
    gradient = -(labels[violating] @ features[violating])
    if isinstance(X, pd.DataFrame):
        return pd.Series(gradient, index=X.columns)
    return gradient


def logarithmic_loss_gradient(X, y, w):
    """Gradient of the summed logistic loss with respect to ``w``.

    Computes ``-sum_i y[i] * X[i] / (1 + exp(margin_i))`` with
    ``margin_i = y[i] * dot(X[i], w)``.

    NOTE: this is the derivative of ``ln(1 + exp(-margin))`` summed over the
    samples. ``logarithmic_loss`` itself uses ``log2`` and averages, so the
    two differ by a constant positive factor.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        y: Labels, one per row of ``X`` (paired by position).
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The gradient: a Series labelled by ``X.columns`` when ``X`` is a
        DataFrame, otherwise an ndarray.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)
    margins = labels * (features @ np.asarray(w, dtype=float))
    # 1 / (1 + exp(m)) written as exp(-logaddexp(0, m)) to avoid overflow.
    coefficients = labels * np.exp(-np.logaddexp(0.0, margins))
    gradient = -(coefficients @ features)
    if isinstance(X, pd.DataFrame):
        return pd.Series(gradient, index=X.columns)
    return gradient


def square_loss_gradient(X, y, w):
    """Gradient of the summed ``(1 - margin) ** 2`` loss with respect to ``w``.

    Computes ``-2 * sum_i y[i] * (1 - margin_i) * X[i]`` with
    ``margin_i = y[i] * dot(X[i], w)``.

    NOTE: contributions are summed, not divided by the number of samples.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        y: Labels, one per row of ``X`` (paired by position).
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The gradient: a Series labelled by ``X.columns`` when ``X`` is a
        DataFrame, otherwise an ndarray.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)
    margins = labels * (features @ np.asarray(w, dtype=float))
    coefficients = -2.0 * labels * (1.0 - margins)
    gradient = coefficients @ features
    if isinstance(X, pd.DataFrame):
        return pd.Series(gradient, index=X.columns)
    return gradient


def sigmoid_loss_gradient(X, y, w):
    """Gradient of the summed ``2 / (1 + exp(margin))`` loss with respect to ``w``.

    With ``s_i = 1 / (1 + exp(margin_i))`` and
    ``margin_i = y[i] * dot(X[i], w)``, computes
    ``-2 * sum_i y[i] * s_i * (1 - s_i) * X[i]``.

    NOTE: contributions are summed, not divided by the number of samples.

    Args:
        X: 2-D feature table (DataFrame or array), one row per sample.
        y: Labels, one per row of ``X`` (paired by position).
        w: Weight vector with one entry per column of ``X``.

    Returns:
        The gradient: a Series labelled by ``X.columns`` when ``X`` is a
        DataFrame, otherwise an ndarray.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y, dtype=float)
    margins = labels * (features @ np.asarray(w, dtype=float))
    # 1 / (1 + exp(m)) written as exp(-logaddexp(0, m)) to avoid overflow.
    sigmoid = np.exp(-np.logaddexp(0.0, margins))
    coefficients = -2.0 * labels * sigmoid * (1.0 - sigmoid)
    gradient = coefficients @ features
    if isinstance(X, pd.DataFrame):
        return pd.Series(gradient, index=X.columns)
    return gradient


## GDClassifier

In [None]:
from numpy import dot

class GDClassifier:
    """Linear classifier for labels in {+1, -1}, trained by full-batch gradient descent.

    Every epoch takes one step along the negative of
    ``loss_gradient(X, y, w) + l1 * sign(w) + 2 * l2 * w``, starting from
    all-zero weights.

    Attributes:
        loss: Loss function resolved from the ``loss`` name (stored; not
            called by ``fit`` or ``predict``).
        loss_gradient: Gradient function matching ``loss``.
        l1: L1 regularisation strength.
        l2: L2 regularisation strength.
        lr: Learning rate (step size).
        epochs: Number of gradient steps performed by ``fit``.
        w: Learned weights; ``None`` until ``fit`` has run.
    """

    # Class-level defaults; __init__ overrides all of them except ``w``.
    loss = None
    loss_gradient = None
    l1 = 0
    l2 = 0
    lr = 1e-4
    epochs = 1
    w = None

    def __init__(self, lr, epochs, loss='linear', l1=0, l2=0):
        """Configure the optimiser.

        Args:
            lr: Learning rate.
            epochs: Number of full-batch gradient steps.
            loss: Loss name understood by ``get_loss`` / ``get_loss_gradient``.
            l1: L1 regularisation strength.
            l2: L2 regularisation strength.
        """
        self.loss = get_loss(loss)
        self.loss_gradient = get_loss_gradient(loss)
        self.l1 = l1
        self.l2 = l2
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        """Learn the weights from ``X`` (one row per sample) and labels ``y``.

        After fitting, ``self.w`` is a Series labelled by ``X.columns`` when
        ``X`` is a DataFrame (also when ``epochs`` is 0), otherwise an ndarray.
        """
        weights = np.zeros(X.shape[1])
        for _ in range(self.epochs):
            # The loss gradient may come back as a labelled Series; strip the
            # labels so every step is plain positional arithmetic.
            data_gradient = np.asarray(self.loss_gradient(X, y, weights), dtype=float)
            gradient = data_gradient + self.l1 * np.sign(weights) + 2 * self.l2 * weights
            weights = weights - self.lr * gradient
        # Label the weights explicitly instead of relying on implicit
        # ndarray/Series coercion during the update.
        if isinstance(X, pd.DataFrame):
            self.w = pd.Series(weights, index=X.columns)
        else:
            self.w = weights

    def predict(self, X):
        """Return an array holding +1 where ``dot(X[i], w) >= 0`` and -1 elsewhere."""
        scores = np.asarray(X, dtype=float) @ np.asarray(self.w, dtype=float)
        return np.where(scores >= 0, 1, -1)

## Feature Selection

In [None]:
from sklearn.metrics import f1_score

class EmbeddedFeatureSelector:
    """Embedded feature selection: rank features by the absolute weight a
    ``GDClassifier`` assigns to them."""

    def __init__(self, lr, epochs, loss='linear', l1=0, l2=0):
        """Create the underlying ``GDClassifier``; all arguments are forwarded to it."""
        self.classifier = GDClassifier(lr, epochs, loss, l1, l2)

    def select_features(self, X, y, n_features):
        """Fit the classifier on ``(X, y)`` and return the top-weighted features.

        Args:
            X: DataFrame of features; its column labels name the features.
            y: Labels, one per row of ``X``.
            n_features: Number of feature names to return.

        Returns:
            Array with the labels of the ``n_features`` columns whose weights
            have the largest absolute value, in descending order.
        """
        self.classifier.fit(X, y)
        weights = self.classifier.w
        # The classifier may hand back a bare array; attach the column labels
        # so the ranking can always be reported by feature name.
        if not isinstance(weights, pd.Series):
            weights = pd.Series(np.asarray(weights, dtype=float), index=X.columns)
        feature_importance = weights.abs().sort_values(ascending=False)
        return feature_importance.iloc[:n_features].index.values


In [None]:
from sklearn.linear_model import SGDClassifier

class WrapperFeatureSelector:
    """Wrapper feature selection: greedy forward search scored by an
    ``SGDClassifier``."""

    def __init__(self, random_state=None):
        """Create the scoring classifier.

        Args:
            random_state: Forwarded to ``SGDClassifier``; pass an int to make
                the selection repeatable. The default ``None`` leaves the
                classifier unseeded.
        """
        self.classifier = SGDClassifier(random_state=random_state)

    def select_features(self, X, y, n_features):
        """Greedily pick up to ``n_features`` columns of ``X``.

        Each round tries every remaining column together with the ones already
        chosen, fits the classifier, and keeps the column giving the highest
        F1 score. The score is measured on the same ``(X, y)`` the classifier
        was fitted on. Ties go to the column that comes first in ``X``.

        Args:
            X: DataFrame of candidate features.
            y: Labels, one per row of ``X``.
            n_features: Number of features to select; capped at the number of
                columns in ``X``.

        Returns:
            List of selected column labels, in the order they were chosen.
        """
        # A list (not a set) keeps candidate order, and therefore tie-breaking,
        # independent of string hash randomisation.
        remaining = list(X.columns)
        selected_features = []
        for _ in range(min(n_features, len(remaining))):
            # F1 is never negative, so starting below zero guarantees that a
            # feature is chosen even when every candidate scores 0.
            best_quality = -1.0
            best_feature = None
            for feature in remaining:
                X_subset = X[selected_features + [feature]]
                self.classifier.fit(X_subset, y)
                quality = f1_score(y, self.classifier.predict(X_subset))
                if quality > best_quality:
                    best_quality = quality
                    best_feature = feature
            selected_features.append(best_feature)
            remaining.remove(best_feature)
        return selected_features


In [None]:
class FilterFeatureSelector:
    """Filter feature selection: rank features by their class-conditional variance."""

    def select_features(self, X, y, n_features):
        """Return the features with the smallest within-class variance.

        For each column the population variance ``E[x^2] - E[x]^2`` is
        computed separately over the rows with ``y == 1`` and over all other
        rows. The two values are averaged, weighted by class size. A class
        with no rows contributes nothing.

        Args:
            X: DataFrame of features with at least one row.
            y: Labels, one per row of ``X`` (paired by position).
            n_features: Number of feature names to return.

        Returns:
            Array with the labels of the ``n_features`` columns that have the
            lowest conditional variance, in ascending order.

        Raises:
            ValueError: if ``X`` has no rows.
        """
        values = X.to_numpy(dtype=float)
        size = values.shape[0]
        if size == 0:
            raise ValueError('X must contain at least one sample.')

        is_positive = np.asarray(y) == 1
        weighted_variance = np.zeros(values.shape[1])
        for class_mask in (is_positive, ~is_positive):
            class_size = int(class_mask.sum())
            if class_size == 0:
                # An absent class has no variance to contribute.
                continue
            class_values = values[class_mask]
            mean = class_values.sum(axis=0) / class_size
            mean_of_squares = np.square(class_values).sum(axis=0) / class_size
            weighted_variance += (mean_of_squares - mean ** 2) * class_size

        conditional_variance = pd.Series(weighted_variance / size, index=X.columns)
        return conditional_variance.sort_values().iloc[:n_features].index.values

## Примеры работы

In [None]:
# Embedded selection: fit the hand-written gradient-descent classifier with its
# default 'linear' loss and keep the n_features largest absolute weights.
selector = EmbeddedFeatureSelector(epochs=100, lr=0.001)
embedded_features = selector.select_features(X, y, n_features=n_features)
print(embedded_features)

['gt' 'lt' 'da' 'lor' '150p' 'later' 'ok' 'uk' 'come' 'www' '500' 'wat'
 'gonna' 'way' 'yeah' '18' 'remember' 'lol' 'home' 'claim' 'co' 'ask'
 'yup' '1000' 'happy' 'anything' 'said' 'dear' 'min' 'told']


In [None]:
# Wrapper selection: greedy forward search, one feature added per round.
selector = WrapperFeatureSelector()
wrapper_features = selector.select_features(X, y, n_features=n_features)
print(wrapper_features)

['txt', 'claim', 'mobile', 'www', 'service', '150p', '16', '50', 'video', 'landline', 'ringtone', 'uk', 'rate', 'code', 'prize', 'nokia', '500', 'gt', 'ill', '18', 'apply', 'car', 'work', 'pa', 'something', 'afternoon', 'better', 'probably', 'fine', 'leh']


In [None]:
# Filter selection: keep the n_features columns with the lowest
# class-conditional variance.
selector = FilterFeatureSelector()
filter_features = selector.select_features(X, y, n_features=n_features)
print(filter_features)

['1st' '150ppm' 'network' 'rate' 'landline' 'texts' '16' 'apply' 'cs'
 'video' '1000' 'po' 'orange' '500' 'selected' 'guaranteed' 'awarded'
 'box' 'latest' 'camera' 'leh' 'kiss' 'easy' '2nd' 'everything' 'offer'
 'afternoon' '18' '100' 'code']


# Библиотечные методы

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

# Library embedded method: rank features by decision-tree importance.
# threshold=-np.inf makes max_features the only selection criterion, so exactly
# n_features columns are kept; with the default threshold max_features is only
# an upper bound. A fixed random_state makes the fitted tree repeatable.
selector = SelectFromModel(
    estimator=DecisionTreeClassifier(random_state=0),
    threshold=-np.inf,
    max_features=n_features,
)
selector.fit(X, y)
sklearn_embedded_features = selector.get_feature_names_out()
print(sklearn_embedded_features)

['150p' '18' '50' 'call' 'chat' 'claim' 'da' 'free' 'get' 'give' 'got'
 'gt' 'home' 'min' 'mobile' 'new' 'nokia' 'real' 'ringtone' 'service'
 'sms' 'stop' 'tell' 'text' 'txt' 'uk' 'us' 'win' 'work' 'www']


In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

# Library wrapper method: recursive feature elimination around a linear SVM.
# fit() returns the fitted selector, so construction and fitting are chained.
selector = RFE(
    estimator=LinearSVC(dual='auto'),
    n_features_to_select=n_features,
).fit(X, y)
sklearn_wrapper_features = selector.get_feature_names_out()
print(sklearn_wrapper_features)

['100' '1000' '150p' '16' '18' '50' '500' 'apply' 'awarded' 'chat' 'claim'
 'code' 'friends' 'gt' 'landline' 'latest' 'mobile' 'nokia' 'orange' 'per'
 'prize' 'rate' 'ringtone' 'service' 'tone' 'txt' 'uk' 'urgent' 'video'
 'www']


In [None]:
from sklearn.feature_selection import SelectKBest

# Library filter method: keep the k best-scoring features under SelectKBest's
# default scoring function. fit() returns the fitted selector.
selector = SelectKBest(k=n_features).fit(X, y)
sklearn_filter_features = selector.get_feature_names_out()
print(sklearn_filter_features)

['100' '1000' '150p' '150ppm' '16' '18' '50' '500' 'awarded' 'call' 'cash'
 'claim' 'co' 'contact' 'cs' 'free' 'guaranteed' 'mobile' 'nokia' 'prize'
 'reply' 'service' 'stop' 'text' 'tone' 'txt' 'uk' 'urgent' 'win' 'www']


# Сравнение

In [None]:
# For each family of methods, the features chosen by both the hand-written
# selector and its scikit-learn counterpart.
embedded_intersection = set(embedded_features) & set(sklearn_embedded_features)
wrapper_intersection = set(wrapper_features) & set(sklearn_wrapper_features)
filter_intersection = set(filter_features) & set(sklearn_filter_features)

# Report the overlap size and members, one family at a time.
print(f'Embedded methods - {len(embedded_intersection)} matched:\n{embedded_intersection=}\n')
print(f'Wrapper methods - {len(wrapper_intersection)} matched:\n{wrapper_intersection=}\n')
print(f'Filter methods - {len(filter_intersection)} matched:\n{filter_intersection=}\n')

Embedded methods - 9 matched:
embedded_intersection={'150p', 'www', 'min', 'home', 'gt', 'da', 'uk', '18', 'claim'}

Wrapper methods - 20 matched:
wrapper_intersection={'50', 'ringtone', 'prize', 'apply', 'www', 'service', 'rate', 'uk', '150p', 'landline', '500', 'video', 'txt', 'nokia', 'mobile', 'gt', 'code', '16', '18', 'claim'}

Filter methods - 9 matched:
filter_intersection={'cs', 'guaranteed', '500', '100', 'awarded', '16', '18', '1000', '150ppm'}



# Изменение качества

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the samples used for training; the rest is held out for testing.
train_size = 0.8

# Feature subsets produced by the three hand-written selectors.
# NOTE(review): the selectors were fitted on the full (X, y), so the held-out
# rows already influenced which features were chosen - confirm this is acceptable.
selected_features = {'embedded': embedded_features, 'wrapper': wrapper_features, 'filter': filter_features}

# A fixed random_state makes the split, and therefore the scores in the cells
# that use it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=42)

In [None]:
# F1 of a linear SGD classifier on all features, then on each selected subset.
# A fixed random_state makes the reported scores repeatable.
classifier = SGDClassifier(random_state=0)
classifier.fit(X_train, y_train)
print(f'Before:  {f1_score(y_test, classifier.predict(X_test))}')
# The loop variable is named `subset_columns` so it does not overwrite the
# notebook-level `features` (the TF-IDF matrix).
for method, subset_columns in selected_features.items():
    X_subset_train = X_train[subset_columns]
    X_subset_test = X_test[subset_columns]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method}: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before:  0.9886714727085479
After embedded: 0.9627791563275434
After wrapper: 0.9831546707503829
After filter: 0.9635182408795602


In [None]:
# F1 of a decision tree on all features, then on each selected subset.
# A fixed random_state makes the reported scores repeatable.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)
print(f'Before:  {f1_score(y_test, classifier.predict(X_test))}')
# The loop variable is named `subset_columns` so it does not overwrite the
# notebook-level `features` (the TF-IDF matrix).
for method, subset_columns in selected_features.items():
    X_subset_train = X_train[subset_columns]
    X_subset_test = X_test[subset_columns]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method}: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before:  0.9808587687532333
After embedded: 0.9627051218299354
After wrapper: 0.9811704834605598
After filter: 0.9639278557114228


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# F1 of a k-nearest-neighbours classifier on all features, then on each
# selected subset.
classifier = KNeighborsClassifier(n_jobs=1)
classifier.fit(X_train, y_train)
print(f'Before:  {f1_score(y_test, classifier.predict(X_test))}')
# The loop variable is named `subset_columns` so it does not overwrite the
# notebook-level `features` (the TF-IDF matrix).
for method, subset_columns in selected_features.items():
    X_subset_train = X_train[subset_columns]
    X_subset_test = X_test[subset_columns]
    classifier.fit(X_subset_train, y_train)
    print(f'After {method}: {f1_score(y_test, classifier.predict(X_subset_test))}')

Before:  0.9762985375693394
After embedded: 0.9637357178340785
After wrapper: 0.9831546707503829
After filter: 0.9634085213032582
