---

Perform text preprocessing, remove unnecessary information

---

In [1]:
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

In [2]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import SnowballStemmer

import nltk
nltk.download('stopwords')

df = pd.read_table('SMS.tsv')

df = df.sample(n=1000, random_state=0)

def preprocess_text(text):
    text = re.sub(regex, '', text)

    text = re.sub(r'\d+', '', text)

    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation))

    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    stemmer = SnowballStemmer('english')
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    return text

df['text'] = df['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(df['text'])

print(X.shape)

df.head(10)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rusla\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


(1000, 500)


Unnamed: 0,class,text
4456,ham,storm msg wen u lift phne u say hello u knw wt...
690,spam,forward pleas call immedi urgent messag wait
944,ham,also ive sorta blown coupl time recent id rath...
3768,ham,sir goodmorn free call
1189,ham,come alivebett correct good look figur
4437,ham,housemaid murder coz man murder ltgt th januar...
3587,spam,hot n horni will live local text repli hear st...
1982,ham,sorri ill call later meet thing relat trade pl...
2038,ham,oh sorri pleas
2078,ham,hey hunonbus goin meet want go meal donyt feel...


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])
y = df['class']

df.head(10)

Unnamed: 0,class,text
4456,0,storm msg wen u lift phne u say hello u knw wt...
690,1,forward pleas call immedi urgent messag wait
944,0,also ive sorta blown coupl time recent id rath...
3768,0,sir goodmorn free call
1189,0,come alivebett correct good look figur
4437,0,housemaid murder coz man murder ltgt th januar...
3587,1,hot n horni will live local text repli hear st...
1982,0,sorri ill call later meet thing relat trade pl...
2038,0,oh sorri pleas
2078,0,hey hunonbus goin meet want go meal donyt feel...


---

Embedded FE

RidgeCV

---

In [4]:
from sklearn.linear_model import RidgeCV

ridge = RidgeCV()
ridge.fit(X, y)

feature_coefs = zip(vectorizer.get_feature_names_out(), ridge.coef_)

sorted_features_embedded = sorted(feature_coefs, key=lambda x: abs(x[1]), reverse=True)

for feature, coef in sorted_features_embedded[:30]:
    print(f"Feature: {feature:20} Coefficient: {coef:.5f}")

Feature: txt                  Coefficient: 1.04421
Feature: servic               Coefficient: 0.72743
Feature: claim                Coefficient: 0.57399
Feature: contact              Coefficient: 0.57327
Feature: mobil                Coefficient: 0.56827
Feature: latest               Coefficient: 0.54384
Feature: win                  Coefficient: 0.54348
Feature: secret               Coefficient: 0.51059
Feature: text                 Coefficient: 0.49402
Feature: award                Coefficient: 0.46896
Feature: nokia                Coefficient: 0.46248
Feature: call                 Coefficient: 0.45917
Feature: uk                   Coefficient: 0.45722
Feature: free                 Coefficient: 0.44202
Feature: stop                 Coefficient: 0.43524
Feature: voucher              Coefficient: 0.43345
Feature: tone                 Coefficient: 0.42937
Feature: new                  Coefficient: 0.41546
Feature: custom               Coefficient: 0.41165
Feature: rate                 C

In [8]:
import numpy as np

def add_intercept(X):
    intercept = np.ones((X.shape[0], 1))
    return np.concatenate((intercept, X.toarray()), axis=1)

def ridge_regression(X, y, alpha):
    XtX = X.T.dot(X)
    XtX_alpha = XtX + alpha * np.identity(XtX.shape[0])
    XtX_inv = np.linalg.inv(XtX_alpha)
    Xty = X.T.dot(y)
    w = XtX_inv.dot(Xty)
    return w[1:], w[0]

In [9]:
alpha = 1.0

coefficients, intercept = ridge_regression(add_intercept(X), y, alpha)

feature_coefs = zip(vectorizer.get_feature_names_out(), coefficients)

sorted_features_embedded = sorted(feature_coefs, key=lambda x: abs(x[1]), reverse=True)

for feature, coef in sorted_features_embedded[:30]:
    print(f"Feature: {feature:20} Coefficient: {coef:.5f}")

print(f"Intercept: {intercept:.5f}")

Feature: txt                  Coefficient: 1.04442
Feature: servic               Coefficient: 0.72760
Feature: claim                Coefficient: 0.57412
Feature: contact              Coefficient: 0.57334
Feature: mobil                Coefficient: 0.56845
Feature: latest               Coefficient: 0.54394
Feature: win                  Coefficient: 0.54371
Feature: secret               Coefficient: 0.51062
Feature: text                 Coefficient: 0.49421
Feature: award                Coefficient: 0.46912
Feature: nokia                Coefficient: 0.46255
Feature: call                 Coefficient: 0.45953
Feature: uk                   Coefficient: 0.45725
Feature: free                 Coefficient: 0.44219
Feature: stop                 Coefficient: 0.43546
Feature: voucher              Coefficient: 0.43354
Feature: tone                 Coefficient: 0.42948
Feature: new                  Coefficient: 0.41562
Feature: custom               Coefficient: 0.41183
Feature: rate                 C

---

Wrapper FE

Backward elimination

---

In [10]:
import statsmodels.api as sm

def backward_elimination(data, target, significance_level=0.05):
    features = data.columns.tolist()

    while len(features) > 0:
        features_with_constant = sm.add_constant(data[features])

        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        
        if p_values.max() >= significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
            print(f'Len of features {len(features)}, removing {excluded_feature}')
        else:
            break

    return features

In [11]:
df_vectorized = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())

backward_selected_features = backward_elimination(df_vectorized, list(y))

sorted_backward_features = sorted(backward_selected_features, key=lambda x: df_vectorized[x].std(), reverse=True)

print("Top 30 Backward Selected Features:")
print(sorted_backward_features[:30])

Len of features 499, removing last
Len of features 498, removing kid
Len of features 497, removing thank
Len of features 496, removing player
Len of features 495, removing though
Len of features 494, removing prob
Len of features 493, removing yes
Len of features 492, removing start
Len of features 491, removing awesom
Len of features 490, removing tmr
Len of features 489, removing enough
Len of features 488, removing lunch
Len of features 487, removing lar
Len of features 486, removing today
Len of features 485, removing smoke
Len of features 484, removing de
Len of features 483, removing thought
Len of features 482, removing havent
Len of features 481, removing face
Len of features 480, removing frm
Len of features 479, removing half
Len of features 478, removing fetch
Len of features 477, removing got
Len of features 476, removing da
Len of features 475, removing recent
Len of features 474, removing suppos
Len of features 473, removing lot
Len of features 472, removing school
Len of

---

Filter FE

Коэффициент корреляции (Пирсона или Спирмена)

---

In [12]:
import numpy as np

def my_pearsonr(x, y):
    if len(x) != len(y):
        raise ValueError("Input arrays must have the same length.")
    
    x = np.asarray(x)
    y = np.asarray(y)

    x_mean = np.mean(x)
    y_mean = np.mean(y)

    x_std = np.std(x)
    y_std = np.std(y)

    covariance = np.mean((x - x_mean) * (y - y_mean))

    correlation = covariance / (x_std * y_std)

    return correlation

In [13]:
import pandas as pd

correlations = []
for feature_idx in range(X.shape[1]):
    feature = X[:, feature_idx].toarray().flatten()
    correlation = my_pearsonr(feature, y)
    correlations.append(correlation)

features_df = pd.DataFrame({'Feature': vectorizer.get_feature_names_out(), 'Correlation': correlations})

features_df = features_df.sort_values(by='Correlation', ascending=False)

features_df.head(30)

Unnamed: 0,Feature,Correlation
445,txt,0.435425
138,free,0.339904
252,mobil,0.330881
49,call,0.314752
64,claim,0.306839
277,nokia,0.264016
322,prize,0.254495
23,award,0.252392
476,win,0.246747
369,servic,0.246364


---

Python Libraries feature selection 

---

In [14]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

selectors = [
    ("SelectKBest", SelectKBest(score_func=chi2, k=30)),
    ("SelectFromModel", SelectFromModel(estimator=RandomForestClassifier(random_state=0), max_features=30)),
    ("RFE", RFE(LogisticRegression(), n_features_to_select=30))
]

for name, selector in selectors:
    X_new = selector.fit_transform(X, y)

    selected_features = selector.get_support(indices=True)
    feature_names = vectorizer.get_feature_names_out()

    print(f"Selected features using {name}:")
    print(feature_names[selected_features])

Selected features using SelectKBest:
['award' 'box' 'call' 'camera' 'claim' 'contact' 'custom' 'deliveri'
 'free' 'guarante' 'landlin' 'latest' 'line' 'mobil' 'new' 'nokia' 'offer'
 'pmin' 'pobox' 'poli' 'prize' 'rate' 'repli' 'servic' 'stop' 'text'
 'tone' 'txt' 'urgent' 'win']
Selected features using SelectFromModel:
['award' 'box' 'call' 'chat' 'claim' 'contact' 'custom' 'free' 'landlin'
 'latest' 'line' 'messag' 'mobil' 'new' 'nokia' 'offer' 'pmin' 'pobox'
 'prize' 'rate' 'repli' 'servic' 'stop' 'text' 'tone' 'txt' 'ur' 'urgent'
 'voucher' 'win']
Selected features using RFE:
['award' 'box' 'call' 'chat' 'claim' 'contact' 'custom' 'free' 'ill'
 'landlin' 'latest' 'line' 'min' 'mobil' 'new' 'nokia' 'pobox' 'privat'
 'prize' 'rate' 'repli' 'servic' 'stop' 'text' 'tone' 'txt' 'ur' 'urgent'
 'voucher' 'win']


---

Compare classification before and after applying feature selection methods

---

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [17]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

models = [
    ("SVC", SVC()),
    ("DecisionTreeClassifier", DecisionTreeClassifier()),
    ("KNeighborsClassifier", KNeighborsClassifier()),
]

def performance():
  print("Before:")
  calc_acc(X_train, X_test)

  for name_sel, selector in selectors:
    X_train_new = selector.fit_transform(X_train, y_train)

    X_test_new = selector.transform(X_test)

    print(f"\nAfter for {name_sel} method:")
    calc_acc(X_train_new, X_test_new)

def calc_acc(X_train, X_test):
  for name, model in models:
    model.fit(X_train, y_train)
    y_pred_before = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_before)
    print(f"Accuracy for {name}: {accuracy}")

performance()

Before:
Accuracy for SVC: 0.9533333333333334
Accuracy for DecisionTreeClassifier: 0.9233333333333333
Accuracy for KNeighborsClassifier: 0.86

After for SelectKBest method:
Accuracy for SVC: 0.95
Accuracy for DecisionTreeClassifier: 0.9266666666666666
Accuracy for KNeighborsClassifier: 0.91

After for SelectFromModel method:
Accuracy for SVC: 0.94
Accuracy for DecisionTreeClassifier: 0.9266666666666666
Accuracy for KNeighborsClassifier: 0.91

After for RFE method:
Accuracy for SVC: 0.95
Accuracy for DecisionTreeClassifier: 0.9166666666666666
Accuracy for KNeighborsClassifier: 0.9
