We will research the Gaussian approximation by Random Kitchen Sinks. Let's change the formulas a little, but keep the same meaning :
$$phi(x) = (
\cos (w_1^T x + b_i),
\dots,
\cos (w_n^T x + b_n)
),$$
where $w_j \sim \mathcal{N}(0, \sqrt 1/s))$, $b_j \sim U[-\pi, \pi]$.
We will build linear models with the new features $ phi (x) $.
### Algorithm base
You will need to implement the following algorithm:
1. Downgrade the sample to a new dimension using PCA.
2. For the obtained sample, estimate the hyperparameter $ s ^ 2 $ using the median value.
3. Generate n_features of sets of weights $ w_j $ and shifts $ b_j $, and then generate n_features of new features according to our formulas.
4. Train a linear model (we will use logistic regression and SVM) on new features and predict. Don't forget to use transform() on the test set.

PS: We will also compare in quality with LGBMClassifier both on fashion_mnist dataset and on the dataset from Kaggle (https://www.kaggle.com/sulianova/cardiovascular-disease-dataset)!

In [1]:
import numpy as np
import pandas as pd
import random
import keras
from keras.datasets import fashion_mnist
from sklearn.decomposition import PCA
from itertools import combinations,starmap,product
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from pygame import mixer
from time import sleep
from tqdm import tqdm
import sys
import warnings 
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
warnings.filterwarnings('ignore')

pygame 2.0.1 (SDL 2.0.14, Python 3.8.0)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
mixer.init() #you must initialize the mixer
alert=mixer.Sound('C:\\Users\\1\\Desktop\\ds\\add\\output\\add_mode\\IPython\\lib\\tests\\test.wav')

In [3]:
(x_train_pics, y_train), (x_test_pics, y_test) = fashion_mnist.load_data()
x_train = x_train_pics.reshape(x_train_pics.shape[0],-1)
x_test = x_test_pics.reshape(x_test_pics.shape[0],-1)

In [4]:
class ALG_with_RBFSampler:
    
    def __init__(self, n_components=50, n_features=1000, sigma=None,pca=True):
        if pca:
            self.pca = PCA(n_components)
        self.n_component = n_components
        self.n_features = n_features
        self.sigma = sigma

    def sigma_estimation(self, X, features):
#         print('Sigma_estimation has started!')
        sel = X[np.random.choice(X.shape[0], size=features, replace=False), :]
        comba = np.array(list(starmap(lambda a,b: (a-b)**2, list(combinations(sel,2)))))
        self.sigma = np.median(np.sum(comba ,axis=1))
#         print('Sigma_estimation has finished!')
        return self.sigma 
    
    def fit(self, X, y, method=None):
        if hasattr(self, 'pca'):
#             print('PCA has started')
            X = self.pca.fit(X).transform(X)
#             print('PCA has finished!')
        mu, sigma = 0, 1/self.sigma_estimation(X,1400) # N^2/2 ~ 1000000
        phi = self.phi_calculation(X, mu, sigma**0.5)
#         print('Fitting')
        if method == 'lsvc':
            self.clf = LinearSVC(random_state=0, tol=1e-5,verbose=False)
        elif method == 'lgr':
            self.clf = LogisticRegression(max_iter = 200,verbose=False)
        self.clf.fit(phi,y)
        return self
    
    def predict(self,X):
#         print('Prediction has started!')
        if hasattr(self, 'pca'):
            X = self.pca.transform(X)
        phi = self.phi_calculation(X, 0, self.sigma**0.5)
#         print('Prediction has finished!')
        return self.clf.predict(phi)
        
    def phi_calculation(self, X, mu, sigma):
        if not hasattr(self, 'w') :
            self.w = np.random.normal(mu, sigma, (X.shape[1],self.n_features))
            self.b = np.random.uniform(low=-np.pi, high= np.pi,  size = self.n_features)
#         print('np.dot has started!')
        phi = np.cos((np.dot(X,self.w))+self.b)
#         print('np.dot has finished!')
        return phi

In [5]:
def results(y_test,prediction):
    return (prediction == y_test).sum() / len(y_test)

In [6]:
%%time
n_feat = [1000,1500,2000,3000]
pr_with_pca =[]
with tqdm(total=len(n_feat),file=sys.stdout) as pbar:
    for i in n_feat:
        prediction = ALG_with_RBFSampler(pca=True,n_features = i).fit(x_train,y_train,method='lgr').predict(x_test)
        pr_with_pca.append(results(y_test,prediction))
        pbar.write('processed: %d' % (1 + i))
        pbar.update(1)
        sleep(1)
# alert.play() 

processed: 1001                                                                                                        
processed: 1501                                                                                                        
processed: 2001                                                                                                        
processed: 3001                                                                                                        
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:02<00:00, 90.75s/it]
Wall time: 6min 3s


<Channel at 0x18beaadb490>

In [7]:
%%time
n_feat = [1000,1500,2000,3000]
pr_without_pca = []
with tqdm(total=len(n_feat),file=sys.stdout) as pbar:
    for i in n_feat:
        prediction = ALG_with_RBFSampler(pca=False,n_features = i).fit(x_train,y_train,method='lgr').predict(x_test)
        pr_without_pca.append(results(y_test,prediction))
        pbar.write('processed: %d' % (1 + i))
        pbar.update(1)
        sleep(1)
# alert.play() 

processed: 1001                                                                                                        
processed: 1501                                                                                                        
processed: 2001                                                                                                        
processed: 3001                                                                                                        
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:02<00:00, 15.61s/it]
Wall time: 1min 2s


<Channel at 0x18b83d0d7b0>

In [8]:
%%time
pca = PCA(n_components=50)
pca_train_data = pca.fit(x_train).transform(x_train)
clf = LGBMClassifier(num_leaves=200,verbose = 1).fit(pca_train_data,y_train)
pca_test_data = pca.transform(x_test)
prediction = clf.predict(pca_test_data)
pr_with_pca_boost = results(y_test,prediction)
# alert.play() 

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 50
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
Wall time: 25.7 s


<Channel at 0x18b83d0d190>

In [9]:
%%time
prediction = LGBMClassifier(num_leaves=100,verbose = 1).fit(x_train,y_train).predict(x_test)
pr_without_pca_boost = results(y_test,prediction)
# alert.play() 

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170881
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 783
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
Wall time: 2min 43s


<Channel at 0x18b83d0dc30>

In [10]:
prediction = ALG_with_RBFSampler(pca=True).fit(x_train,y_train,method='lsvc').predict(x_test)
pr_with_pca_svc = results(y_test,prediction)
# alert.play(3) 

<Channel at 0x18b83d0d650>

In [11]:
print(f'pr_with_pca = {pr_with_pca}')
print(f'pr_without_pca = {pr_without_pca}')
print(f'pr_with_pca_boost = {pr_with_pca_boost}')
print(f'pr_without_pca_boost = {pr_without_pca_boost}')
print(f'pr_with_pca_svc = {pr_with_pca_svc}')

pr_with_pca = [0.8704, 0.8704, 0.8716, 0.8733]
pr_without_pca = [0.1116, 0.1165, 0.1146, 0.1168]
pr_with_pca_boost = 0.8771
pr_without_pca_boost = 0.8982
pr_with_pca_svc = 0.8705


Let's test our algorithm on the kaggle dataset!

In [13]:
df = pd.read_csv('cardio_train.csv',sep=';')

In [14]:
df["years"] = df.age / 365
target = df['cardio']
X = df.drop(['id','cardio','age'],axis = 1)

In [15]:
# no null values in the data
X.isna().mean().sort_values(ascending=False)

gender         0.0
height         0.0
weight         0.0
ap_hi          0.0
ap_lo          0.0
cholesterol    0.0
gluc           0.0
smoke          0.0
alco           0.0
active         0.0
years          0.0
dtype: float64

In [16]:
x_train,x_test,y_train,y_test = train_test_split(X,target,test_size=0.3,random_state = 42)

In [17]:
%%time
param = {
    "boosting_type": ['gbdt','dart','goss','rf'],
    "n_estimators": np.arange(1, 10, 10),
    "num_leaves": np.arange(100, 1000, 100),
    "max_depth": np.arange(4,10)
}
pipeline_clf = GridSearchCV(LGBMClassifier(), param, verbose=1)
pipeline_clf.fit(x_train, y_train)
best_parameters= pipeline_clf.best_params_
print(best_parameters)
y_preds = pipeline_clf.predict(x_test)
print(f'result = {results(y_test,y_preds)}')
class_report = classification_report(y_test, y_preds)
print(class_report)
# alert.play() 

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
{'boosting_type': 'gbdt', 'max_depth': 7, 'n_estimators': 1, 'num_leaves': 100}
result = 0.7335238095238096
              precision    recall  f1-score   support

           0       0.71      0.79      0.75     10461
           1       0.76      0.68      0.72     10539

    accuracy                           0.73     21000
   macro avg       0.74      0.73      0.73     21000
weighted avg       0.74      0.73      0.73     21000

Wall time: 1min


<Channel at 0x18b81795310>

In [18]:
n_feat = list(np.arange(1000, 6000, 1000))
n_comp = list(np.arange(1,11,2))
param = np.array(list(product(n_feat, n_comp)))
pr_with_pca = []
predictions_matrix = []
with tqdm(total=param.shape[0],file=sys.stdout) as pbar:
    for f, c in param:
        print(f,c)
        prediction = ALG_with_RBFSampler(pca=True, n_components = c , n_features = f).fit(x_train,y_train,method='lgr').predict(x_test)
        pr_with_pca.append(results(y_test,prediction))
        predictions_matrix.append(prediction)
        pbar.write(f'{i} step: %d' % (1 + param.shape[0]))
        pbar.update(1)
        sleep(1)

  0%|                                                                                           | 0/25 [00:00<?, ?it/s]1000 1
3000 step: 26                                                                                                          
  4%|███▎                                                                               | 1/25 [00:18<07:15, 18.15s/it]1000 3
3000 step: 26                                                                                                          
  8%|██████▋                                                                            | 2/25 [00:37<07:11, 18.78s/it]1000 5
3000 step: 26                                                                                                          
 12%|█████████▉                                                                         | 3/25 [00:56<06:58, 19.03s/it]1000 7
3000 step: 26                                                                                                          
 16%|███████████

In [97]:
best_res_index = pr_with_pca.index(max(pr_with_pca))
class_report = classification_report(y_test, predictions_matrix[19])
print(class_report)
print(f'results {pr_with_pca[best_res_index]}')
# alert.play(5) 

              precision    recall  f1-score   support

           0       0.72      0.76      0.74     10461
           1       0.75      0.71      0.73     10539

    accuracy                           0.74     21000
   macro avg       0.74      0.74      0.74     21000
weighted avg       0.74      0.74      0.74     21000

results 0.738904761904762


In [104]:
Parameters = pd.DataFrame({'n_comp':param[:,1],'n_feat':param[:,0],'pr_with_pca':pr_with_pca})

In [105]:
import plotly.express as px
fig = px.scatter_3d(Parameters, x='n_comp', y='n_feat', z='pr_with_pca',
              color='pr_with_pca',template = "plotly_dark",title = 'Parameters')
fig.show()