In [None]:
import sys
!rsync -ah --progress ../input/rapids/rapids.0.16.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!rsync -ah --progress /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sklearn
import os

import time

import sklearn.neighbors
import sklearn.svm
import sklearn.ensemble
from sklearn.model_selection import KFold

import cudf
import cuml

import matplotlib.pyplot as plt
import numpy as np

#Rapids 
import cudf
from cuml import LogisticRegression as cLogisticRegression
from cuml.neighbors import KNeighborsClassifier as cKNeighborsClassifier
from cuml import SVC as cSVC
from cuml.linear_model import Lasso as cLasso
from cuml.manifold import TSNE as cTSNE
from cuml import DBSCAN as cDBSCAN
from cuml.decomposition import PCA as cPCA
from cuml.ensemble import RandomForestClassifier as cRandomForestClassifier


## Import data

In [None]:
# Read the semicolon-delimited dataset and discard the "id" column.
raw_data = pd.read_csv(
    "/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv", sep=";"
).drop(columns="id")

# Divide age by 365.25 and keep one decimal place
# (presumably days -> years; confirm against the dataset description).
raw_data["age"] = (raw_data["age"] / 365.25).round(decimals=1)
# Recode gender value 2 as 0; every other value is left unchanged.
raw_data["gender"] = raw_data["gender"].replace(2, 0)

raw_data.head()

## Target class balance

In [None]:
sns.set_style('darkgrid')
# Pass the column with the `x=` keyword. seaborn 0.11 warns when it is given
# positionally, and from 0.12 on the only positional argument is `data`, so the
# positional form no longer means "count the values of this column".
sns.countplot(x=raw_data.cardio, palette='dark')
# Trailing semicolon suppresses the Text(...) repr in the cell output.
plt.xlabel('Target values', fontdict={'fontsize': 15, 'color': 'Blue'}, labelpad=3);

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Re-read the CSV for modelling and discard the "id" column.
data = pd.read_csv(
    "/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv", sep=";"
).drop(columns=["id"])

# Separate the "cardio" target (as a NumPy array) from the feature columns.
y = data["cardio"].values
x = data.drop(columns=["cardio"])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# pandas frames holding the training split; labels are cast to int32.
X_all = pd.DataFrame(x_train)
y_all = pd.DataFrame(y_train).astype("int32")

# cuDF counterparts of the same training frames.
X_all_gpu, y_all_gpu = (cudf.from_pandas(frame) for frame in (X_all, y_all))

## Parameters

In [None]:
# How many times bench() repeats the full cross-validated fit per classifier.
ITERATION: int = 2
# Number of KFold splits used inside each repetition.
NFOLDS: int = 5

## Benchmark function (times `fit()`)

In [None]:
def bench(X, y, classifiers, params):
    """Time cross-validated ``fit`` calls for each classifier.

    For every entry in ``classifiers`` the estimator class is instantiated,
    configured with ``params[name]`` via ``set_params`` and fitted on the
    training part of each of ``NFOLDS`` KFold splits of ``X``/``y``.  Only the
    ``fit`` call is timed.  The per-fold times are summed, the whole procedure
    is repeated ``ITERATION`` times, and the mean of those sums is reported.

    Parameters
    ----------
    X, y
        Feature and label containers.  Both must support positional row
        selection through ``.iloc[index_array]``.
    classifiers : dict
        Maps a display name to an estimator class that can be constructed
        without arguments.
    params : dict
        Maps the same display names to keyword arguments for ``set_params``.

    Returns
    -------
    dict
        Display name -> mean number of seconds spent in ``fit`` per repetition.

    Notes
    -----
    ``ITERATION`` and ``NFOLDS`` are read from module scope.
    """
    elapsed = {}
    for name, clf_class in classifiers.items():
        elapsed_list = []

        for _ in range(ITERATION):
            kf = KFold(n_splits=NFOLDS)
            clf = clf_class()
            clf.set_params(**params[name])

            elapsed_sum = 0.0
            for train_idx, _val_idx in kf.split(X, y):
                # Slice the containers that were passed in, not module-level
                # frames, so each library is timed on the data it was given.
                X_train = X.iloc[train_idx]
                y_train = y.iloc[train_idx]

                # perf_counter is monotonic and high-resolution, so the
                # measured interval cannot be skewed by wall-clock adjustments.
                start = time.perf_counter()
                clf.fit(X_train, y_train)
                elapsed_sum += time.perf_counter() - start

            elapsed_list.append(elapsed_sum)

        elapsed[name] = pd.Series(elapsed_list).mean()
    return elapsed


## Fit Scikit-Learn

In [None]:
# scikit-learn estimator classes to benchmark, keyed by display name.
classifiers = dict(
    KNN=sklearn.neighbors.KNeighborsClassifier,
    SVM=sklearn.svm.SVC,
    RandomForest=sklearn.ensemble.RandomForestClassifier,
)

# Keyword arguments handed to set_params(), keyed like `classifiers`.
params = dict(
    KNN=dict(),
    SVM=dict(random_state=47),
    RandomForest=dict(n_estimators=100, random_state=47),
)

elapsed_sklearn = bench(X=X_all, y=y_all, classifiers=classifiers, params=params)

In [None]:
# Dict returned by bench(): classifier name -> mean seconds spent in fit() per repetition.
elapsed_sklearn

## Fit cuML

In [None]:
# cuML estimator classes, registered under the same display names as the
# scikit-learn benchmark.
classifiers = {
    "KNN": cuml.neighbors.KNeighborsClassifier,
    "SVM": cuml.svm.SVC,
    "RandomForest": cuml.ensemble.RandomForestClassifier,
}

# Only the random forest gets an explicit setting; the others keep their defaults.
params = {
    "KNN": {},
    "SVM": {},
    "RandomForest": {"n_estimators": 100},
}

elapsed_cuml = bench(X_all_gpu, y_all_gpu, classifiers, params)

In [None]:
# Dict returned by bench(): classifier name -> mean seconds spent in fit() per repetition.
elapsed_cuml

## cuML vs Scikit-Learn 

In [None]:
# Grouped bar chart: one (scikit-learn, cuML) pair of bars per classifier.
left = np.arange(len(elapsed_sklearn))
width = 0.3

fig = plt.figure(figsize=(6, 6))
fig.patch.set_alpha(1)  # fully opaque figure background
ax = fig.add_subplot(1, 1, 1)

ax.bar(left, list(elapsed_sklearn.values()),
       width=width, color='b', align="center", label="scikit-learn")
ax.bar(left + width, list(elapsed_cuml.values()),
       width=width, color="g", align="center", label="cuML")

# Place each tick midway between the two bars of its group.
ax.set_xticks(left + width / 2)
ax.set_xticklabels(list(elapsed_sklearn.keys()))
ax.legend(loc=2)
ax.set_ylabel("sec / iter")
ax.set_title("fit() performance")
plt.show()

In [None]:
import numpy as np
from cuml.ensemble import RandomForestClassifier as cuRFC

# Fit one cuML random forest (n_estimators=100) on the cuDF training frames.
cuml_model = cuRFC(n_estimators=100)
cuml_model.fit(X_all_gpu,y_all_gpu)
# Predicting on the training data is currently disabled:
#cuml_predict = cuml_model.predict(X_all_gpu)

#print("Predicted labels : ", cuml_predict)

In [None]:
import numpy as np

import pandas as pd
import cudf #cuDF - RAPIDS's GPU DataFrame library

num_features = 1500
num_samples = 5000

# Deterministic sine signal laid out as (num_samples, num_features).
data = np.sin(np.arange(num_samples * num_features)).reshape(num_samples, num_features)
# Standard-normal noise drawn from a locally seeded generator, so the synthetic
# data (and every score computed from it) is identical on each run without
# touching NumPy's global random state.
noise = np.random.RandomState(0).normal(0, 1, size=(num_samples, num_features))
data += noise

# Same data as a pandas frame on the host ...
df_pdf = pd.DataFrame(data)

# ... and as a cuDF frame.
df_cdf = cudf.from_pandas(df_pdf)

from sklearn.model_selection import KFold

def metric(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``.

    The squared differences are averaged along axis 0, so 1-D inputs give a
    single value and 2-D inputs give one value per column.
    """
    residual = y_true - y_pred
    squared = np.power(residual, 2)
    return np.mean(squared, axis=0)

In [None]:
%%time

import cupy
from cuml import SVR as cuSVR

NUM_FOLDS = 7
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

df = df_cdf  # cudf dataframe
# Out-of-fold predictions live in a CuPy array. The predictions and the target
# column are device arrays, and CuPy/cuDF do not convert implicitly to or from
# NumPy, so a host-side buffer cannot be combined with them.
y = cupy.zeros(df.shape[0])

for f, (train_ind, val_ind) in enumerate(kf.split(df)):

    train_df = df.iloc[train_ind]
    val_df = df.iloc[val_ind]

    train_target = train_df.loc[:,0]  # use the first column as target
    val_target = val_df.loc[:,0]

    # fit on every column except the target
    model = cuSVR(gamma = 'scale', cache_size=3000.0)
    model.fit(train_df.loc[:,1:], train_target)

    # predict the held-out rows and store them at their original positions
    pred = model.predict(val_df.loc[:,1:])
    y[cupy.asarray(val_ind)] = pred.values

    current_score = metric(val_target.values, pred.values)
    print(f"Fold {f} score: {current_score}")

# Error of the combined out-of-fold predictions against the full target column.
score = metric(cupy.asarray(df.loc[:,0].values), y)

print(f"Average score: {score}")