In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing cudf and cupy (cuml modules are imported later, in the cells where they are used)

In [3]:
import cudf as cd
import cupy as cp

Setting train path and test path

In [4]:
train_path = "/kaggle/input/netflix-appetency/train.csv"
test_path = "/kaggle/input/netflix-appetency/test.csv"

Importing train and test data

In [5]:
train_df = cd.read_csv(train_path)
test_df = cd.read_csv(test_path)

In [6]:
train_df.shape

In [7]:
train_df.describe()

Function to calculate the percentage of missing values for each feature

In [11]:
def missing_values(df):
    """Summarise the share of missing values per column.

    Parameters
    ----------
    df : DataFrame
        Frame to inspect. Only ``df.columns``, column selection,
        ``Series.isnull().sum()`` and ``len(df)`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one missing value, with the
        columns ``Feature`` (column name) and ``MissingPercent`` (0-100),
        sorted by ``MissingPercent`` in descending order. When nothing is
        missing an empty frame with the same two columns is returned, so
        callers can always filter or plot the result without a type check.
    """
    missing_columns = []
    missing_percentage = []
    n_rows = len(df)

    for column in df.columns:
        # Count the nulls once per column and reuse the value below.
        n_missing = df[column].isnull().sum()
        if n_missing:
            missing_columns.append(column)
            missing_percentage.append((n_missing / n_rows) * 100)

    return pd.DataFrame(
        {'Feature': missing_columns, 'MissingPercent': missing_percentage}
    ).sort_values(by="MissingPercent", ascending=False)

In [12]:
missing_df = missing_values(train_df)

In [13]:
missing_df

Plotting a histogram of the missing-value percentages across the features that have missing values

In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the per-feature missing percentages held in missing_df.
plt.figure(figsize=(10, 5))
hist_ax = sns.histplot(x=missing_df.MissingPercent)
hist_ax.set_title('Missing values distribution', size=15)
plt.ylabel('The number of features')

Dropping features with more than 25% missing values

In [15]:
missing_df = missing_df[missing_df['MissingPercent']>25]

In [16]:
# Remove the features listed in missing_df from both frames.
train_df = train_df.drop(columns=list(missing_df['Feature']))
test_df = test_df.drop(columns=list(missing_df['Feature']))

In [17]:
train_df.shape, test_df.shape

Separating categorical and numerical features

In [18]:
# Columns with object dtype are treated as the categorical features.
cat_features = train_df.select_dtypes(include = ['object']).columns

# Every non-object column except the first two is treated as a numerical feature.
# NOTE(review): presumably the two skipped columns are the id and the target — confirm.
num_features = train_df.select_dtypes(exclude = ['object']).columns[2:]

In [19]:
cat_features, len(num_features)

Filling missing values

In [20]:
# Impute numerical features with the median.
# - The statistic is taken from the TRAIN frame and applied to both frames so
#   that train and test are imputed with the same value.
# - The filled column is assigned back explicitly: calling
#   fillna(..., inplace=True) on a selected column may act on a temporary
#   object and is not guaranteed to update the parent frame.
for columns in num_features:
    train_median = train_df[columns].median()
    train_df[columns] = train_df[columns].fillna(train_median)
    test_df[columns] = test_df[columns].fillna(train_median)

In [21]:
# Impute categorical features with the most frequent value.
# The mode is taken from the TRAIN frame and reused for the test frame, and
# the result is assigned back explicitly instead of relying on
# fillna(..., inplace=True) on a selected column.
for columns in cat_features:
    train_mode = train_df[columns].mode()[0]
    train_df[columns] = train_df[columns].fillna(train_mode)
    test_df[columns] = test_df[columns].fillna(train_mode)

Removing columns with high correlation with each other

In [22]:
# All non-object columns except the first one; removing_corr() later treats
# the first column of this selection as the reference (target) column.
# NOTE(review): presumably the skipped first column is the id — confirm.
num_features_target  = train_df.select_dtypes(exclude = ['object']).columns[1:]
# Pairwise correlation matrix of the selected columns, displayed as the cell output.
df_corr = train_df[num_features_target].corr()
df_corr

In [None]:
def removing_corr(df, th, num_features_target):
    """Drop one column out of every pair of highly correlated features.

    The correlation matrix of ``df[num_features_target]`` is computed once.
    The first entry of ``num_features_target`` is the reference (target)
    column and is never removed. For every remaining pair whose absolute
    correlation is at least ``th``, the column with the weaker absolute
    correlation to the target is deleted from ``df`` in place (on a tie the
    later column is deleted).

    Parameters
    ----------
    df : DataFrame
        Frame to prune; modified in place.
    th : float
        Absolute-correlation threshold at or above which two features are
        considered redundant.
    num_features_target : sequence of column labels
        Target column first, followed by the numerical feature columns.

    Returns
    -------
    set
        Names of the columns that were deleted from ``df``.
    """
    removed = set()
    df_corr = df[num_features_target].corr()
    names = list(df_corr.columns)
    n_cols = len(names)

    # The matrix is symmetric, so visiting the upper triangle is enough.
    for i in range(1, n_cols):
        if names[i] in removed:
            # A column that is already gone must not eliminate others.
            continue
        for j in range(i + 1, n_cols):
            if names[j] in removed:
                continue
            # abs() applies to the coefficient itself, so strong negative
            # correlations count as redundancy too.
            if not abs(df_corr.iloc[i, j]) >= th:
                continue

            # Strength of each candidate's relationship with the target.
            target_corr_i = abs(df_corr.iloc[i, 0])
            target_corr_j = abs(df_corr.iloc[0, j])

            loser = names[j] if target_corr_i >= target_corr_j else names[i]
            if loser in df.columns:
                del df[loser]
                removed.add(loser)
            if loser == names[i]:
                # Column i no longer exists; move on to the next i.
                break
    return removed

In [None]:
corr_del = removing_corr(train_df, 0.7, num_features_target)

In [None]:
test_df.drop(corr_del, axis = 1, inplace = True)

In [23]:
# Reload the train/test frames from CSV files under ../input/train-test/.
# This rebinds train_df and test_df, replacing the frames built by the cells above.
# NOTE(review): presumably these files are a saved snapshot of the preprocessing
# done so far — confirm, since the cells above are then not needed to reproduce them.
train_df = cd.read_csv('../input/train-test/train.csv')
test_df = cd.read_csv('../input/train-test/test.csv')

In [24]:
test_df.shape, train_df.shape

Deleting columns that have only one value

In [25]:
def remove_zero_var(df):
    """Drop, in place, every column of ``df`` that holds one distinct value.

    Returns the names of the dropped columns in their original column order.
    """
    constant_cols = [
        name for name in df.columns
        if len(df[name].unique()) == 1
    ]
    for name in constant_cols:
        df.drop(name, axis=1, inplace=True)
    return constant_cols

In [26]:
zero_columns = remove_zero_var(train_df)

In [27]:
len(zero_columns)

In [28]:
test_df.drop(zero_columns, axis = 1, inplace = True)

In [29]:
train_df.shape, test_df.shape

Checking unique values in each categorical feature

In [30]:
cat_features = train_df.select_dtypes(include = ['object']).columns

In [31]:
for col in cat_features:
    print(f"{col} : {len(train_df[col].unique())}")

Dropping categorical features with too many distinct classes (more than 60)

In [32]:
def remove_multiple(df, cat_cols=None, max_classes=60):
    """Drop, in place, categorical columns that have too many distinct values.

    Parameters
    ----------
    df : DataFrame
        Frame to prune; modified in place.
    cat_cols : iterable of column labels, optional
        Columns to examine. Defaults to the object-dtype columns of ``df``
        itself, so the function no longer depends on a notebook-level
        ``cat_features`` variable being in sync with ``df``.
    max_classes : int, default 60
        A column is dropped when it has more than this many distinct values.

    Returns
    -------
    list
        Names of the dropped columns, in the order they were examined.
    """
    if cat_cols is None:
        cat_cols = df.select_dtypes(include=['object']).columns

    dropped = [col for col in cat_cols if len(df[col].unique()) > max_classes]
    for col in dropped:
        df.drop(col, axis=1, inplace=True)
    return dropped

In [33]:
multiple_class_col = remove_multiple(train_df)

In [34]:
test_df.drop(multiple_class_col, axis = 1, inplace=True)

In [35]:
train_df.shape, test_df.shape

Updating the categorical and numerical features after removal

In [36]:
cat_features = train_df.select_dtypes(include = ['object']).columns
num_features = train_df.select_dtypes(exclude = ['object']).columns[1:]

In [37]:
len(num_features), train_df.shape

Removing numerical columns whose correlation with the target is very low (below 0.001)

In [38]:
df_corr = train_df[num_features].corr()

# "Low correlation" is judged on the magnitude of the coefficient: a signed
# comparison would also discard every feature that is negatively correlated
# with the target, however strong that relationship is.
low_corr = list(df_corr[df_corr['target'].abs() < 0.001].index.to_pandas())

In [39]:
train_df.drop(low_corr, axis = 1, inplace=True)
test_df.drop(low_corr, axis = 1, inplace=True)

In [40]:
cat_features = train_df.select_dtypes(include = ['object']).columns
num_features = train_df.select_dtypes(exclude = ['object']).columns[1:]

Label encoding of categorical features

In [41]:
from cuml.preprocessing import LabelEncoder

train_df_encode = train_df.copy()
test_df_encode = test_df.copy()

for column in list(cat_features):
    le = LabelEncoder()

    # Fit ONE encoder per column on the categories seen in either frame and
    # use it for both, so a given category maps to the same integer code in
    # train and test. Fitting a second encoder on the test frame would give
    # codes that are unrelated to the ones the models are trained on.
    le.fit(cd.concat([train_df_encode[column], test_df_encode[column]], ignore_index=True))

    train_df_encode[column] = le.transform(train_df_encode[column])
    test_df_encode[column] = le.transform(test_df_encode[column])

In [42]:
train_df_encode.drop(['id'], axis = 1, inplace = True)

test_id = test_df_encode['id']

test_df_encode.drop(['id'], axis = 1, inplace = True)

In [43]:
# Features are every column after the first; the label is the first column.
# NOTE(review): this relies on the target being the first column once 'id'
# has been dropped — confirm against the column order of train_df_encode.
X = train_df_encode.iloc[:, 1:]
Y = train_df_encode.iloc[:, 0]

Applying standardization

In [44]:
from cuml.preprocessing import StandardScaler

std = StandardScaler()

# num_features[1:] skips the first entry of num_features.
# NOTE(review): presumably that entry is the target column — confirm.
X[num_features[1:]] = std.fit_transform(X[num_features[1:]])

# Reuse the statistics learned on the training features: re-fitting on the
# test frame would put train and test on different scales.
test_df_encode[num_features[1:]] = std.transform(test_df_encode[num_features[1:]])

Train validation set split

In [45]:
from cuml.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.3, random_state = 44)

Now for feature extraction we will use autoencoders

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model

n_inputs = X_train.shape[1]

In [46]:
X_valid.shape, X_train.shape

In [47]:
# Convert the train/validation feature frames to NumPy arrays via cp.asnumpy;
# these arrays are what the Keras autoencoder is fitted on.
# NOTE(review): as_gpu_matrix() may be deprecated or removed in newer cuDF
# releases — confirm against the installed version before upgrading.
X_train_numpy = cp.asnumpy(X_train.as_gpu_matrix())
X_valid_numpy = cp.asnumpy(X_valid.as_gpu_matrix())

In [48]:
X_valid_numpy.shape, X_train_numpy.shape

In [None]:
def _dense_bn_lrelu(tensor, units):
    """Apply Dense(units) -> BatchNormalization -> LeakyReLU to ``tensor``."""
    tensor = Dense(units)(tensor)
    tensor = BatchNormalization()(tensor)
    return LeakyReLU()(tensor)


# Encoder: input -> 2x input width -> 1x input width.
visible = Input(shape=(n_inputs,))
e = _dense_bn_lrelu(visible, n_inputs*2)
e = _dense_bn_lrelu(e, n_inputs)

# Bottleneck: the compressed representation later exposed by the encoder model.
n_bottleneck = 50
bottleneck = Dense(n_bottleneck)(e)

# Decoder: mirror image of the encoder.
d = _dense_bn_lrelu(bottleneck, n_inputs)
d = _dense_bn_lrelu(d, n_inputs*2)

# Linear output layer reconstructing all n_inputs features.
output = Dense(n_inputs, activation='linear')(d)

# Full autoencoder, trained with MSE to reproduce its own input.
model = Model(inputs=visible, outputs=output)
model.compile(optimizer='adam', loss='mse')
history = model.fit(
    X_train_numpy,
    X_train_numpy,
    epochs=50,
    batch_size=20,
    verbose=2,
    validation_data=(X_valid_numpy, X_valid_numpy),
)

In [None]:
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Validation')
plt.legend()
plt.show()

Saving the encoder part of Autoencoder

In [None]:
encoder = Model(inputs=visible, outputs=bottleneck)
encoder.save('encoder.h5')

Using the encoder to reduce dimensionality of our data

In [50]:
from tensorflow.keras.models import load_model

encoder = load_model('../input/encoder/encoder.h5')

In [51]:
test_df_encode.shape

In [52]:
X_test = cp.asnumpy(test_df_encode.as_gpu_matrix())

In [53]:
X_test.shape

In [55]:
X_train_reduced = encoder.predict(X_train_numpy)
X_valid_reduced = encoder.predict(X_valid_numpy)
X_test_reduced = encoder.predict(X_test)

In [56]:
X_train_reduced.shape, X_valid_reduced.shape, X_test_reduced.shape

Logistic Regression

In [None]:
# Load encoded feature sets from CSV files under ../input/data-science-evaluation/.
# This rebinds the three *_reduced names, replacing the encoder.predict(...)
# outputs assigned earlier in the notebook with frames read by cuDF.
# NOTE(review): presumably these files hold previously saved encoder outputs — confirm.
X_train_reduced = cd.read_csv('../input/data-science-evaluation/X_train_reduced.csv')
X_valid_reduced = cd.read_csv('../input/data-science-evaluation/X_valid_reduced.csv')
X_test_reduced = cd.read_csv('../input/data-science-evaluation/X_test_reduced.csv')

In [57]:
from cuml.metrics import roc_auc_score

In [58]:
from cuml.linear_model import LogisticRegression


lr = LogisticRegression()

lr.fit(X_train_reduced, y_train)

In [59]:
def get_score(model, X=None, y=None):
    """Return the ROC AUC of ``model`` on a labelled feature set.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``; the second column of its output is
        used as the positive-class score.
    X : array-like, optional
        Features to score. Defaults to the notebook-level ``X_valid_reduced``.
    y : array-like, optional
        True labels for ``X``. Defaults to the notebook-level ``y_valid``.

    Returns
    -------
    The value returned by ``roc_auc_score(y, scores)``.
    """
    if X is None:
        X = X_valid_reduced
    if y is None:
        y = y_valid
    val_pred = model.predict_proba(X)
    return roc_auc_score(y, val_pred[:, 1])

In [60]:
get_score(lr)

Random Forest Classifier

In [61]:
from cuml.ensemble import RandomForestClassifier

In [62]:
rf = RandomForestClassifier(n_estimators=650, max_depth = 40, min_samples_split = 4, min_samples_leaf = 2)
rf.fit(X_train_reduced, y_train)

In [63]:
get_score(rf)

Using KNN

In [64]:
from cuml.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)

knn.fit(X_train_reduced, y_train)

In [65]:
get_score(knn)

Using Naive Bayes

In [66]:
from cuml.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_reduced, y_train)

In [67]:
get_score(gnb)

Using XGBoost

In [68]:
from cuml.common.import_utils import has_xgboost

In [69]:
if has_xgboost():
    import xgboost as xgb
else:
    raise ImportError("Please install xgboost using the conda package,"
                      " Use conda install -c conda-forge xgboost "
                      "command to install xgboost")

In [None]:
# 'verbosity': 0 silences XGBoost's logging; it replaces the 'silent' flag,
# which current XGBoost releases no longer recognise.
params = {'verbosity': 0, 'eval_metric': 'error',
          'objective': 'binary:logistic',
          'max_depth': 25}
dtrain = xgb.DMatrix(X_train_reduced, label=y_train)

# Train for 15 boosting rounds.
bst = xgb.train(params, dtrain, num_boost_round=15)


In [77]:
dvalidation = xgb.DMatrix(X_valid_reduced, label=y_valid)
xgb_preds = bst.predict(dvalidation)

In [82]:
roc_auc_score(y_valid, xgb_preds)

Using Logistic Regression to predict on test data

In [85]:
y_test_pred = lr.predict_proba(X_test_reduced)

In [91]:
y_test_pred[:, 1].shape

In [96]:
ans = {'id': test_id.values, 'target': y_test_pred[:, 1]}

ans_df = cd.DataFrame(ans)

In [98]:
ans_df.to_csv('ans.csv', index=False)

In [None]:
# Install GPUtil through a shell pip call; it provides showUtilization, imported below.
!pip install GPUtil

import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    """Print GPU utilisation, try to release GPU memory, then print utilisation again.

    Asks PyTorch to empty its CUDA cache and then selects, closes and
    re-selects device 0 through Numba.
    """
    print("Initial GPU Usage")
    gpu_usage()                             

    # Ask PyTorch to release the GPU memory it is caching.
    torch.cuda.empty_cache()

    # Select device 0, close it, then select it again.
    # NOTE(review): presumably cuda.close() tears down the CUDA context, which
    # would invalidate live cuDF/cuML objects — confirm before calling this
    # anywhere but at the very end of the notebook.
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()  