In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 - Importing RAPIDS libraries

In [55]:
import cupy, cudf, cuml

In [56]:
# Directory prefix that the train.csv / test.csv reads below are joined onto.
data_path = '../input/house-prices-advanced-regression-techniques/'

In [57]:
# Load the training set into a cuDF DataFrame.
train_csv = data_path + 'train.csv'
df = cudf.read_csv(train_csv)

 - Viewing Column name, Non-null and Data-types
 - Viewing random 5 samples from Input data

In [58]:
# info() writes its summary (column names, non-null counts, dtypes) to stdout
# itself and returns None, so wrapping it in print() only adds a stray "None"
# line. Called bare here, matching the df2.info() call used later on.
df.info()
# Five randomly chosen rows, shown as the cell's rich output.
df.sample(5)

 - Picking only Integer and floating features

In [59]:
# Names of the columns stored as 32/64-bit integers or floats.
# A tuple (not a set) keeps the membership test an equality comparison
# between each column dtype and these dtype names.
numeric_dtypes = ('int64', 'float64', 'int32', 'float32')
discrete = [name for name in df.columns if df[name].dtype in numeric_dtypes]

# Restrict the frame to those columns and summarise the result.
df2 = df[discrete]
df2.info()

 - Number of null values in each column

In [60]:
# Per-column count of missing values in the numeric frame.
df2.isnull().sum()

 - Using Median Imputation to fill NA values

In [61]:
# Fill each column's missing entries with that column's median, then re-count
# the nulls to confirm none are left.
column_medians = df2.median()
df2 = df2.fillna(column_medians)
df2.isnull().sum()

In [62]:
# Features: every column except the first and the last.
X = df2.iloc[:, 1:-1]
# Target: the last column.
y = df2.iloc[:, -1]
print(X.shape, y.shape)

 - Picking moderately to highly skewed features

In [63]:
# Collect the feature columns whose absolute skewness is above 0.5.
skewed_feauture = []
for name in X.columns:
    if abs(X[name].skew()) > 0.5:
        skewed_feauture.append(name)
print(len(skewed_feauture), skewed_feauture)

 - Applying log_e(1+x) to skewed features to normalize

In [64]:
# Reduce skew: replace each skewed feature with log(1 + x), one column at a time.
for name in skewed_feauture:
    column = X[name]
    X[name] = cupy.log1p(column)

 - Standardising the features (zero mean, unit variance)

In [65]:
from cuml.preprocessing import StandardScaler
# Fit the scaler on the feature frame, then produce the scaled features used
# by the models below.
ss = StandardScaler()
X_sc = ss.fit(X).transform(X)

 - Splitting the Dataset into Training and Validation in 90-10 ratio

In [68]:
from cuml.preprocessing import train_test_split
# Hold out 10% of the rows for validation, using a fixed random_state.
split = train_test_split(
    X_sc,
    y,
    test_size=0.1,
    random_state=3,
)
X_train, X_test, y_train, y_test = split

- Statistic function for any model, returns [r2_score, mean_abs_error, mean_sq_error]

In [69]:
from cuml.metrics.regression import r2_score, mean_absolute_error, mean_squared_error
def get_LR_metrics(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Predicts on ``X_test``, compares the predictions with ``y_test`` cast to
    float64, prints the three scores and returns them.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Features passed to ``model.predict``.
        y_test: True targets; must support ``astype('float64')``.

    Returns:
        list: ``[r2, mae, mse]`` in that order.
    """
    predicted = model.predict(X_test)
    actual = y_test.astype('float64')
    scores = [
        r2_score(actual, predicted),
        mean_absolute_error(actual, predicted),
        mean_squared_error(actual, predicted),
    ]
    for label, value in zip(('R2:', 'MAE:', 'MSE:'), scores):
        print(label, value)
    return scores

 - Running SVD, EIG, QR, svd-qr, svd-jacobi algo one by one

In [70]:
from cuml.linear_model import LinearRegression
algos = ["svd", "eig", 'qr', "svd-qr", "svd-jacobi"]
models = []
metrics = []
for algo in algos:
    lr = LinearRegression(algorithm=algo)
    %time lr.fit(X_train, y_train)
    models.append(lr)
    metrics.append(get_LR_metrics(lr, X_test, y_test))

 - Creating a table for comparison (all values are equal in this case)

In [71]:
# One row per solver, one column per metric.
linear_stat = cudf.DataFrame(
    metrics,
    index=algos,
    columns=['R2', 'MAE', 'MSE'],
)
linear_stat

 - Applying PCA to the data might increase the R2 score

In [106]:
from cuml.decomposition import PCA
# Project the scaled features onto a small number of principal components.
n_pca_components = 5
pca = PCA(n_components=n_pca_components)
X_r = pca.fit_transform(X_sc)
print(X_r.shape)

In [107]:
# Standardise the PCA output.
# NOTE(review): this refits the same `ss` object that was fitted on X earlier,
# so from here on `ss` presumably holds statistics of the PCA components
# rather than of the original features; later cells rely on that state.
X_r = ss.fit_transform(X_r)

In [108]:
# Same 90/10 split and random_state as before, applied to the PCA features.
pca_split = train_test_split(
    X_r,
    y,
    test_size=0.1,
    random_state=3,
)
X_rtrain, X_rtest, y_rtrain, y_rtest = pca_split

 - Running the Ridge function on the [svd, eig] solvers

In [109]:
from cuml.linear_model import Ridge
algos = ["svd", "eig"]
pca_nl_models = []
pca_nl_metrics = []
for algo in algos:
    lr = Ridge(alpha=1, solver=algo)
    %time lr.fit(X_rtrain, y_rtrain)
    pca_nl_models.append(lr)
    pca_nl_metrics.append(get_LR_metrics(lr, X_rtest, y_rtest))

In [110]:
# One row per Ridge solver, one column per metric.
ridge_stats = cudf.DataFrame(
    pca_nl_metrics,
    index=['ridge_svd', 'ridge_eig'],
    columns=['R2', 'MAE', 'MSE'],
)
ridge_stats

In [111]:
# Stack the Ridge results on top of the LinearRegression results into one
# comparison table (row order: Ridge rows first, then the linear solvers).
# cudf.concat is used because DataFrame.append is deprecated and has been
# removed from recent pandas / cuDF releases.
stats = cudf.concat([ridge_stats, linear_stat])
stats

 - Using PCA + Ridge with same preprocessing steps on test data

In [128]:
# Load the test set and preview five random rows.
test_csv = data_path + 'test.csv'
t = cudf.read_csv(test_csv)
t.sample(5)

In [129]:
# Drop the target from the numeric-column list before it is used to select
# columns from the test frame (presumably test.csv has no SalePrice column).
# The list is rebuilt instead of calling list.remove() so that re-running this
# cell is harmless; remove() raises ValueError once the name is already gone.
discrete = [name for name in discrete if name != 'SalePrice']

In [130]:
# Keep the numeric columns of the test set, then drop the first of them,
# mirroring the column slice used to build X.
numeric_test = t[discrete]
test = numeric_test.iloc[:, 1:]

In [131]:
# Per-column count of missing values in the test features.
test.isnull().sum()

In [132]:
# Fill each test column's missing entries with that column's own median.
test_medians = test.median()
test = test.fillna(test_medians)

In [133]:
# Collect the test columns whose absolute skewness is above 0.5.
t_skew_features = []
for name in test.columns:
    if abs(test[name].skew()) > 0.5:
        t_skew_features.append(name)
print(len(t_skew_features), t_skew_features)

 - Log transform dataset

In [134]:
# Push the test features through the SAME preprocessing the Ridge models were
# trained on. The previous version log-transformed the training frame X
# instead of the test frame, skipped the feature-scaling step, and refitted
# PCA and the scaler on the test data, so the model was fed features from a
# different space than the one it was trained in.

# 1. Work on a copy, with columns in the training order, so `test` itself is
#    left untouched and re-running this cell does not log-transform twice.
test_log = test[list(X.columns)].copy()

# 2. Log-transform exactly the columns that were log-transformed in X
#    (the list derived from the training data, not from the test data).
for feature in skewed_feauture:
    test_log[feature] = cupy.log1p(test_log[feature])

# 3. Scale with statistics learned from the training features. `ss` was
#    refitted on the PCA output after the first scaling step, so the feature
#    scaler is rebuilt here from X (same data it was originally fitted on).
feature_scaler = StandardScaler().fit(X)
test_sc = feature_scaler.transform(test_log)

# 4. Project with the PCA already fitted on the training features, then
#    standardise with `ss`, which now holds the training PCA-output statistics.
#    transform() is used in both cases: nothing is fitted on test data.
X_r = pca.transform(test_sc)
X_test = ss.transform(X_r)
print(X_test.shape)

In [148]:
# Use the first PCA + Ridge model (fitted with the "svd" solver) to predict
# on the preprocessed test features.
ridge_model = pca_nl_models[0]
y_pred = ridge_model.predict(X_test)

In [151]:
# Assemble the submission frame: the test Ids (cast to int32) next to the
# predicted sale prices.
predictions = {
    'Id': t['Id'].astype('int32'),
    'SalePrice': y_pred,
}
sub = cudf.DataFrame(predictions)
# info() prints its own summary and returns None; wrapping it in print()
# would only add a stray "None" line.
sub.info()
# Preview the first rows instead of dumping the whole frame.
sub.head()

In [152]:
# Write the submission to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)