In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
#importing required libraries
import cudf
import cupy as cp
import cuml
from cuml import LinearRegression
from cuml.linear_model import LinearRegression
from cuml.model_selection import train_test_split

In [3]:
# Read the competition's train and test CSV files into cuDF dataframes.
train_csv = '../input/house-prices-advanced-regression-techniques/train.csv'
test_csv = '../input/house-prices-advanced-regression-techniques/test.csv'

traindf = cudf.read_csv(train_csv)
testdf = cudf.read_csv(test_csv)

In [4]:
# Preview the first five rows of the training frame.
traindf.head(5)

In [5]:
# Preview the first five rows of the test frame.
testdf.head(5)

In [6]:
# Keep the target column as `y`, then remove it from the feature frame in place.
# NOTE: this cell is not re-runnable on its own -- once 'SalePrice' has been
# dropped, a second execution fails on the lookup below.
y = traindf['SalePrice']
traindf.drop(columns=['SalePrice'], inplace=True)

In [7]:
# Remember how many rows each split has so the combined frame built later can
# be cut back into its train and test parts.
ntrain = traindf.shape[0]
ntest = testdf.shape[0]
for frame in (traindf, testdf):
    print(frame.shape)

In [8]:
# Draw a heatmap of the missing-value mask of the training features.
import seaborn as sns

# Convert the cuDF frame to pandas before handing its null mask to seaborn.
trainvis = traindf.to_pandas()
sns.heatmap(trainvis.isnull())

In [9]:
# Non-visual check: is there at least one missing value anywhere in the
# training features? (column-wise any(), then any() across columns)
train_has_nulls = traindf.isnull().any().any()
train_has_nulls

In [10]:
# Stack train on top of test so cleaning and encoding are applied to both at
# once. No ignore_index is passed, so the rows keep the labels they had in
# their source frames; later cells cut the frame apart by position (iloc).
frames = [traindf, testdf]
d = cudf.concat(frames, axis=0)
d.shape

In [11]:
# Drop columns with more than 20% missing values, and record which columns
# hold strings so the imputation step can fill those with the mode.
MISSING_FRACTION_LIMIT = 0.2

# Null counts are taken over `d` (train + test combined), so the cutoff has to
# be a fraction of len(d). Deriving it from len(traindf) would make the
# effective limit roughly half of the intended 20%.
max_missing = MISSING_FRACTION_LIMIT * len(d)

# Collected before dropping, so this list may also name columns removed below;
# it is only ever used for membership tests.
categoricalfeatures = [c for c in d.keys() if d[c].dtype == 'object']
todrop = [c for c in d.keys() if d[c].isnull().sum() > max_missing]

d = d.drop(todrop, axis=1)

In [12]:
# Fill the remaining gaps: string columns get their most frequent value,
# every other column gets its mean.
for c in d.keys():
    column = d[c]
    if column.isnull().sum() == 0:
        continue
    if c in categoricalfeatures:
        fill_value = column.mode()[0]
    else:
        fill_value = column.mean()
    # Assign the filled column back explicitly. An in-place fillna on the
    # Series returned by d[c] is not guaranteed to write through to `d`.
    d[c] = column.fillna(fill_value)

In [13]:
# Re-draw the missing-value heatmap on the combined frame to confirm that the
# imputation left no gaps.
d1 = d.to_pandas()
sns.heatmap(d1.isnull())

In [14]:
# Same confirmation without a plot: is any value in the combined frame still
# missing?
combined_has_nulls = d.isnull().any().any()
combined_has_nulls

In [15]:
# Replace every string column with integer codes from a cuML LabelEncoder.
# The single encoder instance is re-fitted for each column.
le = cuml.preprocessing.LabelEncoder()
object_columns = [name for name in d.columns if d[name].dtype == 'O']
for name in object_columns:
    as_text = d[name].astype(str)
    le.fit(as_text)
    d[name] = le.transform(as_text)

In [16]:
# One-hot encode whatever get_dummies still treats as categorical, then cut
# the combined frame back into its train and test parts by position.
d = cudf.get_dummies(d)

# The first `ntrain` rows came from the training file; everything after them
# is the test set. The boundary is `ntrain`, not `ntest + 1` -- the two only
# coincide when the test set has exactly one row fewer than the training set.
traindf = d.iloc[:ntrain, :]
testdf = d.iloc[ntrain:, :]

# Guard against a wrong split: each part must have its original row count.
assert traindf.shape[0] == ntrain, "train split has an unexpected row count"
assert testdf.shape[0] == ntest, "test split has an unexpected row count"

print(traindf.shape)
print(testdf.shape)

In [17]:
# Hold out 30% of the training rows for validation, with a fixed seed so the
# split is reproducible.
x = traindf
split_parts = cuml.train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [18]:
# Show the shape of each piece of the split: features first, then targets.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

In [19]:
# Fit a cuML LinearRegression with each solver and report hold-out metrics.
algorithm = ['svd', 'eig', 'svd-qr', 'qr','svd-jacobi']

# Score everything in float64. Casting predictions to int64 truncates them
# before the error is measured, and float32 is coarse for prices of this size;
# using one dtype also keeps the three metrics consistent with each other.
y_true = y_test.astype('float64')

for i in algorithm:
    print("Algorithm:",i)
    lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = i)
    reg = lr.fit(x_train,y_train)
    preds = lr.predict(x_test).astype('float64')
    print("MSE:",cuml.metrics.regression.mean_squared_error(y_true,preds))
    print("R2 Score:",cuml.metrics.regression.r2_score(y_true,preds))
    print("MAE:",cuml.metrics.regression.mean_absolute_error(y_true,preds))
    print()

In [20]:
# Refit on the full training set (no hold-out) with the 'eig' solver and
# predict prices for the competition test rows.
model = cuml.LinearRegression(algorithm='eig')
trained = model.fit(x, y)
y_predict = trained.predict(testdf)

In [21]:
# Load the sample submission; it is used as the template for the output file.
sample_csv = '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
sample = cudf.read_csv(sample_csv)
sample.head()

In [22]:
# Overwrite the template's SalePrice column with the model predictions and
# display it.
# NOTE(review): if y_predict is an indexed Series, this assignment aligns on
# index labels -- confirm they match the template's rows.
sample['SalePrice'] = y_predict
sample['SalePrice']

In [23]:
# Write the submission file to the working directory, without the index column.
submission_path = 'submission.csv'
sample.to_csv(submission_path, index=False)