![](https://miro.medium.com/max/4096/1*dNJ0fhCPqYsrLeSLCjswig.png)

RAPIDS uses optimized **NVIDIA CUDA®** primitives and high-bandwidth GPU memory to accelerate data preparation and machine learning. The goal of RAPIDS is not only to accelerate the individual parts of the typical data science workflow, but to accelerate the complete end-to-end workflow. 

In this project, we load and preprocess more than 2.9 GB of data and then fit a linear regression model to predict prices.

In [None]:
# Confirm that an NVIDIA GPU is visible to the runtime before using RAPIDS.
!nvidia-smi

In [None]:
# Install RAPIDS and cuML.
# NOTE(review): RAPIDS wheels are usually published under CUDA-suffixed names
# (e.g. cudf-cu12 / cuml-cu12) - confirm that these unsuffixed package names
# install the intended libraries in this environment.
!pip3 install rapids
!pip3 install cuml

In [None]:
# Import modules (RAPIDS)
from cuml import LinearRegression
import cudf, cupy, cuml

import matplotlib.pyplot as plt
import numpy as np
import matplotlib
import re

In [None]:
%%time
# DataFrame (GPU)
data = cudf.read_csv("/kaggle/input/uk-housing-prices-paid/price_paid_records.csv")
data.info()

In [None]:
# Preview the first five rows of the raw data.
data.head()

In [None]:
# Mean of the "Price" column over the whole dataset.
data['Price'].mean()

In [None]:
# Drop the transaction identifier column in place; it is not needed for the analysis.
%time data.drop(['Transaction unique identifier'], axis='columns', inplace=True)

In [None]:
# Preview the first five rows after dropping the identifier column.
data.head()

In [None]:
# "Property Type" visualization: share of records in each category.
column = data['Property Type']
# Factorize once up front so the column is not re-encoded for every category.
codes, categories = column.factorize()
name = categories.to_arrow()
# Row count per category code, in the same order as the labels in `name`.
data_pie = [
    cupy.asnumpy(column[codes == code].count())
    for code in range(len(name))
]

plt.title("Property Type", loc='center')
plt.pie(data_pie, labels=name, autopct='%0.f%%', startangle=90)

In [None]:
# "Old/New" visualization: share of records in each category.
column = data['Old/New']
# Factorize once up front so the column is not re-encoded for every category.
codes, categories = column.factorize()
name = categories.to_arrow()
# Row count per category code, in the same order as the labels in `name`.
data_pie = [
    cupy.asnumpy(column[codes == code].count())
    for code in range(len(name))
]

plt.title("Old/New", loc='center')
plt.pie(data_pie, labels=name, autopct='%0.f%%', startangle=90)

In [None]:
# "Duration" visualization: share of records in each category.
column = data['Duration']
# Factorize once up front so the column is not re-encoded for every category.
codes, categories = column.factorize()
name = categories.to_arrow()
# Row count per category code, in the same order as the labels in `name`.
data_pie = [
    cupy.asnumpy(column[codes == code].count())
    for code in range(len(name))
]

plt.title("Duration", loc='center')
plt.pie(data_pie, labels=name, autopct='%0.f%%', startangle=90)

In [None]:
# "PPDCategory Type" visualization: share of records in each category.
column = data['PPDCategory Type']
# Factorize once up front so the column is not re-encoded for every category.
codes, categories = column.factorize()
name = categories.to_arrow()
# Row count per category code, in the same order as the labels in `name`.
data_pie = [
    cupy.asnumpy(column[codes == code].count())
    for code in range(len(name))
]

plt.title("PPDCategory Type", loc='center')
plt.pie(data_pie, labels=name, autopct='%0.f%%', startangle=90)

In [None]:
# "Record Status - monthly file only" visualization: share of records in each category.
# This column looks unnecessary, so it is not used as a feature in the training phase.

column = data['Record Status - monthly file only']
# Factorize once up front so the column is not re-encoded for every category.
codes, categories = column.factorize()
name = categories.to_arrow()
# Row count per category code, in the same order as the labels in `name`.
data_pie = [
    cupy.asnumpy(column[codes == code].count())
    for code in range(len(name))
]

plt.title("Record Status - monthly file only", loc='center')
plt.pie(data_pie, labels=name, autopct='%0.f%%', startangle=90)

In [None]:
# Expand the "Date of Transfer" column into numeric date parts that the model can use.
# dt_name is the name of the datetime column to expand.

def add_datepart(df, dt_name, drop=True, time=False):
    """Expand a datetime column of ``df`` into numeric date-part columns, in place.

    New columns are named ``<prefix><Part>``, where ``<prefix>`` is ``dt_name``
    with a trailing "Date"/"date" removed (e.g. "SaleDate" -> "SaleYear").
    The parts are Year, Month and Day, plus ``<prefix>Elapsed``, the number of
    whole seconds since the Unix epoch.

    Args:
        df: DataFrame to modify (a cudf.DataFrame in this notebook).
        dt_name: Name of the datetime column to expand.
        drop: If true, remove the original ``dt_name`` column afterwards.
        time: If true, also add Hour, Minute and Second columns.

    Returns:
        None. ``df`` is modified in place.
    """
    dt_column = df[dt_name]

    # Strip a trailing "Date"/"date" so the new column names read naturally.
    targ_name = re.sub(r'[Dd]ate$', '', dt_name)

    # Capitalised because the names are used as column suffixes; the matching
    # ``.dt`` accessor attributes are the lower-case forms.
    attr = ['Year', 'Month', 'Day']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']

    for a in attr:
        df[targ_name + a] = getattr(dt_column.dt, a.lower())

    # Normalise to nanosecond resolution before converting to integers, so the
    # division by 10^9 yields seconds whatever resolution the column was stored in.
    nanoseconds = dt_column.astype('datetime64[ns]').astype('int64')
    df[targ_name + 'Elapsed'] = nanoseconds // 10 ** 9

    if drop:
        df.drop(dt_name, axis=1, inplace=True)

In [None]:
# Cast the date column to datetime64[ns], then expand it into date-part columns.
date_column = 'Date of Transfer'
data[date_column] = data[date_column].astype('datetime64[ns]')
add_datepart(data, date_column, drop=True, time=False)

In [None]:
# Preview the data after the date column was expanded into date parts.
data.head()

In [None]:
# The model cannot learn from string values, so non-numeric columns must be
# encoded as numbers. Preview the encoding of "Old/New": categories, then codes.
codes, categories = data['Old/New'].factorize()
print(categories)
print("\n\n", codes)

In [None]:
# Preview the encoding of "Town/City": categories first, then integer codes.
codes, categories = data['Town/City'].factorize()
print(categories)
print("\n\n", codes)

In [None]:
%%time
# numeric
data['Property Type']=data['Property Type'].factorize()[0].astype('float32')
data['Old/New']=data['Old/New'].factorize()[0].astype('float32')
data['Duration']=data['Duration'].factorize()[0].astype('float32')
data['Town/City']=data['Town/City'].factorize()[0].astype('float32')
data['District']=data['District'].factorize()[0].astype('float32')
data['County']=data['County'].factorize()[0].astype('float32')
data['PPDCategory Type']=data['PPDCategory Type'].factorize()[0].astype('float32')
data['Record Status - monthly file only']=data['Record Status - monthly file only'].factorize()[0].astype('float32')

data['Price'] = data['Price'].astype('float32')

In [None]:
# Summarise the columns and their dtypes after the numeric conversion.
data.info()

In [None]:
# Preview the first five rows after the categorical columns were encoded as numbers.
data.head()

In [None]:
# Number of rows and columns in the prepared data.
data.shape

In [None]:
# Split the data into training (80%) and test (20%) sets.
# 'Record Status - monthly file only' is not included among the features.
feature_columns = [
    'Property Type',
    'Old/New',
    'Duration',
    'Town/City',
    'District',
    'County',
    'PPDCategory Type',
    'Date of TransferYear',
    'Date of TransferMonth',
    'Date of TransferDay',
    'Date of TransferElapsed',
]
tempX = data[feature_columns]
tempY = data[['Price']]

x_train, x_test, y_train, y_test = cuml.train_test_split(tempX, tempY, train_size=0.8)

In [None]:
# Preview the training features.
x_train.head()

In [None]:
# Preview the training labels.
y_train.head()

In [None]:
# Fit a cuML linear regression model on the training split (runs on the GPU).
lr = LinearRegression()
lr.fit(x_train, y_train)

In [None]:
# Predict prices for the test features and keep them in a one-column DataFrame.
temp = lr.predict(x_test)
pred = cudf.DataFrame({"Pred": temp})
# Store the predictions as float64.
pred["Pred"] = pred["Pred"].astype('float64')
pred

In [None]:
# Ground-truth labels: cast "Price" to float64 so it has the same dtype as the predictions.
y_test["Price"] = y_test["Price"].astype('float64')
y_test

In [None]:
# Mean squared error (MSE) between the test labels and the predictions.
cuml.metrics.regression.mean_squared_error(y_test, pred)

In [None]:
# Root mean squared error (RMSE): same metric called with squared=False.
cuml.metrics.regression.mean_squared_error(y_test, pred, squared=False)

In [None]:
# Coefficient of determination (R^2) of the predictions against the test labels.
cuml.metrics.r2_score(y_test, pred)