In [1]:
# Fetch the rapidsai-csp-utils helper repository and run its Colab pip installer.
# Per the log below, the script reports the detected GPU and installs RAPIDS via pip.
# NOTE(review): the clone is not pinned to a tag/commit, so the installed version can drift between runs.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 476, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 476 (delta 141), reused 124 (delta 91), pack-reused 269[K
Receiving objects: 100% (476/476), 131.59 KiB | 1.07 MiB/s, done.
Resolving deltas: 100% (243/243), done.
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 1.9 MB/s eta 0:00:00
Installing collected packages: pynvml
Successfully installed pynvml-11.5.0
***********************************************************************
Woo! Your instance has a NVIDIA L4 GPU!
We will install the latest stable RAPIDS via pip 24.4.*!  Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==24.4.*
  Downloading https://pypi.nvidia

In [3]:
# Critical imports
import cudf
import cuml
import os
import numpy as np
import pandas as pd

## Move a pandas DataFrame to the GPU

In [4]:
# Build a small pandas frame and store its text column with the 'category' dtype
pandas_df = pd.DataFrame({
    'integers': [1, 2, 3, 4],
    'strings': ['a', 'b', 'c', 'd'],
}).astype({'strings': 'category'})

# Convert the pandas frame into a cuDF DataFrame
gdf = cudf.DataFrame.from_pandas(pandas_df)

# Show the resulting cuDF frame
print(gdf)

   integers strings
0         1       a
1         2       b
2         3       c
3         4       d


# Create objects with cuDF

In [5]:
### Create a Series of integers

int_series = cudf.Series([1, 2, 3, 4, 5, 6])
print(int_series)
print(type(int_series))

### Create a Series of floats

float_series = cudf.Series([1., 2., 3., 4., 5., 6.])
print(float_series)

### Create a Series of strings

str_series = cudf.Series(['a', 'b', 'c'])
print(str_series)

### Create 3 column DataFrame

# NOTE(review): `dt` is not used in this cell; the import is kept in case a later cell relies on it.
import datetime as dt

# Build the frame from a dictionary:
# each key is a column name and each value holds that column's entries.
dated_gdf = cudf.DataFrame({
    # 10 business days starting from 1 January 2019, generated with pandas
    'dates': pd.date_range('1/1/2019', periods=10, freq='B'),
    # Integers 0..9
    'integers': list(range(10)),
    # The same values as floats
    'floats': [float(i) for i in range(10)],
})

# Print dataframe
print(dated_gdf)

### Create 2 column DataFrame

# Same dictionary construction; this frame is bound to `gdf`,
# the name this cell leaves behind for subsequent cells.
gdf = cudf.DataFrame({
    'integers': [1, 2, 3, 4],
    'string': ['a', 'b', 'c', 'd'],
})

print(gdf)












0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64
<class 'cudf.core.series.Series'>
0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64
0    a
1    b
2    c
dtype: object
       dates  integers  floats
0 2019-01-01         0     0.0
1 2019-01-02         1     1.0
2 2019-01-03         2     2.0
3 2019-01-04         3     3.0
4 2019-01-07         4     4.0
5 2019-01-08         5     5.0
6 2019-01-09         6     6.0
7 2019-01-10         7     7.0
8 2019-01-11         8     8.0
9 2019-01-14         9     9.0
   integers string
0         1      a
1         2      b
2         3      c
3         4      d


# Filtering a cuDF DataFrame

In [7]:
# Show the column labels of `gdf`; as the cell's last expression the result is displayed without print()


gdf.columns


Index(['integers', 'string'], dtype='object')

In [8]:
### First rows of the DataFrame

num_of_rows_to_view = 2
print(gdf.head(n=num_of_rows_to_view))

### Last rows of the DataFrame

num_of_rows_to_view = 3
print(gdf.tail(n=num_of_rows_to_view))

   integers string
0         1      a
1         2      b
   integers string
1         2      b
2         3      c
3         4      d


In [12]:

## Filtering

# Method 1: query() with an expression string on the 'integers' column
rows_with_one = gdf.query('integers == 1')
print(rows_with_one)

# Boolean mask on the 'string' column, used to index the frame
string_is_a = gdf['string'] == "a"
print(gdf[string_is_a])

   integers string
0         1      a
   integers string
0         1      a


In [14]:




# Method 2: boolean mask built from a column

# Keep the rows whose 'integers' value equals 2.
# (The text column can be filtered the same way, e.g. gdf['string'] == 'b'.)
integers_is_two = gdf['integers'] == 2
print(gdf[integers_is_two])

# Method 3: row selection

#### Positional slice

# Rows at positions 0 and 1; the stop value 2 is excluded
print(gdf[0:2])

#### Label-based selection with .loc

# .loc takes [rows, columns]. Unlike the positional slice above, the label slice 0:2
# includes its end label, so this prints three rows of the 'integers' column.
print(gdf.loc[0:2, ['integers']])

   integers string
1         2      b
   integers string
0         1      a
1         2      b
   integers
0         1
1         2
2         3


# Intro to cuML

In [15]:
import pickle
import cuml
from cuml.common.device_selection import using_device_type
from cuml.common.device_selection import set_global_device_type, get_global_device_type
from cuml.neighbors import NearestNeighbors
from cuml.manifold import UMAP
from cuml.linear_model import LinearRegression
from cuml.datasets import make_regression, make_blobs
from cuml.model_selection import train_test_split

# Fixed seed so the synthetic data and the train/test splits are reproducible across runs
SEED = 42

# Synthetic clustering data: 2000 samples with 20 features, 20% held out for testing
X_blobs, y_blobs = make_blobs(n_samples=2000, n_features=20, random_state=SEED)
X_train_blobs, X_test_blobs, y_train_blobs, y_test_blobs = train_test_split(
    X_blobs, y_blobs, test_size=0.2, shuffle=True, random_state=SEED
)

# Synthetic regression data with the same size and the same 80/20 split
X_reg, y_reg = make_regression(n_samples=2000, n_features=20, random_state=SEED)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, shuffle=True, random_state=SEED
)
# The held-out regression target was originally bound to the misspelled name `y_tes_reg`;
# keep that name as an alias so any later cell that still uses it keeps working.
y_tes_reg = y_test_reg

nn = NearestNeighbors()
#with using_device_type('gpu'):
with using_device_type('cpu'):
    nn.fit(X_train_blobs)
    nearest_neighbors = nn.kneighbors(X_test_blobs)