In [1]:
!nvidia-smi

Sun Sep 28 17:33:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
#  RAPIDS installation
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 603, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 603 (delta 131), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (603/603), 199.38 KiB | 446.00 KiB/s, done.
Resolving deltas: 100% (305/305), done.
Installing RAPIDS remaining 25.08 libraries
Using Python 3.12.11 environment at: /usr
Resolved 180 packages in 12.62s
Prepared 41 packages in 49.22s
Uninstalled 31 packages in 1.08s
Installed 41 packages in 393ms
 - bokeh==3.7.3
 + bokeh==3.6.3
 + cucim-cu12==25.8.0
 + cuda-bindings==12.9.2
 + cuda-pathfinder==1.2.3
 - cuda-python==12.6.2.post1
 + cuda-python==12.9.2
 - cudf-cu12==25.6.0 (from https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl)
 + cudf-cu12==25.8.0
 + cugraph-cu12==25.8.0
 - cuml-cu12==25.6.0
 + cuml-cu12==25.8.0
 - cuvs-cu12==25.6.1
 + cuvs-cu1

In [105]:
import cudf
import cuml
import numpy as np
import pandas as pd
import os
import glob


## simple test for the rapids

In [1]:
#just testing

df = cudf.DataFrame({
    'x': np.random.randn(1000),
    'y': np.random.randn(1000)
})

print(df.head())
print(f"Memory usage: {df.memory_usage().sum()} bytes")

# cuML for machine learning
from cuml.linear_model import LinearRegression

X = cudf.DataFrame({'feature': np.random.randn(1000)})
y = cudf.Series(np.random.randn(1000))

model = LinearRegression()
model.fit(X, y)
predictions = model.predict(X)

          x         y
0 -0.457082 -1.042492
1 -0.728086  0.907995
2  0.248947 -0.990382
3 -0.275262  0.606827
4 -0.351720  1.266213
Memory usage: 16000 bytes


  ret = func(*args, **kwargs)


## lets get to work !
### first we have to choose an adequate dataset , airline dataset is what is mostly used by rapids users , so we will work with it

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("iamsouravbanerjee/airline-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/iamsouravbanerjee/airline-dataset?dataset_version_number=4...


100%|██████████| 12.5M/12.5M [00:01<00:00, 8.09MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/iamsouravbanerjee/airline-dataset/versions/4


In [43]:
df = cudf.read_csv(os.path.join(path, "Airline Dataset Updated - v2.csv"))

csv_files = glob.glob(os.path.join(path, "*.csv"))
print("CSV files found:", csv_files)

if csv_files:
    df = cudf.read_csv(csv_files[0])
    print("Dataset loaded successfully!")
    print(f"Shape: {df.shape}")


CSV files found: ['/root/.cache/kagglehub/datasets/iamsouravbanerjee/airline-dataset/versions/4/Airline Dataset Updated.csv', '/root/.cache/kagglehub/datasets/iamsouravbanerjee/airline-dataset/versions/4/Airline Dataset.csv', '/root/.cache/kagglehub/datasets/iamsouravbanerjee/airline-dataset/versions/4/Airline Dataset Updated - v2.csv']
Dataset loaded successfully!
Shape: (98619, 15)


In [44]:
print(df.head())

  Passenger ID First Name Last Name  Gender  Age Nationality  \
0       ABVWIg     Edithe    Leggis  Female   62       Japan   
1       jkXXAX     Elwood      Catt    Male   62   Nicaragua   
2       CdUz2g      Darby   Felgate    Male   67      Russia   
3       BRS38V   Dominica      Pyle  Female   71       China   
4       9kvTLo        Bay   Pencost    Male   21       China   

                Airport Name Airport Country Code   Country Name  \
0           Coldfoot Airport                   US  United States   
1          Kugluktuk Airport                   CA         Canada   
2     Grenoble-Isère Airport                   FR         France   
3  Ottawa / Gatineau Airport                   CA         Canada   
4            Gillespie Field                   US  United States   

  Airport Continent     Continents Departure Date Arrival Airport  \
0               NAM  North America      6/28/2022             CXF   
1               NAM  North America     12/26/2022             YCO   

In [45]:
df.shape

(98619, 15)

we have almost 10 000 lignes  , with 15 rows

In [46]:
df.columns

Index(['Passenger ID', 'First Name', 'Last Name', 'Gender', 'Age',
       'Nationality', 'Airport Name', 'Airport Country Code', 'Country Name',
       'Airport Continent', 'Continents', 'Departure Date', 'Arrival Airport',
       'Pilot Name', 'Flight Status'],
      dtype='object')

In [47]:
df.dtypes

Unnamed: 0,0
Passenger ID,object
First Name,object
Last Name,object
Gender,object
Age,int64
Nationality,object
Airport Name,object
Airport Country Code,object
Country Name,object
Airport Continent,object


for the column of Departure date , which is obviously a date , with rapids it has been read as object/string so we would have to change it up later

In [48]:
#we check for null values
df.isnull().sum()

Passenger ID             0
First Name               0
Last Name                0
Gender                   0
Age                      0
Nationality              0
Airport Name             0
Airport Country Code     0
Country Name             0
Airport Continent        0
Continents               0
Departure Date           0
Arrival Airport         10
Pilot Name               0
Flight Status            0
dtype: int64

In [49]:
#we check for duplicates
df.duplicated().sum()

np.int64(0)

In [67]:
df.drop(columns=['Pilot Name','Passenger ID','First Name','Last Name'],inplace=True)

### Missing Values
since we have a single column that deals with missing values , which is the arrival airport, we can just drop those rows orrrrrrrr check if they are related to cancelled fligths

In [68]:
df[df['Arrival Airport'].isnull()==True]

Unnamed: 0,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Departure Date,Arrival Airport,Flight Status,Departure_DateTime,Departure_DayOfWeek,Departure_Month,Departure_Day,Departure_Hour,Is_Weekend,Is_HolidaySeason
23265,0,35,Madagascar,Narrabri Airport,AU,Australia,4,4,9/10/2022,,Cancelled,2022-09-10,5,9,10,0,1,0
27941,0,38,Philippines,Narrabri Airport,AU,Australia,4,4,11/27/2022,,Cancelled,2022-11-27,6,11,27,0,1,1
38671,1,74,Peru,Narrabri Airport,AU,Australia,4,4,1/5/2022,,Cancelled,2022-01-05,2,1,5,0,0,0
41234,1,74,Croatia,Narrabri Airport,AU,Australia,4,4,1/15/2022,,Cancelled,2022-01-15,5,1,15,0,1,0
93895,1,33,Croatia,Narrabri Airport,AU,Australia,4,4,9/10/2022,,Cancelled,2022-09-10,5,9,10,0,1,0


In [69]:
#here what we can do is keep the ones with cancelled , but drop the otehrs
issue_rows = df[(df['Arrival Airport'].isnull()) & (df['Flight Status'] != 'Cancelled') &(df['Flight Status'] != 'Diverted')  ]
df.drop(issue_rows.index,inplace=True)

In [70]:
len(df)

98614

In [71]:
#just 5 rows but it is theidea that matters :)

### Data Types

In [72]:
#df['Departure Date'] = cudf.to_datetime(df['Departure Date'])
"""
the first methode directly will cause an error cause cudf does not work when the datetime have different formats , the easiest solutions is to convert to pandas then reconvert to cudf
"""
df_pandas = df.to_pandas()
df_pandas['Departure_DateTime'] = pd.to_datetime(df_pandas['Departure Date'], errors='coerce')
df = cudf.from_pandas(df_pandas)


## do some magic of gpu to cpu happen in the shadows when we convert from pandas to cudf
kjlk

In [73]:
# Date components
df['Departure_DayOfWeek'] = df['Departure_DateTime'].dt.dayofweek  # 0=Monday, 6=Sunday
df['Departure_Month'] = df['Departure_DateTime'].dt.month
df['Departure_Day'] = df['Departure_DateTime'].dt.day
df['Departure_Hour'] = df['Departure_DateTime'].dt.hour

# Derived features
df['Is_Weekend'] = (df['Departure_DayOfWeek'] >= 5).astype(int)  # 1 if Sat/Sun
df['Is_HolidaySeason'] = df['Departure_Month'].isin([11, 12]).astype(int)  # Nov/Dec holidays
#df['TimeOfDay'] = df['Departure_Hour'].apply(lambda x: 'Morning' if x < 12 else 'Afternoon' if x < 18 else 'Evening')

In [74]:
#map doesnt work with cudf
# mm

In [80]:
df.drop(columns=['Departure Date'],inplace=True)

In [97]:
df['Gender'] = df['Gender'].astype('category').cat.codes
df['Airport Continent'] = df['Airport Continent'].astype('category').cat.codes
df['Continents'] = df['Continents'].astype('category').cat.codes
df['Flight Status'] = df['Flight Status'].astype('category').cat.codes

#we encode the ones with not many values

In [82]:
df['Gender']

0        0
1        1
2        1
3        0
4        1
        ..
98614    1
98615    0
98616    1
98617    0
98618    0
Name: Gender, Length: 98614, dtype: uint8

In [99]:
# Example approach
print(df['Arrival Airport'].value_counts)

<bound method Series.value_counts of 0        CXF
1        YCO
2        GNB
3        YND
4        SEE
        ... 
98614    HAA
98615    IVA
98616    ABC
98617    GGN
98618    JOK
Name: Arrival Airport, Length: 98614, dtype: object>


In [96]:
len(df['Flight Status'].unique())

3

In [85]:
df.columns

Index(['Gender', 'Age', 'Nationality', 'Airport Name', 'Airport Country Code',
       'Country Name', 'Airport Continent', 'Continents', 'Arrival Airport',
       'Flight Status', 'Departure_DateTime', 'Departure_DayOfWeek',
       'Departure_Month', 'Departure_Day', 'Departure_Hour', 'Is_Weekend',
       'Is_HolidaySeason'],
      dtype='object')

In [102]:
#for values with high cardinality we will do frequency encoding
high_cardinality_cols=['Nationality','Airport Name','Airport Country Code','Country Name','Arrival Airport']
for col in high_cardinality_cols:
    # Calculate frequencies
    freq_map = df[col].value_counts()

    df[f'{col}_freq'] = df[col].map(freq_map)

In [103]:
df.head()

Unnamed: 0,Gender,Age,Nationality,Airport Name,Airport Country Code,Country Name,Airport Continent,Continents,Arrival Airport,Flight Status,...,Departure_Month,Departure_Day,Departure_Hour,Is_Weekend,Is_HolidaySeason,Nationality_freq,Airport Name_freq,Airport Country Code_freq,Country Name_freq,Arrival Airport_freq
0,0,62,Japan,Coldfoot Airport,US,United States,3,3,CXF,2,...,6,28,0,0,0,1805,11,22104,22104,11
1,1,62,Nicaragua,Kugluktuk Airport,CA,Canada,3,3,YCO,2,...,12,26,0,0,1,203,9,5424,5424,9
2,1,67,Russia,Grenoble-Isère Airport,FR,France,2,2,GNB,2,...,1,18,0,0,0,5693,15,1382,1382,15
3,0,71,China,Ottawa / Gatineau Airport,CA,Canada,3,3,YND,1,...,9,16,0,0,0,18316,7,5424,5424,7
4,1,21,China,Gillespie Field,US,United States,3,3,SEE,2,...,2,25,0,0,0,18316,11,22104,22104,11


# Exploratory Data Analysis (EDA)