<a href="https://www.kaggle.com/code/shiyamaladevirs/final-obdtrackerid?scriptVersionId=226979384" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/demodataset/Week 10 NikhilkrishnanOBD1.xlsx
/kaggle/input/demodataset/Copy of OBD Sensitive Verticals _ A (1) (1).xlsx


**PLEASE USE GPU T4 x2 For FASTER RESULTS**

In [2]:
!pip install XlsxWriter

import time
import os
import re
import csv
import pandas as pd
import numpy as np

# Try to import cuDF for GPU acceleration; if not available, fall back to Dask.
try:
    import cudf
    gpu_available = True
    print("GPU acceleration available: Using RAPIDS cuDF")
except ImportError:
    gpu_available = False
    import dask.dataframe as dd
    print("GPU acceleration not available: Using Dask")

def load_main_file(main_file, npartitions=4):
    """
    Load the main file (CSV or XLSX) as a DataFrame.
    Uses cuDF if GPU is available; otherwise, uses Dask.
    """
    ext = os.path.splitext(main_file)[1].lower()
    if gpu_available:
        if ext == '.csv':
            print("Reading main CSV file with cuDF on GPU...")
            df = cudf.read_csv(main_file)
        elif ext in ['.xlsx', '.xls']:
            print("Reading main XLSX file with pandas and converting to cuDF on GPU...")
            df_pd = pd.read_excel(main_file, engine='openpyxl')
            # Convert all object columns to string to avoid MixedTypeError
            for col in df_pd.select_dtypes(include=['object']).columns:
                df_pd[col] = df_pd[col].astype(str)
            df = cudf.DataFrame.from_pandas(df_pd)
        else:
            raise ValueError(f"Unsupported file format: {ext}")
    else:
        if ext == '.csv':
            print("Reading main CSV file with Dask...")
            df = dd.read_csv(
                main_file,
                engine='python',
                sep=',',
                quoting=csv.QUOTE_NONE,
                escapechar='\\',
                on_bad_lines='skip',
                encoding='latin1',
                assume_missing=True,
                blocksize="100MB"
            )
        elif ext in ['.xlsx', '.xls']:
            print("Reading main XLSX file with pandas and converting to Dask DataFrame...")
            df_pd = pd.read_excel(main_file, engine='openpyxl')
            df = dd.from_pandas(df_pd, npartitions=npartitions)
        else:
            raise ValueError(f"Unsupported file format: {ext}")
    return df

def load_obd_sensitive(obd_file):
    """
    Load OBD sensitive verticals from an Excel file.
    If the 'vertical' column is missing, the first column is used.
    Returns a set of normalized sensitive keywords.
    """
    obd_df = pd.read_excel(obd_file, engine='openpyxl')
    if 'vertical' not in obd_df.columns:
        print("Column 'vertical' not found in OBD file. Using the first column as 'vertical'.")
        first_col = obd_df.columns[0]
        obd_df['vertical'] = obd_df[first_col]
    obd_df['vertical'] = obd_df['vertical'].astype(str).str.strip().str.lower()
    return set(obd_df['vertical'].dropna())

def ensure_columns(pdf):
    """
    Ensure required columns exist in each partition.
    """
    if 'hub_zone' not in pdf.columns:
        pdf['hub_zone'] = "unknown"
    return pdf

def process_dataframe(df, combined_sensitive):
    """
    Process the dataframe while preserving original text case.
    For filtering, a temporary lower-case combined column is created,
    then dropped so that original capitalization is retained.
    """
    # Strip whitespace from column names but preserve original case
    df.columns = df.columns.str.strip()
    
    # For Dask, ensure every partition has the necessary columns.
    if not gpu_available:
        df = df.map_partitions(ensure_columns)
    
    # Strip whitespace for key text columns (without converting to lower-case)
    text_cols = ['brand', 'product_title', 'vertical', 'hub_name', 'parent_lm_hub', 'hub_zone']
    for col in text_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip()
    
    if not gpu_available:
        df = df.reset_index(drop=True)
    
    total_rows_before = len(df) if gpu_available else df.shape[0].compute()
    print("Total rows before filtering sensitive content:", total_rows_before)
    
    # For filtering, create a temporary combined column with lower-case conversion
    required_cols = ['brand', 'product_title', 'vertical']
    if all(c in df.columns for c in required_cols):
        df['temp_combined'] = (df['brand'].fillna('') + " " + 
                                 df['product_title'].fillna('') + " " + 
                                 df['vertical'].fillna('')).str.lower().str.strip()
        
        pattern = '|'.join([re.escape(word) for word in combined_sensitive])
        if gpu_available:
            mask = ~df['temp_combined'].str.contains(pattern)
            df = df[mask]
        else:
            df = df[~df['temp_combined'].str.contains(pattern, case=False, na=False)]
        df = df.drop(columns=['temp_combined'])
    
    filtered_count = len(df) if gpu_available else df.shape[0].compute()
    print("Rows after filtering sensitive content:", filtered_count)
    
    # Fill missing 'parent_lm_hub' with 'hub_name' (preserving original case)
    if 'hub_name' in df.columns and 'parent_lm_hub' in df.columns:
        df['parent_lm_hub'] = df['parent_lm_hub'].replace(
            {"": np.nan, "unknown": np.nan, "null": np.nan, "nan": np.nan, "n/a": np.nan}
        ).fillna(df['hub_name'])
    
    # Fill missing 'hub_zone' values within each 'hub_name' group.
    def fill_hub_zone(series):
        valid = series[~series.isin(["", "unknown", "null", "nan", "n/a"])]
        fill_value = valid.iloc[0] if not valid.empty else "unknown"
        return series.fillna(fill_value).replace(["", "unknown", "null", "nan", "n/a"], fill_value)
    
    if 'hub_name' in df.columns and 'hub_zone' in df.columns:
        if gpu_available:
            df_pd = df.to_pandas()
            df_pd['hub_zone'] = df_pd.groupby('hub_name')['hub_zone'].transform(fill_hub_zone)
            df = cudf.DataFrame.from_pandas(df_pd)
        else:
            df = df.shuffle(on="hub_name")
            df['hub_zone'] = df.groupby('hub_name')['hub_zone'].transform(fill_hub_zone)
    
    return df

def sample_per_hub(df, n=10):
    """
    Sample up to n rows per 'hub_name' group.
    """
    def sample_group(pdf, n=n):
        return pdf.sample(n=n, random_state=42) if len(pdf) > n else pdf
    if 'hub_name' in df.columns:
        if gpu_available:
            df_pd = df.to_pandas()
            sampled_pd = df_pd.groupby('hub_name').apply(lambda x: sample_group(x, n)).reset_index(drop=True)
            return sampled_pd
        else:
            sampled_ddf = df.groupby('hub_name').apply(sample_group, meta=df._meta)
            final_df = sampled_ddf.compute()
            return final_df
    else:
        if gpu_available:
            return df.to_pandas()
        else:
            return df.compute()

def main(main_file, obd_file, output_file):
    overall_start = time.time()
    
    start = time.time()
    obd_verticals = load_obd_sensitive(obd_file)
    manual_sensitive = {'pampers', 'diaper'}  # Additional keywords.
    combined_sensitive = obd_verticals.union(manual_sensitive)
    print("Combined sensitive keywords:", combined_sensitive)
    print("Time to load OBD sensitive file: {:.2f} seconds".format(time.time() - start))
    
    start = time.time()
    df = load_main_file(main_file)
    print("Time to load main file: {:.2f} seconds".format(time.time() - start))
    
    start = time.time()
    df = process_dataframe(df, combined_sensitive)
    print("Time to process dataframe: {:.2f} seconds".format(time.time() - start))
    
    start = time.time()
    final_df = sample_per_hub(df, n=10)
    print("Time to sample per hub: {:.2f} seconds".format(time.time() - start))
    
    start = time.time()
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        final_df.to_excel(writer, sheet_name="TrackingIDs", index=False)
        obd_df = pd.read_excel(obd_file, engine='openpyxl')
        obd_df.to_excel(writer, sheet_name="OBD Sensitive", index=False)
    print("Time to write Excel: {:.2f} seconds".format(time.time() - start))
    
    overall_time = time.time() - overall_start
    print("Total time taken: {:.2f} seconds".format(overall_time))
    print("Done. Output written to", output_file)

if __name__ == '__main__':
    MAIN_FILE = "/kaggle/input/demodataset/Week 10 NikhilkrishnanOBD1.xlsx"  # Give THE Path of input file
    OBD_FILE = "/kaggle/input/demodataset/Copy of OBD Sensitive Verticals _ A (1) (1).xlsx" # Give THE Path of OBD Sensitive file
    OUTPUT_FILE = "/kaggle/working/processed_data.xlsx"
    main(MAIN_FILE, OBD_FILE, OUTPUT_FILE)

Collecting XlsxWriter
  Downloading XlsxWriter-3.2.2-py3-none-any.whl.metadata (2.8 kB)
Downloading XlsxWriter-3.2.2-py3-none-any.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.1/165.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: XlsxWriter
Successfully installed XlsxWriter-3.2.2
GPU acceleration available: Using RAPIDS cuDF
Column 'vertical' not found in OBD file. Using the first column as 'vertical'.
Combined sensitive keywords: {'tampon', 'womensportbra', 'babybooty', 'girlblouse', 'femaleurinationdevice', 'girlbodysuit', 'maledisorders', 'menstrualcups', 'infantbodysuit', 'roleplaytoy', 'girlswimsuit', 'mensvest', 'adultdiapers', 'pleasureenhancement', 'fertilitykit', 'mentalwellnessproducts', 'condom', 'diaper', 'womenblouse', 'mensboxer', 'girlsleepwear', 'sexualcomboandkit', 'mensbrief', 'breastpump', 'womenboxer', 'infantsleepwear', 'womenpanty', 'femaledisorders', 'mensswimsuit', 'girlinnerwear', 'preg

  sampled_pd = df_pd.groupby('hub_name').apply(lambda x: sample_group(x, n)).reset_index(drop=True)


Time to sample per hub: 5.74 seconds
Time to write Excel: 17.12 seconds
Total time taken: 254.78 seconds
Done. Output written to /kaggle/working/processed_data.xlsx
