# Module Demonstration Notebook

This notebook demonstrates how to use our custom `deviceMatching_V1` module. It includes examples for:

- Data cleaning functions
- Device matching functions
- File search functions

Feel free to modify the examples as needed.


In [2]:
import os
import tempfile

import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process
from user_agents import parse

# Import functions from your module (adjust the module/package name accordingly)
from deviceMatching_V1 import (
    identify_categorical_columns,
    proccess_online_web,
    log_data_clean,
    procces_online_app,
    process_hifpt,
    proccess_telecom,
    load_sentence_transformer,
    precompute_column_embeddings,
    match_all_queries,
    filter_by_brand,
    filter_by_model_exact,
    find_file,
    find_parquet_file
)


  from .autonotebook import tqdm as notebook_tqdm


Successfully imported Device Matching Module


## 1. Data Cleaning Functions

In [4]:
# --- log_data_clean -------------------------------------------------------
# Small log-like frame with one missing value in every column.
raw_log_columns = {
    'col1': ['HELLO', None, 'WORLD'],
    'col2': ['foo', 'BAR', None],
    'user_agent': ['Chrome', None, 'Firefox'],
    'mac': [None, '00:1A:2B:3C:4D:5E', 'AA:BB:CC:DD:EE:FF'],
}
df = pd.DataFrame(raw_log_columns)
print("Original DataFrame:", df, sep="\n")

# In the recorded run the cleaner replaces missing values with 'unknown' and
# lowercases col1/col2, while user_agent and mac keep their original casing.
df_clean = log_data_clean(df)
print("\nCleaned DataFrame using log_data_clean:", df_clean, sep="\n")

# --- proccess_online_web --------------------------------------------------
# Two-row frame shaped like an online-web log, with a missing value in
# user_id, device_brand, user_agent and marketing_name.
online_web_columns = {
    "cdp_id": [1, 2],
    "user_id": [None, "1"],
    "device_brand": ["Samsung", None],
    "device_model": ["Galaxy", "Note"],
    "device_type": ["phone", "tablet"],
    "os": ["Android", "iOS"],
    "os_version": ["10", "14"],
    "user_agent": [None, "Mozilla"],
    "context_device_model": ["SM-G950F", "iPhone"],
    "marketing_name": [None, "Galaxy S10"],
}
df_online_web = pd.DataFrame(online_web_columns)

# In the recorded run the missing user_id is shown as -1 and the other
# missing fields as 'unknown'.
df_online_clean = proccess_online_web(df_online_web)
print("\nCleaned Online Web Data using proccess_online_web:", df_online_clean, sep="\n")


Original DataFrame:
    col1  col2 user_agent                mac
0  HELLO   foo     Chrome               None
1   None   BAR       None  00:1A:2B:3C:4D:5E
2  WORLD  None    Firefox  AA:BB:CC:DD:EE:FF

Cleaned DataFrame using log_data_clean:
      col1     col2 user_agent                mac
0    hello      foo     Chrome            unknown
1  unknown      bar    unknown  00:1A:2B:3C:4D:5E
2    world  unknown    Firefox  AA:BB:CC:DD:EE:FF

Cleaned Online Web Data using proccess_online_web:
   cdp_id user_id device_brand device_model device_type       os os_version  \
0       1      -1      samsung       galaxy       phone  android         10   
1       2       1      unknown         note      tablet      ios         14   

  user_agent context_device_model marketing_name  
0    unknown             sm-g950f        unknown  
1    Mozilla               iphone     galaxy s10  


## 2. Device Matching Functions

In [5]:
# --- filter_by_brand ------------------------------------------------------
# Minimal device dictionary: only the "Brand" column is supplied here.
brand_dictionary_columns = {"Brand": ["Apple", "Samsung", "Google", "OnePlus"]}
df_brand = pd.DataFrame(brand_dictionary_columns)

# Case 1: the log brand is spelled exactly like a dictionary entry.
log_brand_exact = "Samsung"
result_exact = filter_by_brand(log_brand_exact, df_brand)
print("Exact match for 'Samsung' in filter_by_brand:", result_exact, sep="\n")

# Case 2: deliberate typo, to exercise the fuzzy path.
log_brand_fuzzy = "samsng"
result_fuzzy = filter_by_brand(log_brand_fuzzy, df_brand)
print("\nFuzzy match for 'samsng' in filter_by_brand:", result_fuzzy, sep="\n")

# Case 3: a brand that is absent from the dictionary.
log_brand_none = "Nokia"
result_none = filter_by_brand(log_brand_none, df_brand)
print("\nNo match for 'Nokia' in filter_by_brand (should be empty):", result_none, sep="\n")

# --- filter_by_model_exact ------------------------------------------------
# Device dictionary with a marketing-style "Model Name" and a "Models" column
# holding one or more model codes per row.
model_dictionary_columns = {
    "Model Name": ["iPhone 12", "Galaxy S21", "Pixel 5", "Nord"],
    "Models": ["A2172, A2402", "SM-G991B", "GD1YQ", "A series"],
}
df_model = pd.DataFrame(model_dictionary_columns)

# NOTE(review): in the recorded run all three calls below print
# {'matched_model_name': None, 'matched_models': None, 'source': 'No Match'},
# including the two cases labelled as matches. Presumably the function expects
# normalized (e.g. lowercased) inputs or dictionary values -- TODO confirm
# against deviceMatching_V1 and adjust this demo data or the function.

# Case 1: log model identical to a "Model Name" entry, no marketing name.
log_model = "Galaxy S21"
log_marketing_name = "unknown"
result_model_exact = filter_by_model_exact(log_model, log_marketing_name, df_model)
print("\nExact match for 'Galaxy S21' in filter_by_model_exact:", result_model_exact, sep="\n")

# Case 2: neither the model nor the marketing name is known.
result_model_unknown = filter_by_model_exact("unknown", "unknown", df_model)
print(
    "\nNo match when both model and marketing name are 'unknown' in filter_by_model_exact:",
    result_model_unknown,
    sep="\n",
)

# Case 3: log model equal to a code listed in the "Models" column.
log_model_contains = "GD1YQ"
result_model_contains = filter_by_model_exact(log_model_contains, "unknown", df_model)
print("\nContains match for 'GD1YQ' in filter_by_model_exact:", result_model_contains, sep="\n")


Exact match for 'Samsung' in filter_by_brand:
     Brand
1  Samsung

Fuzzy match for 'samsng' in filter_by_brand:
     Brand
1  Samsung

No match for 'Nokia' in filter_by_brand (should be empty):
Empty DataFrame
Columns: []
Index: []

Exact match for 'Galaxy S21' in filter_by_model_exact:
{'matched_model_name': None, 'matched_models': None, 'source': 'No Match'}

No match when both model and marketing name are 'unknown' in filter_by_model_exact:
{'matched_model_name': None, 'matched_models': None, 'source': 'No Match'}

Contains match for 'GD1YQ' in filter_by_model_exact:
{'matched_model_name': None, 'matched_models': None, 'source': 'No Match'}


## 3. File Search Functions


In [6]:
# File-search demo. `tempfile` is imported in the notebook's top import cell.
# Each expectation stated in a print label is also asserted, so a
# Restart & Run All fails loudly if the search helpers regress.

# --- find_file ------------------------------------------------------------
# Throwaway directory holding a single text file, looked up by file name.
with tempfile.TemporaryDirectory() as tmp_dir:
    test_file_name = "test.txt"
    test_file_path = os.path.join(tmp_dir, test_file_name)
    with open(test_file_path, "w") as f:
        f.write("dummy content")

    found_path = find_file(test_file_name, tmp_dir)
    print("find_file result (should find test.txt):")
    print(found_path)
    # samefile compares the files on disk, so the check does not depend on
    # how the returned path happens to be spelled.
    assert found_path is not None and os.path.samefile(found_path, test_file_path), \
        "find_file did not return the file that was just created"

    not_found = find_file("nonexistent.txt", tmp_dir)
    print("\nfind_file result for 'nonexistent.txt' (should be None):")
    print(not_found)
    assert not_found is None, "find_file should return None for a missing file"

# --- find_parquet_file ----------------------------------------------------
# The same helper is queried for a *directory* named like a Parquet dataset
# and for a regular *.parquet file nested in a subfolder.
with tempfile.TemporaryDirectory() as tmp_dir:
    # Create a directory named "data.parquet"
    parquet_dir_name = "data.parquet"
    parquet_dir_path = os.path.join(tmp_dir, parquet_dir_name)
    os.mkdir(parquet_dir_path)

    # Create a subdirectory with a dummy Parquet file (plain text content;
    # only the name is searched for, the file is never read here)
    sub_dir = os.path.join(tmp_dir, "subfolder")
    os.mkdir(sub_dir)
    parquet_file_name = "datafile.parquet"
    parquet_file_path = os.path.join(sub_dir, parquet_file_name)
    with open(parquet_file_path, "w") as f:
        f.write("dummy parquet content")

    found_dir = find_parquet_file(parquet_dir_name, tmp_dir)
    print("\nfind_parquet_file result (should find directory 'data.parquet'):")
    print(found_dir)
    assert found_dir is not None and os.path.samefile(found_dir, parquet_dir_path), \
        "find_parquet_file did not return the 'data.parquet' directory"

    found_file = find_parquet_file(parquet_file_name, tmp_dir)
    print("\nfind_parquet_file result (should find file 'datafile.parquet'):")
    print(found_file)
    assert found_file is not None and os.path.samefile(found_file, parquet_file_path), \
        "find_parquet_file did not return the nested 'datafile.parquet' file"

    not_found_parquet = find_parquet_file("nonexistent.parquet", tmp_dir)
    print("\nfind_parquet_file result for 'nonexistent.parquet' (should be None):")
    print(not_found_parquet)
    assert not_found_parquet is None, \
        "find_parquet_file should return None for a missing name"


find_file result (should find test.txt):
/tmp/tmpnfaw52x9/test.txt

find_file result for 'nonexistent.txt' (should be None):
None
Found Parquet directory: data.parquet

find_parquet_file result (should find directory 'data.parquet'):
/tmp/tmpq1khsylo/data.parquet
Found Parquet file: datafile.parquet

find_parquet_file result (should find file 'datafile.parquet'):
/tmp/tmpq1khsylo/subfolder/datafile.parquet

find_parquet_file result for 'nonexistent.parquet' (should be None):
None
