In [2]:
import pandas as pd
import numpy as np

def handle_null_urls(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replaces null (NaN) values in the 'url' column with an empty string.

    This method is preferred over using a placeholder URL, as it avoids
    introducing misleading information or noise into the feature set.
    The absence of a URL is a feature in itself.

    Args:
        df: A pandas DataFrame containing a 'url' column.

    Returns:
        The DataFrame with null values in the 'url' column replaced
        by an empty string.
    """
    
    # Create a copy of the DataFrame to avoid modifying the original data
    df_copy = df.copy()

    # Use the fillna() method to replace all NaN values with an empty string
    # This is an efficient and standard practice for handling missing string data.
    df_copy['url'] = df_copy['url'].fillna('')
    
    return df_copy

# --- Main Script to Load and Process Data ---
# Please update this file path if your CSV is in a different location.
csv_file_path = r'D:\MACHINE_LEARNING\UVCE_NLP\data\raw\scam_dataset.csv'

try:
    # Load the dataset from the specified CSV file path
    print(f"Loading data from: {csv_file_path}")
    df = pd.read_csv(csv_file_path)
    
    # Print the original DataFrame to see the null values
    print("Original DataFrame info:")
    df.info()
    
    # Apply the function to the loaded DataFrame
    processed_df = handle_null_urls(df)
    
    # Print the information for the processed DataFrame to confirm changes
    print("\n" + "="*50 + "\n")
    print("Processed DataFrame info:")
    processed_df.info()
    
    # You can also check a specific row to see the change, for example:
    # print(processed_df[processed_df['url'] == ''])
    
except FileNotFoundError:
    print(f"Error: The file was not found at the specified path: {csv_file_path}")
    print("Please double-check the file path and try again.")
except Exception as e:
    print(f"An error occurred: {e}")



Loading data from: D:\MACHINE_LEARNING\UVCE_NLP\data\raw\scam_dataset.csv
Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  1000 non-null   object
 1   url      384 non-null    object
 2   label    1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


Processed DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  1000 non-null   object
 1   url      1000 non-null   object
 2   label    1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [7]:
df=pd.read_csv("\raw\scam_dataset.csv")

  df=pd.read_csv("\raw\scam_dataset.csv")
  df=pd.read_csv("\raw\scam_dataset.csv")


OSError: [Errno 22] Invalid argument: '\raw\\scam_dataset.csv'

In [8]:
df

Unnamed: 0,message,url,label
0,Your AXIS account has been blocked. Pls click ...,,spam
1,Job for students. Work from home daily. Earn 5...,https://fkrt.in/re-pay,spam
2,Aapka account block ho gaya hai. Pls click on ...,https://docu-share.in/update,spam
3,Your bank account has been frozen due to suspi...,https://verify-sbi.in.net/login,spam
4,"Hello, I am from your bank's fraud department....",https://aadhaar-verification-in.org,spam
...,...,...,...
995,Media cost group cause.,,ham
996,Spend could challenge environmental.,,ham
997,Herself example necessary approach set company...,,ham
998,Now partner agency election together environment.,,ham


In [10]:
df.notnull().count()

message    1000
url        1000
label      1000
dtype: int64