In [5]:
import os
import csv
import json
import re
import chardet

In [11]:
# Resolve every path relative to the current working directory
base_dir = os.getcwd()

# Define input and output file paths
input_file_path = os.path.join(base_dir, "Input_Files", "spam.csv")
output_file_path = os.path.join(base_dir, "Input_Json_Files", "spam.json")

# Encoding used when chardet cannot make a guess or the file cannot be sampled
FALLBACK_ENCODING = 'utf-8'

# Detect encoding from the raw bytes of the input file
try:
    with open(input_file_path, 'rb') as raw_file:
        result = chardet.detect(raw_file.read())
except FileNotFoundError:
    # Do not crash with a traceback here: the conversion step opens the same
    # path inside its own try/except and prints the "not found" message.
    encoding_type = FALLBACK_ENCODING
else:
    # chardet reports None when the input is empty or it cannot decide;
    # passing None to open() would silently pick the locale default instead.
    encoding_type = result['encoding'] or FALLBACK_ENCODING

# Function to clean text
def clean_text(text: str) -> str:
    """Return *text* with doubled commas removed and whitespace normalized.

    Every ``,,`` pair is deleted, each run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw message text.

    Returns:
        The cleaned message.
    """
    # Drop the commas first: removing them can leave adjacent or edge
    # whitespace behind, which the normalization below then cleans up.
    text = text.replace(',,', '')
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs to one space
    return text.strip()

# List to store cleaned data
cleaned_data = []

try:
    # Read and clean the raw dataset with the detected encoding.
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks
    # are parsed correctly.
    with open(input_file_path, 'r', encoding=encoding_type,
              errors='replace', newline='') as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)

        # Keep only rows that have at least a label and a message
        cleaned_data.extend(
            {
                "label": row[0].strip().lower(),  # Convert label to lowercase
                "message": clean_text(row[1]),
            }
            for row in reader
            if len(row) >= 2
        )

    # Create the necessary directories if they don't exist
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Save cleaned data as JSON (ensure_ascii=False keeps non-ASCII text readable)
    with open(output_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(cleaned_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion completed! JSON file '{output_file_path}' is ready.")
except FileNotFoundError:
    print(f"Error: The file '{input_file_path}' was not found.")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback
    print(f"An error occurred: {e}")


Conversion completed! JSON file 'd:\Project\AI_and_ML\Supervised_Learning\Spam_Detection\Input_Json_Files\spam.json' is ready.
