In [20]:
import pandas as pd
import numpy as np
import random
import os

# Data preprocessing

In [18]:
def count_zeros_in_columns(csv_file, columns_to_process):
    """
    Report how many entries equal zero in selected columns of a CSV file.

    Parameters:
    csv_file (str): Path of the CSV file to load.
    columns_to_process (list): Names of the columns to inspect.

    Returns:
    dict: Maps every requested column name to a dictionary with the keys
    'Zero Count' and 'Zero Percentage (%)'. The percentage is relative to the
    total number of rows in the file. For a column that is absent from the
    file, both values are the string 'Column not found'.
    """
    frame = pd.read_csv(csv_file)
    total_rows = len(frame)

    summary = {}
    for name in columns_to_process:
        # Guard clause: unknown columns get a placeholder entry instead of an error
        if name not in frame.columns:
            summary[name] = {
                'Zero Count': 'Column not found',
                'Zero Percentage (%)': 'Column not found'
            }
            continue

        # Number of cells in this column whose value is exactly 0
        n_zeros = frame[name].eq(0).sum()
        summary[name] = {
            'Zero Count': n_zeros,
            'Zero Percentage (%)': (n_zeros / total_rows) * 100
        }

    return summary


# Report the zero statistics of the three value columns for every raw city file
csv_file_paths = ['Beijing_original.csv', 'Shanghai_original.csv', 'Guangzhou_original.csv', 'Shenzhen_original.csv']
columns_to_process = ['carbon_emissions (ton)', 'population (unit)', 'gdp (million yuan)']
for csv_file_path in csv_file_paths:
    # Banner: separator line, file being processed, separator line
    print(
        "--------------------------------------------------",
        f"Processing {csv_file_path}...",
        "--------------------------------------------------",
        sep="\n",
    )
    result = count_zeros_in_columns(csv_file_path, columns_to_process)
    for column_name, stats in result.items():
        # One three-line entry per column, followed by a blank line
        print(
            f"Column: {column_name}",
            f"Zero Count: {stats['Zero Count']}",
            f"Zero Percentage: {stats['Zero Percentage (%)']}%\n",
            sep="\n",
        )

--------------------------------------------------
Processing Beijing_original.csv...
--------------------------------------------------
Column: carbon_emissions (ton)
Zero Count: 6
Zero Percentage: 0.13080444735120994%

Column: population (unit)
Zero Count: 0
Zero Percentage: 0.0%

Column: gdp (million yuan)
Zero Count: 15
Zero Percentage: 0.3270111183780248%

--------------------------------------------------
Processing Shanghai_original.csv...
--------------------------------------------------
Column: carbon_emissions (ton)
Zero Count: 676
Zero Percentage: 13.080495356037153%

Column: population (unit)
Zero Count: 514
Zero Percentage: 9.945820433436532%

Column: gdp (million yuan)
Zero Count: 4
Zero Percentage: 0.07739938080495357%

--------------------------------------------------
Processing Guangzhou_original.csv...
--------------------------------------------------
Column: carbon_emissions (ton)
Zero Count: 133
Zero Percentage: 3.926778860348391%

Column: population (unit)
Zero 

In [25]:
def remove_rows_with_zeros(csv_file, columns_to_check):
    """
    Remove rows that contain a zero in any of the specified columns.

    Args:
        csv_file (pandas.DataFrame): The table to filter. Despite the parameter
            name this is an already-loaded DataFrame, not a file path.
        columns_to_check (list): A list of column names to check for zeros.

    Returns:
        pandas.DataFrame: A new DataFrame containing only the rows whose values
        in ``columns_to_check`` are all non-zero. The input is left unmodified
        and the surviving rows keep their original index labels.

    Raises:
        KeyError: If a name in ``columns_to_check`` is not a column of the DataFrame.
    """
    # Build one boolean mask for the whole table instead of dropping rows one
    # at a time: row-wise drop() is quadratic and removes rows by *label*, so
    # with duplicate index labels it would also discard rows without zeros.
    has_zero = (csv_file[list(columns_to_check)] == 0).any(axis=1)

    # Select positionally (NumPy mask) so duplicate labels cannot interfere,
    # and copy so later in-place edits do not touch the caller's frame.
    return csv_file.loc[~has_zero.to_numpy()].copy()


def apply_log_transformation(csv_file, columns_to_transform):
    """
    Applies a natural-logarithm transformation to specified columns of a DataFrame.

    Parameters:
    csv_file (pandas.DataFrame): The table to transform. Despite the parameter
        name this is an already-loaded DataFrame, not a file path. It is not modified.
    columns_to_transform (list): A list of column names to apply the transformation to.
        Names that are not columns of the table are reported with a printed
        message and otherwise skipped.

    Returns:
    pandas.DataFrame: A transformed copy of the input. Columns not listed in
    ``columns_to_transform`` are carried over unchanged.
    """
    # Work on a copy: transforming the caller's frame in place would make the
    # call non-idempotent (running the same notebook cell twice would take the
    # logarithm twice).
    data = csv_file.copy()

    for column_name in columns_to_transform:
        if column_name in data.columns:
            # np.log yields -inf for 0 and NaN for negative values, so zeros
            # should be filtered out before calling this function.
            data[column_name] = np.log(data[column_name])
        else:
            print(f"Column '{column_name}' not found in the CSV file.")

    return data

In [26]:
# Load the raw Shanghai table; the next cell renames, filters and log-transforms it.
file_path = 'Shanghai_original.csv'  
data = pd.read_csv(file_path)
# Preview the first rows to check the column names and value formats.
data.head()

Unnamed: 0,satellite_img_name,BD09 coordinate,WGS84 coordinate,carbon_emissions (ton),population (unit),gdp (million yuan)
0,Shanghai/16_13167_3525_s.jpg,"(13167,3525)","(30.99272409355233, 121.11860375494845)",248.64958,1139,15321.365
1,Shanghai/16_13232_3555_s.jpg,"(13232,3555)","(31.23017054642347, 121.71651590238979)",730.37244,5294,47379.03
2,Shanghai/16_13223_3531_s.jpg,"(13223,3531)","(31.04026134561536, 121.63372806659022)",377.9839,3943,8707.273
3,Shanghai/16_13239_3552_s.jpg,"(13239,3552)","(31.206452906350798, 121.78090644134501)",675.47314,1154,0.0
4,Shanghai/16_13208_3537_s.jpg,"(13208,3537)","(31.087774637460512, 121.4957483402576)",965.55444,3901,48677.527


In [27]:
# Short column names used by the rest of the notebook
column_name_mapping = {
    'BD09 coordinate': 'Coordinate',
    'carbon_emissions (ton)': 'carbon',
    'population (unit)': 'population',
    'gdp (million yuan)': 'gdp'
}

columns_to_delete = ['satellite_img_name', 'WGS84 coordinate']
columns_to_check = ['carbon', 'population', 'gdp']
columns_to_transform = ['carbon', 'population', 'gdp']

# Rename -> drop unused columns -> drop rows containing zeros -> log-transform.
# Zero rows are removed before the log step so that np.log never sees a 0.
data = (
    data
    .rename(columns=column_name_mapping)
    .drop(columns=columns_to_delete)
    .pipe(remove_rows_with_zeros, columns_to_check)
    .pipe(apply_log_transformation, columns_to_transform)
)

data.to_csv('Shanghai.csv', index=False)

# Split data into train/test sets

In [10]:
def process_and_save(file_paths):
    """
    Normalize the 'carbon', 'population' and 'gdp' columns of each CSV file and
    save the result next to the input with a '_refined' suffix.

    Each column is min-max scaled to [-0.5, 0.5] and then shifted by the column
    mean, so the output values lie in [mean - 0.5, mean + 0.5]. A column whose
    values are all identical has zero range and is left unchanged, because
    scaling it would divide by zero and fill the column with NaN.

    Args:
        file_paths (list): A list of file paths to the CSV files.

    Returns:
        None. For every input ``<name><ext>`` a file ``<name>_refined<ext>`` is
        written and a confirmation line is printed.

    Raises:
        KeyError: If a file lacks one of the three expected columns.
    """
    columns_to_process = ['carbon', 'population', 'gdp']

    for file_path in file_paths:
        df = pd.read_csv(file_path)
        means = df[columns_to_process].mean()
        mins = df[columns_to_process].min()
        maxs = df[columns_to_process].max()

        for column in columns_to_process:
            range_width = maxs[column] - mins[column]
            if range_width == 0:
                # Constant column: every value already equals the mean.
                continue
            # Vectorized form of ((x - min) / range - 0.5) + mean
            df[column] = (df[column] - mins[column]) / range_width - 0.5 + means[column]

        # Derive the output name from the extension rather than str.replace:
        # replace() returns the input path unchanged when it contains no
        # '.csv' substring, which would silently overwrite the input file.
        root, extension = os.path.splitext(file_path)
        output_file_path = f"{root}_refined{extension}"

        df.to_csv(output_file_path, index=False)

        print(f"File {file_path} processed and saved to {output_file_path}")

# Per-city tables to normalize; each one yields a matching '<city>_refined.csv'
csv_files = ['Beijing.csv', 'Shanghai.csv', 'Guangzhou.csv', 'Shenzhen.csv']

process_and_save(csv_files)

File Beijing.csv processed and saved to Beijing_refined.csv
File Shanghai.csv processed and saved to Shanghai_refined.csv
File Guangzhou.csv processed and saved to Guangzhou_refined.csv
File Shenzhen.csv processed and saved to Shenzhen_refined.csv


In [28]:
def split_and_save_datasets(csv_filenames, train_ratio=0.8):
    """
    Shuffle each CSV file and write a training part and a testing part.

    The rows are shuffled with a fixed seed (random_state=42), so repeated runs
    produce the same split. The first ``int(len * train_ratio)`` shuffled rows
    become the training set and the remaining rows the testing set.

    Parameters:
    - csv_filenames (list): List of CSV file names to be processed.
    - train_ratio (float): Fraction of the rows assigned to training. Default is 0.8.

    Returns:
    None. For an input ``<name>.csv`` the files ``<name>_train.csv`` and
    ``<name>_test.csv`` are written and both names are printed.
    """
    for source_path in csv_filenames:
        # Load and shuffle in one step; the fixed seed keeps the split reproducible
        shuffled = pd.read_csv(source_path).sample(frac=1, random_state=42)

        # Position of the first row that belongs to the testing set
        cutoff = int(len(shuffled) * train_ratio)

        # Both output names share the input path without its extension
        stem = os.path.splitext(source_path)[0]
        train_filename = stem + "_train.csv"
        test_filename = stem + "_test.csv"

        shuffled.iloc[:cutoff].to_csv(train_filename, index=False)
        shuffled.iloc[cutoff:].to_csv(test_filename, index=False)

        print(f"Saved training dataset file: {train_filename}")
        print(f"Saved testing dataset file: {test_filename}")

# Split every normalized city table into train/test files.
# NOTE(review): the recorded output of this cell lists names such as
# 'Beijing_train.csv', while these inputs would produce
# 'Beijing_refined_train.csv'; the output looks stale, so re-run to confirm.
csv_filenames = [
    "Beijing_refined.csv",
    "Shanghai_refined.csv",
    "Guangzhou_refined.csv",
    "Shenzhen_refined.csv",
]
split_and_save_datasets(csv_filenames)

Saved training dataset file: Beijing_train.csv
Saved testing dataset file: Beijing_test.csv
Saved training dataset file: Shanghai_train.csv
Saved testing dataset file: Shanghai_test.csv
Saved training dataset file: Guangzhou_train.csv
Saved testing dataset file: Guangzhou_test.csv
Saved training dataset file: Shenzhen_train.csv
Saved testing dataset file: Shenzhen_test.csv
