In [1]:
import pandas as pd
# Load the two input tables in one pass: the merged crime records first,
# then the LSOA-level table that the rest of the notebook cleans and imputes.
crime, data = (
    pd.read_csv(csv_path)
    for csv_path in ('Crime_Merged.csv', 'London_LSOA.csv')
)

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
data.info()

# Count of missing values in each column of the raw table.
missing_per_variable = data.isna().sum()
print(missing_per_variable)

数据基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5264 entries, 0 to 5263
Data columns (total 29 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Borough                             5264 non-null   object 
 1   LSOA name                           5264 non-null   object 
 2   LSOA code                           5264 non-null   object 
 3   crime                               5264 non-null   float64
 4   population                          5264 non-null   float64
 5   special_ethnic                      5088 non-null   float64
 6   area_sqm                            5088 non-null   float64
 7   greenspace_ratio                    5088 non-null   float64
 8   access_point_density                5088 non-null   float64
 9   greenspace_access_interaction       5088 non-null   float64
 10  police_coverage_ratio               5088 non-null   float64
 11  NDLI                               

In [4]:
import numpy as np

# Normalise column names first so that every name-based lookup below
# (including 'house mean price') works even if the CSV header has stray spaces.
data = data.rename(columns=lambda x: x.strip())

# Treat empty strings and "N/A" placeholders as missing values.
# np.nan is used (rather than pd.NA) so numeric columns keep a float dtype.
data = data.replace(["", "N/A"], np.nan)

# Exclude identifier variables that are not involved in the statistics
columns_to_exclude = ['Borough', 'LSOA name', 'LSOA code']
columns_to_check = [col for col in data.columns if col not in columns_to_exclude]

# Clean 'house mean price': ":" marks a missing price; remaining values may
# contain thousands separators that must be removed before casting to float.
house_price = data['house mean price'].replace(":", np.nan)
if not pd.api.types.is_numeric_dtype(house_price):
    # Only text columns have a .str accessor; skipping this step when the
    # column is already numeric lets the cell be re-run without raising.
    house_price = house_price.str.replace(',', '', regex=False)
data['house mean price'] = house_price.astype(float)

# Keep 'population' as float and treat a population of 0 as missing.
# Replacing with np.nan (not pd.NA) avoids turning the column into object dtype.
data['population'] = data['population'].astype(float).replace(0, np.nan)

# Missing individual statistics
total_samples = len(data)
# Number of rows with at least one missing value among the checked columns
missing_individuals = data[columns_to_check].isna().any(axis=1).sum()
# Share of all rows that have at least one missing value
missing_individuals_ratio = missing_individuals / total_samples

print(f"Number of missing individuals (excluding specified variables): {missing_individuals}")

print(f"Proportion of missing individuals (excluding specified variables): {missing_individuals_ratio:.2%}")

# Per-column missing counts after cleaning
missing_per_variable = data.isna().sum()
print(missing_per_variable)


缺失个体数量（排除指定变量后）：730
缺失个体占总样本比例（排除指定变量后）：13.87%
Borough                                 0
LSOA name                               0
LSOA code                               0
crime                                   0
population                            176
special_ethnic                        176
area_sqm                              176
greenspace_ratio                      176
access_point_density                  176
greenspace_access_interaction         176
police_coverage_ratio                 176
NDLI                                  176
Rented from Local Authority           176
bedless                               176
shared house                          176
households                            176
none                                  176
employment_rate                       176
residents workable                    176
long term unemployed                  176
low level occupation                  176
less work hours                       176
teenager                     

In [None]:
# Use MICE to impute house prices
from fancyimpute import IterativeImputer

# Convert the format of population and crime to float; non-numeric population
# entries are coerced to NaN so the imputer receives purely numeric input.
data['crime'] = data['crime'].astype(float)
data['population'] = pd.to_numeric(data['population'], errors='coerce')

# Create an independent copy of the dataset. A plain assignment would only
# alias `data`, so every write to `data_fill` would also modify `data`.
data_fill = data.copy()

# Exclude identifier variables that are not involved in imputation
columns_to_exclude = ['Borough', 'LSOA name', 'LSOA code']
columns_to_include = [col for col in data_fill.columns if col not in columns_to_exclude]

# Extract data for MICE
mice_data = data_fill[columns_to_include]

# Perform MICE imputation (fixed random_state for reproducible results)
mice_imputer = IterativeImputer(max_iter=200, random_state=0)
mice_filled = mice_imputer.fit_transform(mice_data)

# Convert the imputation result back to a DataFrame. The original index is
# reused so the column assignment below aligns row-for-row with `data_fill`
# regardless of what index `data` carries.
mice_filled_df = pd.DataFrame(
    mice_filled,
    columns=mice_data.columns,
    index=mice_data.index,
)

# Only the imputed 'house mean price' is written back in this step
data_fill['house mean price'] = mice_filled_df['house mean price']

# Check imputation results
print(data_fill['house mean price'].isna().sum())  # Should output 0, indicating no missing values


In [None]:
# There are still 98 LSOAs with missing data, imputing them as well
from fancyimpute import IterativeImputer

# Exclude identifier variables that are not involved in imputation
columns_to_exclude = ['Borough', 'LSOA name', 'LSOA code']
columns_to_include = [col for col in data_fill.columns if col not in columns_to_exclude]

# Extract the portion for imputation
data_to_impute = data_fill[columns_to_include]

# Use IterativeImputer to fill all remaining missing values
# (fixed random_state for reproducible results)
imputer = IterativeImputer(max_iter=200, random_state=0)
imputed_array = imputer.fit_transform(data_to_impute)

# Replace the imputed data back into the dataset
data_fill[columns_to_include] = imputed_array

# Check the imputed data. DataFrame.info() prints its report itself and
# returns None, so it is not wrapped in print() (that would emit "None").
data_fill.info()

# Check if there are still missing values
print("Remaining missing values count:")
print(data_fill.isna().sum())

# Persist the fully imputed table for downstream analysis
data_fill.to_csv("London_cleared.csv", index=False)