In [None]:
# Read the source CSV (absolute path under /mnt/data) into a DataFrame.
# NOTE(review): the file name suggests per-station air-quality, coordinate and
# air-purifier data -- confirm against the actual columns.
new_file_path = '/mnt/data/4번_역별_공기질_위경도_공기청정기정보.csv'
new_data = pd.read_csv(filepath_or_buffer=new_file_path)

# Preview the top five rows as the cell output
new_data.head(n=5)

In [None]:
# Display descriptive statistics for the new dataset.
# describe() is called with its defaults; as the last bare expression in the
# cell, its result is what the notebook renders as the cell output.
# NOTE(review): with default arguments pandas summarises only the numeric
# columns of a mixed-dtype frame -- confirm that is the intended scope.
new_data.describe()


In [None]:
# Revised classification into 5 categories, driven by the 10% / 25% / 75% / 90%
# quantiles of each purifier and air-quality column.
_QUANTILE_LEVELS = [0.1, 0.25, 0.75, 0.9]
_PURIFIER_COLUMNS = ['설치대수_x', '적용면적_x', '처리용량_x']
_AIR_QUALITY_COLUMNS = ['PM10', 'PM2.5', 'CO2', 'HUMIDITY', 'AIRPRESSURE', 'TEMPERATURE']


def _quantile_thresholds(df):
    """Return ``{column: [q10, q25, q75, q90]}`` for the purifier and air-quality columns of ``df``."""
    return {
        column: df[column].quantile(_QUANTILE_LEVELS).tolist()
        for column in _PURIFIER_COLUMNS + _AIR_QUALITY_COLUMNS
    }


def classify_air_purifier_v3(row, thresholds=None):
    """Classify one row into one of five air-purifier adequacy labels.

    The purifier metrics ('설치대수_x', '적용면적_x', '처리용량_x') are examined
    in that order.  For each metric the rules below are tried top to bottom
    and the first rule that matches decides the label, so a milder label from
    an earlier metric takes precedence over a more severe one from a later
    metric:

        value <= q10 and any air-quality column >  its q90 -> '심각한 부족'
        value <= q25 and any air-quality column >  its q75 -> '부족'
        value >= q90 and any air-quality column <  its q10 -> '심각한 과잉'
        value >= q75 and any air-quality column <  its q25 -> '과잉'

    If no rule matches for any metric the row is labelled '적합'.

    Args:
        row: Mapping or ``pandas.Series`` indexable by the three purifier
            columns and the six air-quality columns ('PM10', 'PM2.5', 'CO2',
            'HUMIDITY', 'AIRPRESSURE', 'TEMPERATURE').
        thresholds: Optional ``{column: [q10, q25, q75, q90]}`` covering all
            nine columns.  When omitted, the thresholds are computed from the
            module-level ``new_data`` on every call, as before.  Passing a
            precomputed dict avoids recomputing the quantiles for each row,
            e.g. ``new_data.apply(classify_air_purifier_v3, axis=1,
            thresholds=precomputed)``.

    Returns:
        One of '심각한 부족', '부족', '심각한 과잉', '과잉', '적합'.
    """
    if thresholds is None:
        thresholds = _quantile_thresholds(new_data)

    # The air-quality conditions do not depend on which purifier metric is
    # being examined, so evaluate each of them once per row.
    air_above_q90 = any(row[col] > thresholds[col][3] for col in _AIR_QUALITY_COLUMNS)
    air_above_q75 = any(row[col] > thresholds[col][2] for col in _AIR_QUALITY_COLUMNS)
    air_below_q10 = any(row[col] < thresholds[col][0] for col in _AIR_QUALITY_COLUMNS)
    air_below_q25 = any(row[col] < thresholds[col][1] for col in _AIR_QUALITY_COLUMNS)

    for key in _PURIFIER_COLUMNS:
        q10, q25, q75, q90 = thresholds[key]
        value = row[key]
        if value <= q10 and air_above_q90:
            return '심각한 부족'
        if value <= q25 and air_above_q75:
            return '부족'
        if value >= q90 and air_below_q10:
            return '심각한 과잉'
        if value >= q75 and air_below_q25:
            return '과잉'

    return '적합'  # Default status when no rule fires

# Label every row by running the classifier row-wise and storing the result
# in a new column.
new_data['공기청정기 효용 상태'] = new_data.apply(
    classify_air_purifier_v3,
    axis=1,
)

import ace_tools as tools

tools.display_dataframe_to_user(
    name="Updated Air Purifier Efficiency Classification",
    dataframe=new_data,
)

# Preview the station name, the classifier inputs and the new label column
new_data[
    [
        '역명',
        '설치대수_x', '적용면적_x', '처리용량_x',
        'PM10', 'PM2.5', 'CO2', 'HUMIDITY', 'AIRPRESSURE', 'TEMPERATURE',
        '공기청정기 효용 상태',
    ]
].head()


In [None]:
# Drop rows that are exact duplicates across every column, keeping the first
# occurrence of each.
new_data_unique = new_data.drop_duplicates(subset=None, keep='first')

import ace_tools as tools

tools.display_dataframe_to_user(
    name="Unique Air Purifier Efficiency Classification",
    dataframe=new_data_unique,
)

# Preview the de-duplicated frame as the cell output
new_data_unique.head(n=5)
