In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only Kaggle input tree, then
# print each full path on its own line (directories without files print nothing).
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-data/new_data.csv
/kaggle/input/all-other-set/second_set.csv
/kaggle/input/all-other-set/third_set.csv


In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

def preprocess_air_quality_data(file_path, random_state=42):
    """Load one air-quality CSV, clean it and undersample it to balanced PM2.5 classes.

    Processing steps, in order:

    1. Read the CSV and drop the ``StationId`` column when it is present.
    2. Fill missing values in every float64 column with that column's mean.
    3. Parse ``Datetime`` (unparseable values become ``NaT``) and derive the
       ``Year``, ``Month``, ``Day`` and ``Hour`` columns from it.
    4. Bucket ``PM2.5`` into six labelled classes stored in ``PM2.5_Category``.
    5. Randomly undersample every class down to the size of the smallest
       class that actually occurs in the file.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the ``Datetime`` and ``PM2.5`` columns.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` so that reruns select the same
        rows. Pass ``None`` to get non-deterministic sampling.

    Returns
    -------
    pandas.DataFrame
        The balanced rows with a fresh 0..n-1 index. Rows whose ``PM2.5``
        falls outside every bin (negative or still missing) are not included.
        An empty frame is returned when no row receives a category.
    """
    # Load data; tolerate files that do not carry a StationId column.
    df = pd.read_csv(file_path, low_memory=False)
    df = df.drop(columns=['StationId'], errors='ignore')

    # Mean-impute float columns with pandas. SimpleImputer discards columns
    # that are entirely NaN, which makes the assignment back into the frame
    # fail with a shape mismatch; fillna leaves such columns untouched.
    numeric_columns = df.select_dtypes(include=['float64']).columns
    if len(numeric_columns) > 0:
        column_means = df[numeric_columns].mean()
        df[numeric_columns] = df[numeric_columns].fillna(column_means)

    # Process datetime and create time features
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')
    df['Year'] = df['Datetime'].dt.year
    df['Month'] = df['Datetime'].dt.month
    df['Day'] = df['Datetime'].dt.day
    df['Hour'] = df['Datetime'].dt.hour

    # Categorize PM2.5 values.
    # NOTE(review): breakpoints look like the US EPA AQI PM2.5 bands - confirm.
    bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
    labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
    # include_lowest=True keeps readings of exactly 0 in 'Good'; without it
    # they get no category and silently vanish during balancing.
    df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels, include_lowest=True)

    # Classes that really occur, in order of first appearance.
    present_categories = df['PM2.5_Category'].dropna().unique()
    if len(present_categories) == 0:
        # Nothing to balance (empty file or no usable PM2.5 reading).
        return df.iloc[0:0].reset_index(drop=True)

    # value_counts() on a categorical column also reports unobserved classes
    # with a count of 0; ignoring them prevents a target size of 0 that would
    # wipe out every class.
    class_sizes = df['PM2.5_Category'].value_counts()
    min_class_size = int(class_sizes[class_sizes > 0].min())

    # Perform undersampling
    balanced_dfs = []
    for category in present_categories:
        category_df = df[df['PM2.5_Category'] == category]
        if len(category_df) > min_class_size:
            category_df = category_df.sample(min_class_size, random_state=random_state)
        balanced_dfs.append(category_df)

    df_balanced = pd.concat(balanced_dfs, ignore_index=True)

    return df_balanced


# Run the shared preprocessing routine on each source file, in order.
dataset_paths = (
    '/kaggle/input/new-data/new_data.csv',
    '/kaggle/input/all-other-set/second_set.csv',
    '/kaggle/input/all-other-set/third_set.csv',
)
df1, df2, df3 = (preprocess_air_quality_data(csv_path) for csv_path in dataset_paths)

# Stack the three per-file results under a fresh 0..n-1 index.
final_df = pd.concat((df1, df2, df3), ignore_index=True)

# Report row counts. NOTE: df1..df3 are the frames returned by
# preprocess_air_quality_data, so despite the "Original sizes" label these
# are the lengths after undersampling, not the raw file sizes.
print("Original sizes:", *map(len, (df1, df2, df3)))
print("Combined balanced size:", final_df.shape[0])

Original sizes: 159804 119304 73050
Combined balanced size: 352158


In [4]:
# Persist the combined frame to the Kaggle working directory.
# NOTE: with to_csv's defaults the row index is written as an unnamed first column.
final_df.to_csv(
    path_or_buf='/kaggle/working/balanced_air_quality_data.csv',
)

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.impute import SimpleImputer
# import warnings
# warnings.filterwarnings('ignore')

# def preprocess_air_quality_data(file_path, num_samples=50000, sequence_length=100):
#     """
#     Advanced preprocessing for air quality data with focus on PM2.5 prediction
#     """
#     # Load and initialize data
#     df = pd.read_csv(file_path, low_memory=False)  # Added low_memory=False to resolve mixed types warning
#     df.drop(columns=['StationId'], inplace=True)
# #     df = df[:num_samples]
    
#     # Handle missing values
#     numeric_columns = df.select_dtypes(include=['float64']).columns
#     imputer = SimpleImputer(strategy='mean')
#     df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    
#     # Process datetime features
#     df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed', errors='coerce')

    
#     # Extract time-based features
#     df['Year'] = df['Datetime'].dt.year
#     df['Month'] = df['Datetime'].dt.month
#     df['Day'] = df['Datetime'].dt.day
#     df['Hour'] = df['Datetime'].dt.hour
#     df['DayOfWeek'] = df['Datetime'].dt.dayofweek
#     df['IsWeekend'] = (df['Datetime'].dt.dayofweek >= 5).astype(int)
    
#     # Add seasonal features
#     df['Season'] = df['Month'].map(lambda x: 1 if x in [12,1,2] else 2 if x in [3,4,5] else 3 if x in [6,7,8] else 4)
    
#     # Calculate rolling averages for PM2.5
#     df['PM2.5_Rolling_Mean_3h'] = df['PM2.5'].rolling(window=3, min_periods=1).mean()
#     df['PM2.5_Rolling_Mean_24h'] = df['PM2.5'].rolling(window=24, min_periods=1).mean()
    
#     # Create PM2.5 categories using the US EPA AQI breakpoints (these are not WHO guideline values)
#     bins = [0, 12, 35.4, 55.4, 150.4, 250.4, float('inf')]
#     labels = ['Good', 'Moderate', 'Unhealthy for Sensitive', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
#     df['PM2.5_Category'] = pd.cut(df['PM2.5'], bins=bins, labels=labels)
    
#     # Drop datetime column
#     df.drop(columns=['Datetime'], inplace=True)
    
#     # Balance classes using stratified sampling with modified groupby
#     min_class_size = df['PM2.5_Category'].value_counts().min()
    
#     # Modified groupby to resolve deprecation warnings
#     balanced_dfs = []
#     for category in df['PM2.5_Category'].unique():
#         category_df = df[df['PM2.5_Category'] == category]
#         if len(category_df) > min_class_size:
#             balanced_dfs.append(category_df.sample(min_class_size))
#         else:
#             balanced_dfs.append(category_df)
    
#     df_balanced = pd.concat(balanced_dfs, ignore_index=True)
    
#     # Encode categorical variables
#     label_encoder = LabelEncoder()
#     df_balanced['PM2.5_Category_Encoded'] = label_encoder.fit_transform(df_balanced['PM2.5_Category'])
    
#     # Scale features
#     scaler = MinMaxScaler()
#     columns_to_scale = df_balanced.select_dtypes(include=['float64', 'int64']).columns
#     df_balanced_scaled = df_balanced.copy()
#     df_balanced_scaled[columns_to_scale] = scaler.fit_transform(df_balanced[columns_to_scale])
    
#     # Prepare sequences for LSTM
#     def create_sequences(data, seq_length):
#         sequences = []
#         targets = []
        
#         for i in range(len(data) - seq_length):
#             # Get sequence of features
#             sequence = data.iloc[i:(i + seq_length)]
            
#             # Get target (PM2.5 value at the next timestep)
#             target = data.iloc[i + seq_length]['PM2.5']
            
#             sequences.append(sequence.values)
#             targets.append(target)
            
#         return np.array(sequences), np.array(targets)
    
#     # Create sequences and targets
#     X, y = create_sequences(df_balanced_scaled, sequence_length)
    
#     # Split into training and testing sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.2, random_state=42
#     )
    
#     # Create metadata dictionary
#     metadata = {
#         'feature_names': df_balanced_scaled.columns.tolist(),
#         'pm25_categories': list(zip(label_encoder.classes_, range(len(label_encoder.classes_)))),
#         'scaler': scaler,
#         'label_encoder': label_encoder
#     }
    
#     return {
#         'X_train': X_train,
#         'X_test': X_test,
#         'y_train': y_train,
#         'y_test': y_test,
#         'metadata': metadata
#     }

# try:
#     results1 = preprocess_air_quality_data(
#         file_path='/kaggle/input/new-data/new_data.csv',
#         num_samples=1048570,
#         sequence_length=100
#     )
    
#     results2 = preprocess_air_quality_data(
#         file_path='/kaggle/input/all-other-set/second_set.csv',
#         num_samples=1048570,
#         sequence_length=100
#     )
    
#     results3 = preprocess_air_quality_data(
#         file_path='/kaggle/input/all-other-set/third_set.csv',
#         num_samples=500000,
#         sequence_length=100
#     )
    
#     # Combine results using numpy.concatenate
#     results = {
#         'X_train': np.concatenate([results1['X_train'], results2['X_train'], results3['X_train']], axis=0),
#         'X_test': np.concatenate([results1['X_test'], results2['X_test'], results3['X_test']], axis=0),
#         'y_train': np.concatenate([results1['y_train'], results2['y_train'], results3['y_train']], axis=0),
#         'y_test': np.concatenate([results1['y_test'], results2['y_test'], results3['y_test']], axis=0),
#         'metadata': results1['metadata']  # Metadata can remain the same as they are shared across runs
#     }
    
#     print("Training data shape:", results['X_train'].shape)
#     print("Testing data shape:", results['X_test'].shape)
#     print("Training targets shape:", results['y_train'].shape)
#     print("Testing targets shape:", results['y_test'].shape)
#     print("\nFeature names:", results['metadata']['feature_names'])
#     print("\nPM2.5 categories:", results['metadata']['pm25_categories'])

# except Exception as e:
#     print(f"Error occurred: {str(e)}")