In [2]:
# Describing the data and handling of duplicate entries



import pandas as pd

# Purpose of this cell: load the raw workbook, summarise it, report exact
# duplicate rows, drop them (keeping the first occurrence of each), and save
# the de-duplicated data for the next stage.
import io  # local to this cell: used to capture the text DataFrame.info() writes

# Load the Excel file
df = pd.read_excel('1) original_ data_excel_ v1.xlsx')

# 1. Get the shape of the data (before de-duplication)
shape = df.shape

# 2. Get general info about the data.
# DataFrame.info() writes its report to a buffer and returns None, so the text
# is captured explicitly; assigning the return value would store None.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
info = info_buffer.getvalue()

# 3. Get description of the data (numeric summary)
description = df.describe()

# 4. Get characteristics and data types (all columns, one row per column)
characteristics = df.describe(include='all').transpose()

# 5. Detect duplicate rows and count corresponding duplicates.
# keep=False marks every member of a duplicate set, including the first one.
duplicate_sets = df[df.duplicated(keep=False)]
# dropna=False is required: by default groupby discards any row whose key
# contains NaN, which would silently omit duplicate sets with missing values.
duplicate_sets_count = (
    duplicate_sets
    .groupby(duplicate_sets.columns.tolist(), dropna=False)
    .size()
    .reset_index(name='Count')
)

# 6. Remove duplicate rows and count the number of rows actually deleted.
# drop_duplicates() keeps the first row of each set, so the number removed is
# the difference in row counts, not the size of `duplicate_sets`.
rows_before = len(df)
df = df.drop_duplicates()
num_deleted_rows = rows_before - len(df)

# 7. Get count of duplicate row sets
duplicate_sets_total_count = len(duplicate_sets_count)

# 8. Save the modified data
df.to_excel('2) data_after_duplicate_v2.xlsx', index=False)

# Print relevant information
print("1. Shape of the data:", shape)
print("\n2. General info about the data:")
print(info)
print("\n3. Description of the data:")
print(description)
print("\n4. Characteristics and Data Types:")
print(characteristics)
print("\n5. Count of each corresponding duplicate row set:")
print(duplicate_sets_count)
print("\n6. Total count of duplicate row sets:", duplicate_sets_total_count)
print("\n7. The total number of duplicate rows deleted =", num_deleted_rows)
print("\n8. Modified data saved as '2) data_after_duplicate_v2.xlsx'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   state                        435742 non-null  object        
 1   location                     435739 non-null  object        
 2   location_monitoring_station  408251 non-null  object        
 3   agency                       286261 non-null  object        
 4   type                         430349 non-null  object        
 5   so2                          401096 non-null  float64       
 6   no2                          419509 non-null  float64       
 7   rspm                         395520 non-null  float64       
 8   spm                          198355 non-null  float64       
 9   pm2_5                        9314 non-null    float64       
 10  date                         435735 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(5), object(5)

In [12]:
# Handling of NaN/Null/Missing values for categorical columns



import pandas as pd

# Stage input: the de-duplicated workbook written by the previous step
df = pd.read_excel('2) data_after_duplicate_v2.xlsx')

# Columns pandas loaded with object dtype are treated as categorical
categorical_columns = df.select_dtypes(include=['object']).columns

# Fill gaps from the previous row first, then from the next row, so that
# leading NaNs (which have no previous value) are also filled
categorical_filled = df[categorical_columns].ffill().bfill()
df[categorical_columns] = categorical_filled

# Stage output: persist the result without the index column
df.to_excel('3) data_after_fillna_v3.xlsx', index=False)

print("Data after filling NaN values saved as '3) data_after_fillna_v3.xlsx'")


Data after filling NaN values saved as '3) data_after_fillna_v3.xlsx'


In [17]:
# Handling of NaN/Null/Missing values for numerical columns



import pandas as pd

# Read the workbook produced by the categorical fill step
source_path = '3) data_after_fillna_v3.xlsx'
target_path = '4) data_after_bfill_ffill_v4.xlsx'
df = pd.read_excel(source_path)

# Pick out every numeric column
numerical_columns = df.select_dtypes(include=['number']).columns

# Forward fill, then backward fill the remainder, with no cap on the number
# of consecutive NaNs filled (the default when no limit is given)
df[numerical_columns] = df[numerical_columns].ffill().bfill()

# Write the filled data out, omitting the index
df.to_excel(target_path, index=False)

print("Data after filling NaN values for numerical columns saved as '4) data_after_bfill_ffill_v4.xlsx'")


Data after filling NaN values for numerical columns saved as '4) data_after_bfill_ffill_v4.xlsx'


In [None]:
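# Calculate AQI (TODO: not implemented in this cell; the following cell loads 'AQIdata.xlsx', presumably produced by an AQI step outside this notebook - confirm)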

In [2]:
# Handling of NaN/Null/Missing values for categorical columns



import pandas as pd

# Read the workbook that this stage starts from
df = pd.read_excel('AQIdata.xlsx')

# Object-dtype columns are the ones handled as categorical here
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply the two fill directions in order: forward first, then backward for
# whatever is still missing
for fill_step in (pd.DataFrame.ffill, pd.DataFrame.bfill):
    df[categorical_columns] = fill_step(df[categorical_columns])

# Persist the filled data without the index column
df.to_excel('6) data_after_fillna_v6_final.xlsx', index=False)

print("Data after filling NaN values saved as '6) data_after_fillna_v6_final.xlsx'")


Data after filling NaN values saved as '6) data_after_fillna_v6_final.xlsx'


In [4]:
# Handling of NaN/Null/Missing values for numerical columns



import pandas as pd

# Read the output of the preceding categorical fill step
df = pd.read_excel('6) data_after_fillna_v6_final.xlsx')

# Every column with a numeric dtype
numerical_columns = df.select_dtypes(include=['number']).columns

# Forward fill into a temporary frame, then backward fill what is left and
# write the result back into the numeric columns (no limit on gap length)
numeric_part = df[numerical_columns]
forward_filled = numeric_part.ffill()
df[numerical_columns] = forward_filled.bfill()

# Save the fully filled data set, leaving out the index
df.to_excel('data_final.xlsx', index=False)

print("Data after filling NaN values for numerical columns saved as 'data_final.xlsx'")


Data after filling NaN values for numerical columns saved as 'data_final.xlsx'
