#### Rex Gayas DSC350-T301 Data Wrangling for Data Scienc (2243-1)
#### Term Project Milestone 04 FEB 2024

In [27]:
import requests

# Define the API endpoint with New York as the city
api_url = "https://api.waqi.info/feed/newyork/?token=95481362141897eac4f8263ff574654eccfed309"

# Make a GET request to fetch the data
response = requests.get(api_url)

# Parse the JSON response
data = response.json()

# Check the status of the request
if data['status'] == 'ok':
    print("Data fetched successfully!")
    # Print a summary of the data
    city = data['data']['city']['name']
    aqi = data['data']['aqi']
    print(f"City: {city}, AQI: {aqi}")
else:
    print("Failed to fetch data")


Data fetched successfully!
City: New York, AQI: 32


##### Simulate Data Retrieval Over Time

In [29]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Simulate data retrieval over the past week, with two data points per day
dates = pd.date_range(end=datetime.now(), periods=14, freq='12H')
aqi_values = np.random.choice(range(10, 150), size=14)  # Simulated AQI values within a broad range
cities = ["New York"] * 14  # Simulating data for New York

# Create a DataFrame with the simulated data
df = pd.DataFrame({
    'DateTime': dates,
    'AirQualityIndex': aqi_values,
    'City': cities
})
print(df)

                     DateTime  AirQualityIndex      City
0  2024-01-29 13:21:29.771351               61  New York
1  2024-01-30 01:21:29.771351               86  New York
2  2024-01-30 13:21:29.771351               39  New York
3  2024-01-31 01:21:29.771351              120  New York
4  2024-01-31 13:21:29.771351               32  New York
5  2024-02-01 01:21:29.771351               82  New York
6  2024-02-01 13:21:29.771351              122  New York
7  2024-02-02 01:21:29.771351               35  New York
8  2024-02-02 13:21:29.771351              124  New York
9  2024-02-03 01:21:29.771351               40  New York
10 2024-02-03 13:21:29.771351              140  New York
11 2024-02-04 01:21:29.771351               65  New York
12 2024-02-04 13:21:29.771351               47  New York
13 2024-02-05 01:21:29.771351              125  New York


##### Introduce Simulated Errors and Inconsistencies

In [30]:
# Introduce some simulated errors/inconsistencies in the data
df.loc[5, 'AirQualityIndex'] = -20  # Impossible negative AQI value
df.loc[7, 'City'] = "new york"  # Inconsistent casing
df.loc[8, 'DateTime'] = "2024-25-02 12:00:00"  # Impossible date, to be cleaned later
print(df)

                      DateTime  AirQualityIndex      City
0   2024-01-29 13:21:29.771351               61  New York
1   2024-01-30 01:21:29.771351               86  New York
2   2024-01-30 13:21:29.771351               39  New York
3   2024-01-31 01:21:29.771351              120  New York
4   2024-01-31 13:21:29.771351               32  New York
5   2024-02-01 01:21:29.771351              -20  New York
6   2024-02-01 13:21:29.771351              122  New York
7   2024-02-02 01:21:29.771351               35  new york
8          2024-25-02 12:00:00              124  New York
9   2024-02-03 01:21:29.771351               40  New York
10  2024-02-03 13:21:29.771351              140  New York
11  2024-02-04 01:21:29.771351               65  New York
12  2024-02-04 13:21:29.771351               47  New York
13  2024-02-05 01:21:29.771351              125  New York


  df.loc[8, 'DateTime'] = "2024-25-02 12:00:00"  # Impossible date, to be cleaned later


##### Replace Headers

In [31]:
# Replace Headers
df.columns = ['SampleDateTime', 'AQI_Value', 'Location']
print(df)

                SampleDateTime  AQI_Value  Location
0   2024-01-29 13:21:29.771351         61  New York
1   2024-01-30 01:21:29.771351         86  New York
2   2024-01-30 13:21:29.771351         39  New York
3   2024-01-31 01:21:29.771351        120  New York
4   2024-01-31 13:21:29.771351         32  New York
5   2024-02-01 01:21:29.771351        -20  New York
6   2024-02-01 13:21:29.771351        122  New York
7   2024-02-02 01:21:29.771351         35  new york
8          2024-25-02 12:00:00        124  New York
9   2024-02-03 01:21:29.771351         40  New York
10  2024-02-03 13:21:29.771351        140  New York
11  2024-02-04 01:21:29.771351         65  New York
12  2024-02-04 13:21:29.771351         47  New York
13  2024-02-05 01:21:29.771351        125  New York


##### Format Data into a More Readable Format

In [32]:
# Correct the DateTime format and convert all entries to actual datetime objects
df['SampleDateTime'] = pd.to_datetime(df['SampleDateTime'], errors='coerce')  # This will convert errors to NaT

# Make sure the AQI_Value column is numeric and handle any non-numeric entries by converting them to NaN
df['AQI_Value'] = pd.to_numeric(df['AQI_Value'], errors='coerce')
print(df)

               SampleDateTime  AQI_Value  Location
0  2024-01-29 13:21:29.771351         61  New York
1  2024-01-30 01:21:29.771351         86  New York
2  2024-01-30 13:21:29.771351         39  New York
3  2024-01-31 01:21:29.771351        120  New York
4  2024-01-31 13:21:29.771351         32  New York
5  2024-02-01 01:21:29.771351        -20  New York
6  2024-02-01 13:21:29.771351        122  New York
7  2024-02-02 01:21:29.771351         35  new york
8                         NaT        124  New York
9  2024-02-03 01:21:29.771351         40  New York
10 2024-02-03 13:21:29.771351        140  New York
11 2024-02-04 01:21:29.771351         65  New York
12 2024-02-04 13:21:29.771351         47  New York
13 2024-02-05 01:21:29.771351        125  New York


##### Identify Outliers and Bad Data

In [33]:
# Remove rows with negative AQI values which are considered bad data
df = df[df['AQI_Value'] >= 0]
print(df)

               SampleDateTime  AQI_Value  Location
0  2024-01-29 13:21:29.771351         61  New York
1  2024-01-30 01:21:29.771351         86  New York
2  2024-01-30 13:21:29.771351         39  New York
3  2024-01-31 01:21:29.771351        120  New York
4  2024-01-31 13:21:29.771351         32  New York
6  2024-02-01 13:21:29.771351        122  New York
7  2024-02-02 01:21:29.771351         35  new york
8                         NaT        124  New York
9  2024-02-03 01:21:29.771351         40  New York
10 2024-02-03 13:21:29.771351        140  New York
11 2024-02-04 01:21:29.771351         65  New York
12 2024-02-04 13:21:29.771351         47  New York
13 2024-02-05 01:21:29.771351        125  New York


##### Find Duplicates

In [34]:
df.drop_duplicates(inplace=True)
print(df)

               SampleDateTime  AQI_Value  Location
0  2024-01-29 13:21:29.771351         61  New York
1  2024-01-30 01:21:29.771351         86  New York
2  2024-01-30 13:21:29.771351         39  New York
3  2024-01-31 01:21:29.771351        120  New York
4  2024-01-31 13:21:29.771351         32  New York
6  2024-02-01 13:21:29.771351        122  New York
7  2024-02-02 01:21:29.771351         35  new york
8                         NaT        124  New York
9  2024-02-03 01:21:29.771351         40  New York
10 2024-02-03 13:21:29.771351        140  New York
11 2024-02-04 01:21:29.771351         65  New York
12 2024-02-04 13:21:29.771351         47  New York
13 2024-02-05 01:21:29.771351        125  New York


##### Fix Casing or Inconsistent Values

In [35]:
df['Location'] = df['Location'].str.title()  # Convert city names to title case
print(df)

               SampleDateTime  AQI_Value  Location
0  2024-01-29 13:21:29.771351         61  New York
1  2024-01-30 01:21:29.771351         86  New York
2  2024-01-30 13:21:29.771351         39  New York
3  2024-01-31 01:21:29.771351        120  New York
4  2024-01-31 13:21:29.771351         32  New York
6  2024-02-01 13:21:29.771351        122  New York
7  2024-02-02 01:21:29.771351         35  New York
8                         NaT        124  New York
9  2024-02-03 01:21:29.771351         40  New York
10 2024-02-03 13:21:29.771351        140  New York
11 2024-02-04 01:21:29.771351         65  New York
12 2024-02-04 13:21:29.771351         47  New York
13 2024-02-05 01:21:29.771351        125  New York


##### Conduct Fuzzy Matching

In [36]:
from fuzzywuzzy import process

cities = ["New York", "Los Angeles", "Chicago"]
df['Location'] = df['Location'].apply(lambda x: process.extractOne(x, cities)[0])
print(df)

               SampleDateTime  AQI_Value  Location
0  2024-01-29 13:21:29.771351         61  New York
1  2024-01-30 01:21:29.771351         86  New York
2  2024-01-30 13:21:29.771351         39  New York
3  2024-01-31 01:21:29.771351        120  New York
4  2024-01-31 13:21:29.771351         32  New York
6  2024-02-01 13:21:29.771351        122  New York
7  2024-02-02 01:21:29.771351         35  New York
8                         NaT        124  New York
9  2024-02-03 01:21:29.771351         40  New York
10 2024-02-03 13:21:29.771351        140  New York
11 2024-02-04 01:21:29.771351         65  New York
12 2024-02-04 13:21:29.771351         47  New York
13 2024-02-05 01:21:29.771351        125  New York


##### Add Data 

In [37]:
def categorize_aqi(aqi):
    if aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi <= 200:
        return 'Unhealthy'
    elif aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

df['AQI_Category'] = df['AQI_Value'].apply(categorize_aqi)
print(df)

               SampleDateTime  AQI_Value  Location  \
0  2024-01-29 13:21:29.771351         61  New York   
1  2024-01-30 01:21:29.771351         86  New York   
2  2024-01-30 13:21:29.771351         39  New York   
3  2024-01-31 01:21:29.771351        120  New York   
4  2024-01-31 13:21:29.771351         32  New York   
6  2024-02-01 13:21:29.771351        122  New York   
7  2024-02-02 01:21:29.771351         35  New York   
8                         NaT        124  New York   
9  2024-02-03 01:21:29.771351         40  New York   
10 2024-02-03 13:21:29.771351        140  New York   
11 2024-02-04 01:21:29.771351         65  New York   
12 2024-02-04 13:21:29.771351         47  New York   
13 2024-02-05 01:21:29.771351        125  New York   

                      AQI_Category  
0                         Moderate  
1                         Moderate  
2                             Good  
3   Unhealthy for Sensitive Groups  
4                             Good  
6   Unhealthy for Sen