#### Rex Gayas DSC350-T301 Data Wrangling for Data Scienc (2243-1)
#### Term Project Milestone 4 18 FEB 2024

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage to scrape
url = 'https://waterdata.usgs.gov/nwis/uv?cb_00010=on&cb_00010=on&cb_00095=on&cb_00095=on&cb_00300=on&cb_00300=on&cb_00301=on&cb_00400=on&cb_00400=on&cb_62619=on&cb_62620=on&cb_63680=on&cb_63680=on&cb_90860=on&format=html&site_no=01374019&legacy=1&period=&begin_date=2024-02-12&end_date=2024-02-19'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table with the data using class selectors
data_table = soup.find('table', class_='tablesorter')

# Extract the headers
headers = [header.get_text() for header in data_table.find_all('th')]

# Extract data from each row of the table
data = []
for row in data_table.find_all('tr'):
    cols = row.find_all('td')
    cols_text = [col.get_text() for col in cols]
    if cols_text:
        data.append(cols_text)

# Convert the list of data into a pandas DataFrame
df = pd.DataFrame(data, columns=headers)

# Print the cleaned DataFrame
print(df)

               Date / Time\n      Dis- solvedoxygen,  mg/L,[HRECOS]  \
0     02/12/2024 00:00   EST                                13.8P     
1     02/12/2024 00:15   EST                                13.8P     
2     02/12/2024 00:30   EST                                13.7P     
3     02/12/2024 00:45   EST                                13.7P     
4     02/12/2024 01:00   EST                                13.7P     
..                       ...                                    ...   
750   02/19/2024 19:30   EST                                13.6P     
751   02/19/2024 19:45   EST                                13.6P     
752   02/19/2024 20:00   EST                                13.6P     
753   02/19/2024 20:15   EST                                13.6P     
754   02/19/2024 20:30   EST                                13.6P     

    Estuary   or ocean elev- ation, NAVD,  feet,[HRECOS]  \
0                                             -4.23P       
1                          

##### Replace Headers

In [23]:
df.columns = ['Date and Time', 'Dissolved Oxygen (mg/L)', 'Elevation (ft)', 'Water Temperature (°C)', 'Conductance (µS/cm)', 'Turbidity (FNU)', 'pH Level', 'Oxygen Saturation (%)', 'Salinity (psu)']
print(df)

               Date and Time Dissolved Oxygen (mg/L) Elevation (ft)  \
0     02/12/2024 00:00   EST                 13.8P         -4.23P     
1     02/12/2024 00:15   EST                 13.8P         -4.27P     
2     02/12/2024 00:30   EST                 13.7P         -4.39P     
3     02/12/2024 00:45   EST                 13.7P         -4.50P     
4     02/12/2024 01:00   EST                 13.7P         -4.62P     
..                       ...                     ...            ...   
750   02/19/2024 19:30   EST                 13.6P                    
751   02/19/2024 19:45   EST                 13.6P                    
752   02/19/2024 20:00   EST                 13.6P                    
753   02/19/2024 20:15   EST                 13.6P                    
754   02/19/2024 20:30   EST                 13.6P                    

    Water Temperature (°C) Conductance (µS/cm) Turbidity (FNU) pH Level  \
0                   2.2P                474P           76.2P     8.1P   

##### Format Data Into a More Readable Format

In [24]:
import re

# Define a function to clean the numeric values
def clean_numeric(value):
    if isinstance(value, str):
        # Remove all non-numeric characters except the decimal point
        cleaned_value = re.sub(r'[^\d.]+', '', value)
        return float(cleaned_value) if cleaned_value else None
    return value

# List of columns that need to be cleaned
columns_to_clean = [
    'Dissolved Oxygen (mg/L)', 'Elevation (ft)', 'Water Temperature (°C)',
    'Conductance (µS/cm)', 'Turbidity (FNU)', 'pH Level',
    'Oxygen Saturation (%)', 'Salinity (psu)'
]

# Apply the cleaning function to each column
for column in columns_to_clean:
    df[column] = df[column].apply(clean_numeric)

# Check the first few rows of the dataframe to verify the changes
print(df[columns_to_clean].head())


   Dissolved Oxygen (mg/L)  Elevation (ft)  Water Temperature (°C)  \
0                     13.8            4.23                     2.2   
1                     13.8            4.27                     2.2   
2                     13.7            4.39                     2.2   
3                     13.7            4.50                     2.2   
4                     13.7            4.62                     2.2   

   Conductance (µS/cm)  Turbidity (FNU)  pH Level  Oxygen Saturation (%)  \
0                474.0             76.2       8.1                  100.0   
1                496.0             65.2       8.1                  100.0   
2                535.0             82.2       8.0                  100.0   
3                596.0             84.0       8.0                  100.0   
4                574.0             66.1       8.0                  100.0   

   Salinity (psu)  
0             0.2  
1             0.2  
2             0.3  
3             0.3  
4             0.3  


##### Identify Outliers and Bad Data

In [26]:
# Define a function to detect outliers in a column
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)].copy()

# List of numerical columns to check for outliers
numerical_columns = [
    'Dissolved Oxygen (mg/L)', 'Elevation (ft)', 'Water Temperature (°C)',
    'Conductance (µS/cm)', 'Turbidity (FNU)', 'pH Level',
    'Oxygen Saturation (%)', 'Salinity (psu)'
]

# Create an empty DataFrame to store outliers
outliers_df = pd.DataFrame()

# Loop through each numerical column to detect outliers
for column in numerical_columns:
    current_outliers = detect_outliers(df, column)
    # Add a column to distinguish outliers from different features
    current_outliers['Outlier_In'] = column
    # Append to the outliers DataFrame
    outliers_df = pd.concat([outliers_df, current_outliers], ignore_index=True)

# Display the outliers DataFrame
print("Detected outliers:")
print(outliers_df)


Detected outliers:
               Date and Time  Dissolved Oxygen (mg/L)  Elevation (ft)  \
0     02/12/2024 00:00   EST                     13.8            4.23   
1     02/12/2024 00:15   EST                     13.8            4.27   
2     02/12/2024 00:30   EST                     13.7            4.39   
3     02/12/2024 00:45   EST                     13.7            4.50   
4     02/12/2024 01:00   EST                     13.7            4.62   
..                       ...                      ...             ...   
394   02/12/2024 16:15   EST                     13.7            0.96   
395   02/12/2024 16:30   EST                     13.7            0.72   
396   02/12/2024 16:45   EST                     13.7            0.38   
397   02/12/2024 17:00   EST                     13.7            0.05   
398   02/12/2024 17:15   EST                     13.7            0.28   

     Water Temperature (°C)  Conductance (µS/cm)  Turbidity (FNU)  pH Level  \
0                       2

##### Find Duplicates

In [27]:
# Find and display any duplicate rows
duplicate_rows = df[df.duplicated()]
print("Duplicate rows:")
print(duplicate_rows)

# Remove the duplicate rows
df = df.drop_duplicates()

# Reset the index of the DataFrame after removing duplicates
df.reset_index(drop=True, inplace=True)

# Display the shape of the DataFrame after duplicates have been removed
print(f"DataFrame shape after removing duplicates: {df.shape}")


Duplicate rows:
Empty DataFrame
Columns: [Date and Time, Dissolved Oxygen (mg/L), Elevation (ft), Water Temperature (°C), Conductance (µS/cm), Turbidity (FNU), pH Level, Oxygen Saturation (%), Salinity (psu)]
Index: []
DataFrame shape after removing duplicates: (755, 9)


No duplicates found.

##### Add Data

In [28]:
# Define a threshold for Dissolved Oxygen below which the water is considered contaminated
contamination_threshold = 5  

# Add a new column 'Contamination_Flag' where 1 indicates contamination and 0 indicates no contamination
df['Contamination_Flag'] = (df['Dissolved Oxygen (mg/L)'] < contamination_threshold).astype(int)

# Verify the new column
print(df[['Dissolved Oxygen (mg/L)', 'Contamination_Flag']].head())


   Dissolved Oxygen (mg/L)  Contamination_Flag
0                     13.8                   0
1                     13.8                   0
2                     13.7                   0
3                     13.7                   0
4                     13.7                   0
