In [3]:
import pandas as pd

file_path = './data/raw/USGS.csv'

# Peek at the file before committing to a full load: parse only the first
# 10,000 rows, using a tab as the field separator.
df_sample = pd.read_csv(
    file_path,
    sep='\t',
    nrows=10_000,
)

# Preview the first 5 rows of the sample
print("--- First 5 Rows ---", df_sample.head(5), sep="\n")

# List every column name present in the sample
print("\n--- Column Names ---", df_sample.columns, sep="\n")

# Summarize dtypes, non-null counts and memory usage.
# info() writes its report itself, so it is not wrapped in print().
print("\n--- Data Info ---")
df_sample.info()

--- First 5 Rows ---
       gbifID                            datasetKey occurrenceID   kingdom  \
0  3351232530  d8cd16ba-bb74-4420-821e-083f2bac17c2     KX834062  Animalia   
1  3350546437  d8cd16ba-bb74-4420-821e-083f2bac17c2     KP747142  Animalia   
2  3350523583  d8cd16ba-bb74-4420-821e-083f2bac17c2     KM537872  Animalia   
3  3346653217  d8cd16ba-bb74-4420-821e-083f2bac17c2     GU707474  Animalia   
4  3346673384  d8cd16ba-bb74-4420-821e-083f2bac17c2     GU707475  Animalia   

       phylum    class        order  family       genus  \
0  Arthropoda  Insecta  Hymenoptera  Apidae  Anthophora   
1  Arthropoda  Insecta  Hymenoptera  Apidae    Ceratina   
2  Arthropoda  Insecta  Hymenoptera  Apidae  Triepeolus   
3  Arthropoda  Insecta  Hymenoptera  Apidae    Ceratina   
4  Arthropoda  Insecta  Hymenoptera  Apidae    Ceratina   

                 species  ...        identifiedBy dateIdentified    license  \
0   Anthophora villosula  ...           J. Straka            NaN  CC_BY_4_0 

In [4]:
# Columns needed for the analysis; everything else in the file is skipped.
# Note: usecols only selects columns -- the resulting DataFrame keeps the
# column order of the file, not the order of this list.
columns_to_keep = [
    # identifier
    'gbifID',
    # taxonomy
    'kingdom', 'family', 'genus', 'species', 'scientificName',
    # location
    'decimalLatitude', 'decimalLongitude', 'stateProvince',
    # time
    'eventDate', 'year', 'month', 'day',
]

# Stream the large file in 100,000-row chunks and stitch the chunks back
# together into one DataFrame. The reader is used as a context manager so
# the underlying file handle is released as soon as loading finishes.
with pd.read_csv(
    file_path,
    sep='\t',
    usecols=columns_to_keep,
    dtype={'gbifID': 'int64', 'species': 'str'},
    chunksize=100000,
) as chunk_iterator:
    df_bees = pd.concat(chunk_iterator)

# Report dtypes / non-null counts of the reduced DataFrame
print("--- Cleaned Data Info ---")
df_bees.info()

# Preview the first 5 rows of the reduced DataFrame
print("\n--- First 5 Rows of Cleaned Data ---", df_bees.head(), sep="\n")

--- Cleaned Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611266 entries, 0 to 611265
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   gbifID            611266 non-null  int64  
 1   kingdom           611266 non-null  object 
 2   family            611266 non-null  object 
 3   genus             607642 non-null  object 
 4   species           557705 non-null  object 
 5   scientificName    611266 non-null  object 
 6   stateProvince     606538 non-null  object 
 7   decimalLatitude   611266 non-null  float64
 8   decimalLongitude  611266 non-null  float64
 9   eventDate         611266 non-null  object 
 10  day               427828 non-null  float64
 11  month             521065 non-null  float64
 12  year              611266 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 60.6+ MB

--- First 5 Rows of Cleaned Data ---
       gbifID   kingdom  family       genu

In [6]:
# Clean the records in a single pipeline; each step returns a new frame:
#   1. drop rows that lack a state or either coordinate
#   2. parse eventDate into datetimes -- errors='coerce' turns every value
#      that cannot be parsed into NaT instead of raising
#   3. drop the rows whose eventDate became NaT in step 2
#   4. keep only records whose 'year' column is 2000 or later
df_bees = (
    df_bees
    .dropna(subset=['stateProvince', 'decimalLatitude', 'decimalLongitude'])
    .assign(eventDate=lambda frame: pd.to_datetime(frame['eventDate'], errors='coerce'))
    .dropna(subset=['eventDate'])
    .loc[lambda frame: frame['year'] >= 2000]
    .copy()
)

# Final check: structure, a small preview, and the distinct state values
print("--- Info After Cleaning ---")
df_bees.info()

print("\n--- First 5 Rows of Final Cleaned Data ---", df_bees.head(), sep="\n")

print("\n--- Unique States Found ---", df_bees['stateProvince'].unique(), sep="\n")

--- Info After Cleaning ---
<class 'pandas.core.frame.DataFrame'>
Index: 318498 entries, 28 to 611214
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   gbifID            318498 non-null  int64         
 1   kingdom           318498 non-null  object        
 2   family            318498 non-null  object        
 3   genus             315520 non-null  object        
 4   species           284150 non-null  object        
 5   scientificName    318498 non-null  object        
 6   stateProvince     318498 non-null  object        
 7   decimalLatitude   318498 non-null  float64       
 8   decimalLongitude  318498 non-null  float64       
 9   eventDate         318498 non-null  datetime64[ns]
 10  day               318498 non-null  float64       
 11  month             318498 non-null  float64       
 12  year              318498 non-null  int64         
dtypes: datetime64[ns](1), float64(4), i

In [7]:
# Correct Taxonomy
# Remove the obvious non-bee records like 'Crustacea'.
# Rows with a missing genus compare as "not equal" and are therefore kept.
df_bees = df_bees[df_bees['genus'] != 'Crustacea'].copy()


# Standardize State Names
# Step 1: title-case the whole column first. This fixes pure capitalization
# problems ('colorado', 'COlorado', 'MIssissippi', 'New hampshire', ...)
# generically, so the mapping below only has to list real misspellings and
# abbreviations, written in their title-cased form.
df_bees['stateProvince'] = df_bees['stateProvince'].str.title()

# Step 2: map misspellings / abbreviations to the canonical name.
# This must run AFTER title-casing: str.title() capitalizes every word, so
# applying it last would turn 'District of Columbia' back into
# 'District Of Columbia'.
state_mapping = {
    'Colroado': 'Colorado', 'Coloraod': 'Colorado', 'Coloraodo': 'Colorado', 'Coorado': 'Colorado',
    'South Dacota': 'South Dakota', 'South Daokta': 'South Dakota',
    'Vt': 'Vermont',
    'Il': 'Illinois',
    'Doc': 'District of Columbia', 'District Of Columbia': 'District of Columbia'
}
df_bees['stateProvince'] = df_bees['stateProvince'].replace(state_mapping)

# Remove any rows with invalid state entries.
# 'Nan' is what a literal 'nan' string looks like after title-casing.
invalid_states = ['[Not Stated]', 'Nan']
# .copy() makes the result an independent DataFrame, so later column
# assignments on df_bees do not raise SettingWithCopyWarning.
df_bees = df_bees[~df_bees['stateProvince'].isin(invalid_states)].copy()

# Final Check
print("--- Info After Final Standardization ---")
df_bees.info()

print("\n--- Unique States After Standardization ---")
print(sorted(df_bees['stateProvince'].unique()))

--- Info After Final Standardization ---
<class 'pandas.core.frame.DataFrame'>
Index: 318457 entries, 641 to 611214
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   gbifID            318457 non-null  int64         
 1   kingdom           318457 non-null  object        
 2   family            318457 non-null  object        
 3   genus             315479 non-null  object        
 4   species           284149 non-null  object        
 5   scientificName    318457 non-null  object        
 6   stateProvince     318457 non-null  object        
 7   decimalLatitude   318457 non-null  float64       
 8   decimalLongitude  318457 non-null  float64       
 9   eventDate         318457 non-null  datetime64[ns]
 10  day               318457 non-null  float64       
 11  month             318457 non-null  float64       
 12  year              318457 non-null  int64         
dtypes: datetime64[ns](1),

In [8]:
# Correct the misspelling 'Tennesee' in the state column. assign() returns a
# new DataFrame with the column replaced in its original position.
df_bees = df_bees.assign(
    stateProvince=df_bees['stateProvince'].replace('Tennesee', 'Tennessee')
)

# Persist the cleaned records as a CSV in the working directory;
# index=False keeps the DataFrame index out of the file.
df_bees.to_csv('cleaned_usgs_bee_data.csv', index=False)

print(
    "--- Preprocessing Complete! ---",
    "Cleaned data has been saved to 'cleaned_usgs_bee_data.csv'",
    sep="\n",
)

--- Preprocessing Complete! ---
Cleaned data has been saved to 'cleaned_usgs_bee_data.csv'
