In [None]:
%load_ext autoreload
%autoreload 2

Merged 6 CSV files into 'merged.csv'


In [1]:
import pandas as pd

df=pd.read_csv('data/example.csv')

In [3]:
df.dropna(subset=['post_text', 'Image'], inplace=True)


In [None]:
# import os
# import urllib.request

# folder = "Butterfly_images"
# os.makedirs(folder, exist_ok=True)

# for post_id, url in zip(df['PostID'], df['Image']):
#     file_path = os.path.join(folder, f"{post_id}.jpg")
#     urllib.request.urlretrieve(url, file_path)

In [4]:
from src.text_normalization import clean_text_full

# Full cleaning pipeline
df['post_text'] = df['post_text'].apply(
    lambda x: clean_text_full(
        x,
        remove_urls_flag=True,
        remove_emojis_flag=True,
        remove_hashtags=True,
        remove_punctuation_flag=True
    )
)

In [5]:
import pandas as pd
from src.date_extraction import process_dataframe_dates
from src.species_extraction import process_dataframe_species
df = process_dataframe_dates(df, text_column='post_text', date_column='extracted_date')
df = process_dataframe_species(
    df, 
    reference_csv_path='data/main_butterfly_list.csv',
    text_column='post_text'
)

In [7]:
df = df.dropna(subset=['Butterfly_Genus', 'Butterfly_Species', 'Butterfly_Common_Name'], how='all')


In [8]:
from src.species_extraction import fill_genus_species

fill_genus_species(df)

In [13]:
import regex as re
df['Butterfly_Species'] = df.apply(
    lambda row: 'sp.' 
        if (pd.isna(row['Butterfly_Species']) or row['Butterfly_Species'].strip() == '') 
        and not (pd.isna(row['Butterfly_Genus']) or row['Butterfly_Genus'].strip() == '') 
        and re.search(r'\bsp\.?\b', row['post_text'], flags=re.IGNORECASE) 
        else row['Butterfly_Species'], 
    axis=1
)

df = df.dropna(subset=['Butterfly_Genus', 'Butterfly_Species', 'Butterfly_Common_Name'], how='all')
df = df.dropna(subset=['Butterfly_Genus', 'Butterfly_Common_Name'], how='all')
df = df[~(df['Butterfly_Genus'].isna() & df['Butterfly_Species'].notna() & df['Butterfly_Common_Name'].notna())]


In [15]:
from src.location_extraction import process_dataframe_locations
df = process_dataframe_locations(
    df,
    geonames_filepath='data/BD.txt',
    text_column='post_text'
)


In [17]:
df['Common_Name_Clean'] = df['Butterfly_Common_Name'].str.lower().str.replace("-", " ").str.strip()

# Build a lookup dictionary: common_name -> unique (genus, species) tuples
lookup = (
    df.dropna(subset=['Butterfly_Genus','Butterfly_Species'])
      .groupby('Common_Name_Clean')[['Butterfly_Genus','Butterfly_Species']]
      .agg(lambda x: set(x))
)

# Function to fill empty genus/species if unambiguous
def fill_genus_species(row):
    if pd.isna(row['Butterfly_Genus']) and pd.isna(row['Butterfly_Species']):
        common_name = row['Common_Name_Clean']
        if common_name in lookup.index:
            genera = lookup.loc[common_name, 'Butterfly_Genus']
            species = lookup.loc[common_name, 'Butterfly_Species']
            if len(genera) == 1 and len(species) == 1:
                row['Butterfly_Genus'] = list(genera)[0]
                row['Butterfly_Species'] = list(species)[0]
    return row

# Apply function row-wise
df = df.apply(fill_genus_species, axis=1)

# Optional: drop helper column
df = df.drop(columns=['Common_Name_Clean'])


In [19]:

# Drop duplicates based on 'post_text' AND 'extracted_date'
df = df.drop_duplicates(subset=['post_text', 'extracted_date'], keep='first')

In [23]:
df['Butterfly_Species'] = df['Butterfly_Species'].fillna('sp.')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Butterfly_Species'] = df['Butterfly_Species'].fillna('sp.')


In [24]:
df = df.dropna(subset=['Butterfly_Common_Name','Butterfly_Genus'])

# Reset the index
df = df.reset_index(drop=True)

In [25]:
import pandas as pd
import re
from dateutil.parser import parse


df['extracted_date'] = df['extracted_date'].astype(str).str.replace("'", " ", regex=False)





months = {
    'jan':1, 'january':1,
    'feb':2, 'february':2,
    'mar':3, 'march':3,
    'apr':4, 'april':4,
    'may':5,
    'jun':6, 'june':6,
    'jul':7, 'july':7,
    'aug':8, 'august':8,
    'sep':9, 'sept':9, 'september':9,
    'oct':10, 'october':10,
    'nov':11, 'november':11,
    'dec':12, 'december':12
}

def extract_month_year(text):
    text = str(text).lower().strip()
    
    # Remove ordinal suffixes like st, nd, rd, th
    text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', text)
    
    month = "Unknown"
    year = "Unknown"
    
    # 1️⃣ Match optional day + month + optional year
    match = re.search(r'(?:\d{1,2}[\s,.-]*)?([a-z]+)[\s,.-]*(\d{2,4})?', text)
    if match:
        month_str, year_str = match.groups()
        month = months.get(month_str[:3], "Unknown")
        
        if year_str:
            year = int(year_str)
            if year < 100:  # 2-digit → 20XX
                year += 2000
    
    # 2️⃣ Only check numbers if NO month exists
    if month == "Unknown":
        numbers = re.findall(r'\b\d{2,4}\b', text)
        if len(numbers) == 1:
            num = int(numbers[0])
            if num < 100:
                year = 2000 + num
            else:
                year = num
            if not (2010 < year < 2025):
                year = "Unknown"
    
    # 3️⃣ If both still unknown, try dateutil.parser
    if month == "Unknown" and year == "Unknown":
        try:
            dt = parse(text, fuzzy=True, dayfirst=True)
            month = dt.month
            year = dt.year
        except:
            pass
    
    return pd.Series([month, year])



# Apply to your existing column
df[['month', 'year']] = df['extracted_date'].apply(extract_month_year)
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Drop rows where year < 2000
df = df[df['year'].isna() | (df['year'] >= 2000)]
df['year'] = df['year'].fillna("Unknown")


new_order = ['PostID', 'post_text', 'extracted_date', 'month', 'year',
             'Butterfly_Genus', 'Butterfly_Species', 'Butterfly_Common_Name',
             'Location', 'lat', 'lon']
df = df[new_order]


In [None]:
import pandas as pd
df=pd.read_csv('finalbutterfly.csv')