In [1]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/refs/heads/master/data/2021/2021-07-20/drought.csv"
data = pd.read_csv(url)

In [2]:
# Step 1: Convert dates to datetime and extract year for grouping
data['valid_start'] = pd.to_datetime(data['valid_start'])
data['year'] = data['valid_start'].dt.year

In [3]:
# Here's an example lookup dictionary with some state abbreviations and their FIPS codes
state_fips_mapping = {
    'AK': 2, 'AL': 1, 'AR': 5, 'AZ': 4, 'CA': 6, 'CO': 8, 'CT': 9, 'DE': 10, 
    'FL': 12, 'GA': 13, 'HI': 15, 'ID': 16, 'IL': 17, 'IN': 18, 'IA': 19,
    # Continue with all states
    'WI': 55, 'WV': 54, 'WY': 56
}

# Step 1: Map state abbreviations to FIPS codes using the lookup dictionary
data['state_fips'] = data['state_abb'].map(state_fips_mapping)


In [4]:
data.dropna(subset=['state_fips', 'drought_lvl', 'area_pct'], inplace=True)

In [5]:
# Step 4: Prepare data for "All Years," "Decade 1," and "Decade 2" aggregation
# Aggregate by state and drought level for each time range

# "All Years" (2001-2021)
all_years_summary = data.groupby(['state_fips', 'drought_lvl']).agg({
    'area_pct': 'mean'
}).reset_index()
all_years_summary['time_range'] = '2001-2021'

# "Decade 1" (2001-2010)
decade_1 = data[(data['year'] >= 2001) & (data['year'] <= 2010)]
decade_1_summary = decade_1.groupby(['state_fips', 'drought_lvl']).agg({
    'area_pct': 'mean'
}).reset_index()
decade_1_summary['time_range'] = '2001-2010'

# "Decade 2" (2011-2021)
decade_2 = data[(data['year'] >= 2011) & (data['year'] <= 2021)]
decade_2_summary = decade_2.groupby(['state_fips', 'drought_lvl']).agg({
    'area_pct': 'mean'
}).reset_index()
decade_2_summary['time_range'] = '2011-2021'

In [6]:
# Combine summaries for filtering in Vega-Lite
combined_summary = pd.concat([all_years_summary, decade_1_summary, decade_2_summary], ignore_index=True)


In [8]:
# Step 5: Prepare data for pie chart (proportions of drought levels)
pie_data = data.groupby(['drought_lvl', 'time_range']).agg({
    'area_pct': 'mean'
}).reset_index()

KeyError: 'time_range'

In [11]:
# Step 1: Filter out rows where drought level is "None"
data = data[data['drought_lvl'] != 'None']

# Step 2: Add the 'time_range' column to the main dataset based on year ranges

# "All Years" (2001-2021)
data['time_range'] = '2001-2021'

# Set time_range for Decade 1 (2001-2010)
data.loc[(data['year'] >= 2001) & (data['year'] <= 2010), 'time_range'] = '2001-2010'

# Set time_range for Decade 2 (2011-2021)
data.loc[(data['year'] >= 2011) & (data['year'] <= 2021), 'time_range'] = '2011-2021'

# Step 3: Now proceed with preparing pie chart data
pie_data = data.groupby(['drought_lvl', 'time_range']).agg({
    'area_pct': 'mean'
}).reset_index()

# Verify the structure of pie_data
print(pie_data.head())


  drought_lvl time_range   area_pct
0          D0  2001-2010  17.562799
1          D0  2011-2021  16.241132
2          D1  2001-2010  10.690936
3          D1  2011-2021  10.252975
4          D2  2001-2010   8.253439


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['time_range'] = '2001-2021'


In [12]:
# Step 6: Prepare data for trend line chart (2001-2021 yearly trends)
trend_data = data.groupby(['year', 'drought_lvl']).agg({
    'area_pct': 'mean'
}).reset_index()


In [13]:
# Export datasets for visualization
combined_summary.to_csv('combined_drought_summary.csv', index=False)
pie_data.to_csv('pie_drought_data.csv', index=False)
trend_data.to_csv('trend_drought_data.csv', index=False)

In [14]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/SwaroopG28/Drought-Dataset---IV-Assignment/refs/heads/main/combined_drought_summary.csv"
data = pd.read_csv(url)

# Convert 'state_fips' column to integer
data['state_fips'] = data['state_fips'].fillna(0).astype(int)

# Check to ensure conversion
print(data['state_fips'].head())

# Export to a CSV file on your local machine
data.to_csv("cleaned_drought_data.csv", index=False)
print("Data has been exported as 'cleaned_drought_data.csv'")


0    1
1    1
2    1
3    1
4    1
Name: state_fips, dtype: int32
Data has been exported as 'cleaned_drought_data.csv'
