In [1]:
import pandas as pd

In [2]:
# Loading Lake Mead CSV file
lake_mead_load = "../Resources/Lake_Mead_Storage.csv"

# Reading the Lake Mead data
lake_mead_df = pd.read_csv(lake_mead_load)
lake_mead_df.head()


Unnamed: 0,datetime,storage
0,5/28/1937,10046048.3
1,5/29/1937,10138665.0
2,5/30/1937,10228958.7
3,5/31/1937,10302056.6
4,6/1/1937,10373112.5


In [3]:
# Get data types
print (lake_mead_df.dtypes)

datetime     object
storage     float64
dtype: object


In [4]:
# Convert the 'datetime' column to datetime format
lake_mead_df['datetime'] = pd.to_datetime(lake_mead_df['datetime'])
lake_mead_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31295 entries, 0 to 31294
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  31295 non-null  datetime64[ns]
 1   storage   31295 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 489.1 KB


In [5]:
# Extract month and year
lake_mead_df['year'] = lake_mead_df['datetime'].dt.year
lake_mead_df['month'] = lake_mead_df['datetime'].dt.month
lake_mead_df.head()

Unnamed: 0,datetime,storage,year,month
0,1937-05-28,10046048.3,1937,5
1,1937-05-29,10138665.0,1937,5
2,1937-05-30,10228958.7,1937,5
3,1937-05-31,10302056.6,1937,5
4,1937-06-01,10373112.5,1937,6


In [6]:
# Verify the 'datetime' conversion
lake_mead_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31295 entries, 0 to 31294
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  31295 non-null  datetime64[ns]
 1   storage   31295 non-null  float64       
 2   year      31295 non-null  int64         
 3   month     31295 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 978.1 KB


In [7]:
# Change the 'month' from a number to actual abbreviated name
lake_mead_df['month'] = pd.to_datetime(lake_mead_df['month'], format='%m').dt.month_name().str.slice(stop=3)
lake_mead_df.head()

Unnamed: 0,datetime,storage,year,month
0,1937-05-28,10046048.3,1937,May
1,1937-05-29,10138665.0,1937,May
2,1937-05-30,10228958.7,1937,May
3,1937-05-31,10302056.6,1937,May
4,1937-06-01,10373112.5,1937,Jun


In [8]:
# Reorder the dataframe columns
new_lake_mead_df = lake_mead_df.iloc[:,[3,2,1,0]]
new_lake_mead_df.head()

Unnamed: 0,month,year,storage,datetime
0,May,1937,10046048.3,1937-05-28
1,May,1937,10138665.0,1937-05-29
2,May,1937,10228958.7,1937-05-30
3,May,1937,10302056.6,1937-05-31
4,Jun,1937,10373112.5,1937-06-01


In [9]:
# Drop 'datetime' column
new_lake_mead_df = new_lake_mead_df.drop(['datetime'], axis=1)
new_lake_mead_df.head(10)

Unnamed: 0,month,year,storage
0,May,1937,10046048.3
1,May,1937,10138665.0
2,May,1937,10228958.7
3,May,1937,10302056.6
4,Jun,1937,10373112.5
5,Jun,1937,10446189.6
6,Jun,1937,10532678.7
7,Jun,1937,10630357.4
8,Jun,1937,10725404.3
9,Jun,1937,10808609.1


In [12]:
# For Lake Mead calculate the storage average by month and year
final_lake_mead_df = new_lake_mead_df.groupby(['year', 'month'], as_index=False).mean()

final_lake_mead_df.head(20)

Unnamed: 0,year,month,storage
0,1937,Aug,12360730.0
1,1937,Dec,11839440.0
2,1937,Jul,12262270.0
3,1937,Jun,11289880.0
4,1937,May,10178930.0
5,1937,Nov,11884180.0
6,1937,Oct,11978390.0
7,1937,Sep,12123470.0
8,1938,Apr,12368300.0
9,1938,Aug,20252190.0
