## Import libraries

In [69]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing

In [70]:
# Path to the renamed USWTDB export, relative to the notebook's working directory.
# A forward slash is used as the separator because it is accepted on Windows,
# macOS and Linux alike; a backslash is only a path separator on Windows.
filename = 'Use this dataset/uswtdb_v5_0_20220427_rename.csv'

# Read the csv file in a single pass (low_memory=False) so each column's dtype is
# inferred from all of its values instead of chunk by chunk; chunked inference is
# what makes pandas warn about mixed types on large files.
# NOTE(review): the warning printed under this cell looks like that mixed-type
# warning - confirm it disappears with this setting.
wind_turbine_df = pd.read_csv(filename, low_memory=False)

# check number of rows and columns
wind_turbine_df.shape

  wind_turbine_df = pd.read_csv(filename)


(71784, 124)

In [71]:
# List the column labels of the dataframe to see what is available before
# selecting a subset (pandas abbreviates the middle of a long index with "...")
wind_turbine_df.columns

Index(['Plant Id', 'Combined Heat And\nPower Plant', 'Nuclear Unit Id',
       'Plant Name', 'Operator Name', 'Operator Id', 'Plant State',
       'Census Region', 'NERC Region', 'Reserved',
       ...
       'rotor_swept area (m2)', 'max_height (m)', 'retrofit (y/n)',
       'retrofit_year', 'attribute_confidence_level (1/2/3)',
       'location_confidence_level (1/2/3)', 'location_verification_date',
       'image_source', 'xlong', 'ylat'],
      dtype='object', length=124)

In [72]:
# Keep only the columns needed downstream: plant/project identifiers, location,
# operational year, and the turbine-count and capacity figures.
# Deliberately left out: the FAA/USGS ids, state_county_id, manufacturer and
# model_name, the turbine dimension columns, the retrofit columns, the
# confidence-level columns, location_verification_date and image_source.
kept_columns = [
    'Plant Id', 'eia_id', 'state', 'county',
    'project_name', 'operational_year', 'project_turbine_num',
    'cumulative_capacity (MW)',
    'turbine_capacity (KW)',
    'xlong', 'ylat',
]
wind_turbine_df = wind_turbine_df[kept_columns]

# number of rows that are exact duplicates of an earlier row
wind_turbine_df.duplicated().sum()

1

In [73]:
# Display the rows flagged as duplicates (each repeat after the first occurrence)
is_duplicate_row = wind_turbine_df.duplicated()
wind_turbine_df.loc[is_duplicate_row]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
63989,99999.0,,,,,,,,,,


In [74]:
# Remove every row whose Plant Id equals 99999.0
# (rows with a missing Plant Id do not match and are therefore kept here)
has_plant_id_99999 = wind_turbine_df['Plant Id'].eq(99999.0)
wind_turbine_df = wind_turbine_df.loc[~has_plant_id_99999]

# confirm that no duplicated rows remain
wind_turbine_df.duplicated().sum()

0

In [75]:
# Count missing values per column and list only the columns that have any,
# largest count first
null_counts = wind_turbine_df.isna().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

Plant Id                    7794
eia_id                      6205
turbine_capacity (KW)       4439
cumulative_capacity (MW)    3451
operational_year             679
state                        116
county                       116
project_name                 116
project_turbine_num          116
xlong                        116
ylat                         116
dtype: int64

In [76]:
# Keep only the rows that have a Plant Id
wind_turbine_df = wind_turbine_df.dropna(subset=['Plant Id'])

# Re-count missing values per column (columns with at least one, largest first)
null_counts = wind_turbine_df.isna().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

turbine_capacity (KW)       871
operational_year            118
cumulative_capacity (MW)    118
eia_id                      116
state                       116
county                      116
project_name                116
project_turbine_num         116
xlong                       116
ylat                        116
dtype: int64

In [77]:
# Inspect the rows that have no eia_id
missing_eia_id = wind_turbine_df['eia_id'].isna()
wind_turbine_df.loc[missing_eia_id]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
0,1.0,,,,,,,,,,
1433,52142.0,,,,,,,,,,
1506,54299.0,,,,,,,,,,
1507,54300.0,,,,,,,,,,
1547,54647.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
63983,65059.0,,,,,,,,,,
63984,65143.0,,,,,,,,,,
63985,65478.0,,,,,,,,,,
63986,65479.0,,,,,,,,,,


In [78]:
# Keep only the rows that have an eia_id
wind_turbine_df = wind_turbine_df.dropna(subset=['eia_id'])

# Re-count missing values per column (columns with at least one, largest first)
null_counts = wind_turbine_df.isna().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

turbine_capacity (KW)       755
operational_year              2
cumulative_capacity (MW)      2
dtype: int64

In [79]:
# Verify that Plant Id and eia_id agree on every row by listing the rows where
# they differ; an empty result means the two columns always match
ids_differ = wind_turbine_df['Plant Id'].ne(wind_turbine_df['eia_id'])
wind_turbine_df.loc[ids_differ]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat


In [80]:
# Inspect the rows that have no operational_year
missing_operational_year = wind_turbine_df['operational_year'].isna()
wind_turbine_df.loc[missing_operational_year]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
973,50532.0,50532.0,CA,Kern County,Historic Victory Garden (Wind Stream),,215.0,17.0,,-118.375799,35.074235
1017,50532.0,50532.0,CA,Kern County,Historic Victory Garden (Wind Stream),,215.0,17.0,,-118.37561,35.074281


In [81]:
# Set operational_year = 1982 on every row of project
# "Historic Victory Garden (Wind Stream)". Sources:
#   http://www.windstreamproperties.com/victory-garden.html - proj details
#   https://eerscmap.usgs.gov/uswtdb/viewer/#13.39/35.06889/-118.38267 - proj operational date
is_victory_garden = wind_turbine_df['project_name'].eq('Historic Victory Garden (Wind Stream)')
wind_turbine_df.loc[is_victory_garden, 'operational_year'] = 1982

# Re-count missing values per column (columns with at least one, largest first)
null_counts = wind_turbine_df.isna().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

turbine_capacity (KW)       755
cumulative_capacity (MW)      2
dtype: int64

In [82]:
# Inspect the rows that have no cumulative_capacity (MW)
missing_cumulative_capacity = wind_turbine_df['cumulative_capacity (MW)'].isna()
wind_turbine_df.loc[missing_cumulative_capacity]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
57401,62442.0,62442.0,MN,Freeborn County,"Lake Region Community Hybrid, LLC",2018.0,1.0,,,-93.307907,43.580891
58931,62642.0,62642.0,TX,Lubbock County,unknown Lubbock County,2015.0,6.0,,,-102.056847,33.591496


In [83]:
# Patch project "Lake Region Community Hybrid, LLC":
# operational_year = 2020 and turbine_capacity (KW) = 2500. Sources:
#   https://www.albertleatribune.com/2020/08/county-board-gives-go-ahead-for-new-wind-turbine-project/ - proj details
#   https://eerscmap.usgs.gov/uswtdb/viewer/#10.02/43.6746/-93.3486
# NOTE(review): 'cumulative_capacity (MW)' is not part of this assignment, so it
# stays null for this project - confirm that is intended.
is_lake_region = wind_turbine_df['project_name'].eq('Lake Region Community Hybrid, LLC')
patched_columns = ['operational_year', 'turbine_capacity (KW)']
wind_turbine_df.loc[is_lake_region, patched_columns] = [2020, 2500]

# display the patched row(s) to confirm the new values
wind_turbine_df.loc[is_lake_region]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
57401,62442.0,62442.0,MN,Freeborn County,"Lake Region Community Hybrid, LLC",2020.0,1.0,,2500.0,-93.307907,43.580891


In [84]:
# Both new columns are per-project aggregates broadcast back onto every turbine row
per_project = wind_turbine_df.groupby('project_name')

# project_turbine_capacity (MW): sum of turbine_capacity (KW) within the project, converted to MW
project_capacity_mw = per_project['turbine_capacity (KW)'].transform('sum').div(1000)

# 2021_project_turbine_num: number of rows (turbines) recorded for the project
project_row_count = per_project['project_name'].transform('count')

wind_turbine_df['project_turbine_capacity (MW)'] = project_capacity_mw
wind_turbine_df['2021_project_turbine_num'] = project_row_count

# print first 5 rows
wind_turbine_df.head()

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,project_turbine_capacity (MW),2021_project_turbine_num
1,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.426483,64.571999,0.75,15
2,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.427277,64.571198,0.75,15
3,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43368,64.565158,0.75,15
4,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.425278,64.572205,0.75,15
5,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43338,64.563202,0.75,15


In [85]:
# Show the rows of project 'Nome' to compare its computed columns with the reported ones
is_nome = wind_turbine_df['project_name'].eq('Nome')
wind_turbine_df.loc[is_nome]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,project_turbine_capacity (MW),2021_project_turbine_num
1,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.426483,64.571999,0.75,15
2,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.427277,64.571198,0.75,15
3,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43368,64.565158,0.75,15
4,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.425278,64.572205,0.75,15
5,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43338,64.563202,0.75,15
6,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.427475,64.569099,0.75,15
7,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.42868,64.567902,0.75,15
8,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.427475,64.571701,0.75,15
9,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.431473,64.562302,0.75,15
10,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.434866,64.563367,0.75,15


In [86]:
# Count the distinct projects that have at least one row where the reported
# project_turbine_num differs from the computed 2021_project_turbine_num
counts_disagree = wind_turbine_df['project_turbine_num'].ne(wind_turbine_df['2021_project_turbine_num'])
wind_turbine_df.loc[counts_disagree, 'project_name'].nunique()

234

In [104]:
# After de-duplicating the (project_name, cumulative_capacity (MW), project_turbine_num)
# combinations, count the non-null entries per project and show the five projects
# with the most cumulative_capacity (MW) entries
(
    wind_turbine_df[['project_name', 'cumulative_capacity (MW)', 'project_turbine_num']]
    .drop_duplicates()
    .groupby('project_name')
    .count()
    .sort_values(by='cumulative_capacity (MW)', ascending=False)
    .head()
)

Unnamed: 0_level_0,cumulative_capacity (MW),project_turbine_num
project_name,Unnamed: 1_level_1,Unnamed: 2_level_1
McNeilus,5,5
Oak Creek Energy Systems,4,4
Mars Hill,3,3
Stateline Wind Project,3,3
Crow Lake,3,3


In [100]:
# Distinct (project_name, cumulative_capacity (MW), project_turbine_num) combinations;
# the first occurrence of each combination is the one kept
metric_columns = ['project_name', 'cumulative_capacity (MW)', 'project_turbine_num']
unique_project_metrics = wind_turbine_df[metric_columns].drop_duplicates()

# inspect the combinations recorded for project 'McNeilus'
is_mcneilus = unique_project_metrics['project_name'].eq('McNeilus')
unique_project_metrics.loc[is_mcneilus]

Unnamed: 0,project_name,cumulative_capacity (MW),project_turbine_num
4874,McNeilus,22.8,24.0
4875,McNeilus,9.0,6.0
4883,McNeilus,9.0,10.0
4911,McNeilus,1.65,1.0
7824,McNeilus,9.9,6.0


In [102]:
# Rejected approach, kept commented out for reference: add a column for
# 2021_cumulative_capacity (MW) by summing the unique cumulative_capacity (MW) grouped by project_name
# wind_turbine_df['2021_cumulative_capacity (MW)'] = unique_project_metrics.groupby('project_name')['cumulative_capacity (MW)'].transform('sum')

# check for project_name = 'McNeilus'
# wind_turbine_df[wind_turbine_df['project_name'] == 'McNeilus'].head(1)
# Sanity check using 'McNeilus': it doesn't make sense for 1 additional turbine to add 22.80 MW,
# hence the max of cumulative_capacity (MW) is used instead of the sum

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,project_turbine_capacity (MW),2021_project_turbine_num,2021_cumulative_capacity (MW)
4874,56054.0,56054.0,MN,Dodge County,McNeilus,2003.0,24.0,22.8,950.0,-92.851089,43.978493,52.35,47,52.35


In [105]:
# Inspect the distinct capacity / turbine-count combinations for 'Oak Creek Energy Systems'.
# Oak Creek Energy Systems further justifies using the max of cumulative_capacity (MW)
# for 2021_cumulative_capacity (MW)
is_oak_creek = unique_project_metrics['project_name'].eq('Oak Creek Energy Systems')
unique_project_metrics.loc[is_oak_creek]

Unnamed: 0,project_name,cumulative_capacity (MW),project_turbine_num
1263,Oak Creek Energy Systems,5.95,8.0
1266,Oak Creek Energy Systems,0.9,12.0
1267,Oak Creek Energy Systems,1.296,12.0
1272,Oak Creek Energy Systems,0.07,1.0


In [107]:
# Per-project maxima, broadcast back onto every turbine row
per_project = wind_turbine_df.groupby('project_name')
max_capacity_mw = per_project['cumulative_capacity (MW)'].transform('max')
max_turbine_num = per_project['project_turbine_num'].transform('max')

# 2021_cumulative_capacity (MW): largest cumulative_capacity (MW) reported for the project
wind_turbine_df['2021_cumulative_capacity (MW)'] = max_capacity_mw

# 2021_project_turbine_num: largest project_turbine_num reported for the project
# (this replaces the row-count-based value stored in this column earlier)
wind_turbine_df['2021_project_turbine_num'] = max_turbine_num

# check using Oak Creek Energy Systems
is_oak_creek = wind_turbine_df['project_name'].eq('Oak Creek Energy Systems')
wind_turbine_df.loc[is_oak_creek].head(1)

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,project_turbine_capacity (MW),2021_project_turbine_num,2021_cumulative_capacity (MW)
1263,50754.0,50754.0,CA,Kern County,Oak Creek Energy Systems,2002.0,8.0,5.95,1500.0,-118.364388,35.034294,6.423,12.0,5.95


In [108]:
# Drop project_turbine_capacity (MW) as it is no longer needed.
# The result is reassigned instead of dropping in place, and errors='ignore' is
# passed, so re-running this cell after the column is already gone is a no-op
# rather than a KeyError.
wind_turbine_df = wind_turbine_df.drop(columns=['project_turbine_capacity (MW)'], errors='ignore')

# check first row
wind_turbine_df.head(1)

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,2021_project_turbine_num,2021_cumulative_capacity (MW)
1,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.426483,64.571999,18.0,0.9


In [109]:
# Derive the output path from the input path by swapping '_rename.csv' for '_proj_capacity.csv'
output_filename = filename.replace('_rename.csv', '_proj_capacity.csv')

# If the input name does not contain '_rename.csv', str.replace returns it unchanged
# and to_csv would overwrite the input file - refuse to do that.
if output_filename == filename:
    raise ValueError(
        f"Expected {filename!r} to contain '_rename.csv'; refusing to overwrite the input file"
    )

# save to csv without writing the dataframe index as a column
wind_turbine_df.to_csv(output_filename, index=False)