## Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing

In [14]:
# Location of the renamed USWTDB export. A forward slash is used as the
# directory separator because it resolves on Windows, macOS and Linux alike,
# whereas the backslash form only resolves on Windows.
filename = 'Use this dataset/uswtdb_v5_0_20220427_rename.csv'

# Read the file in a single pass (low_memory=False) so that each column's
# dtype is inferred from all of its values rather than chunk by chunk.
# NOTE(review): the stray "wind_turbine_df = pd.read_csv(filename)" line in
# this cell's saved output looks like the tail of a pandas DtypeWarning about
# mixed-type columns, which is what this option addresses - confirm on re-run.
wind_turbine_df = pd.read_csv(filename, low_memory=False)

# check number of rows and columns
wind_turbine_df.shape

  wind_turbine_df = pd.read_csv(filename)


(71787, 124)

In [15]:
# Display the column labels of the freshly loaded frame (pandas abbreviates
# the middle of long label lists with "...").
wind_turbine_df.columns

Index(['Plant Id', 'Combined Heat And\nPower Plant', 'Nuclear Unit Id',
       'Plant Name', 'Operator Name', 'Operator Id', 'Plant State',
       'Census Region', 'NERC Region', 'Reserved',
       ...
       'rotor_swept area (m2)', 'max_height (m)', 'retrofit (y/n)',
       'retrofit_year', 'attribute_confidence_level (1/2/3)',
       'location_confidence_level (1/2/3)', 'location_verification_date',
       'image_source', 'xlong', 'ylat'],
      dtype='object', length=124)

In [16]:
# Keep only the identifier, location, project and capacity columns used in
# the rest of this section; every other column of the raw file is dropped.
columns_to_keep = [
    'Plant Id', 'eia_id', 'state', 'county',
    'project_name', 'operational_year', 'project_turbine_num',
    'cumulative_capacity (MW)',
    'turbine_capacity (KW)',
    'xlong', 'ylat',
]
wind_turbine_df = wind_turbine_df[columns_to_keep]

# number of rows that exactly repeat an earlier row of the reduced frame
wind_turbine_df.duplicated().sum()

1

In [17]:
# list every row that exactly repeats an earlier row
is_repeat = wind_turbine_df.duplicated()
wind_turbine_df.loc[is_repeat]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
64538,99999.0,,,,,,,,,,


In [20]:
# Discard every row whose Plant Id is 99999.0 (the id carried by the
# duplicated, otherwise empty row shown above). Rows with a missing
# Plant Id are not matched by the comparison and therefore stay.
is_placeholder_plant = wind_turbine_df['Plant Id'].eq(99999.0)
wind_turbine_df = wind_turbine_df[~is_placeholder_plant]

# confirm that no duplicated rows remain
wind_turbine_df.duplicated().sum()

0

In [22]:
# missing values per column, largest count first; complete columns are omitted
missing_per_column = wind_turbine_df.isna().sum()
missing_per_column[missing_per_column > 0].sort_values(ascending=False)

Plant Id                    7248
eia_id                      6208
turbine_capacity (KW)       4442
cumulative_capacity (MW)    3454
operational_year             682
state                        119
county                       119
project_name                 119
project_turbine_num          119
xlong                        119
ylat                         119
dtype: int64

In [23]:
# keep only the rows that carry a Plant Id
wind_turbine_df = wind_turbine_df.dropna(subset=['Plant Id'])

# recount the missing values per column (complete columns are omitted)
remaining_nulls = wind_turbine_df.isna().sum()
remaining_nulls[remaining_nulls > 0].sort_values(ascending=False)

turbine_capacity (KW)       874
operational_year            121
cumulative_capacity (MW)    121
eia_id                      119
state                       119
county                      119
project_name                119
project_turbine_num         119
xlong                       119
ylat                        119
dtype: int64

In [24]:
# rows that have no eia_id
lacks_eia_id = wind_turbine_df['eia_id'].isna()
wind_turbine_df.loc[lacks_eia_id]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
0,1.0,,,,,,,,,,
44,2240.0,,,,,,,,,,
1869,52142.0,,,,,,,,,,
1942,54299.0,,,,,,,,,,
1943,54300.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
64532,65143.0,,,,,,,,,,
64533,65316.0,,,,,,,,,,
64534,65478.0,,,,,,,,,,
64535,65479.0,,,,,,,,,,


In [25]:
# Remove the rows whose eia_id is null.
# .copy() makes the result an independent frame: the cells that follow assign
# into it (.loc[...] = ..., new columns), and assigning into the product of a
# filter is what makes pandas emit SettingWithCopyWarning.
wind_turbine_df = wind_turbine_df[wind_turbine_df['eia_id'].notna()].copy()

# check null values
wind_turbine_df.isnull().sum()[lambda x: x > 0].sort_values(ascending=False)

turbine_capacity (KW)       755
operational_year              2
cumulative_capacity (MW)      2
dtype: int64

In [26]:
# rows where Plant Id and eia_id disagree; an empty result means the two
# identifiers match on every remaining row
ids_differ = wind_turbine_df['Plant Id'].ne(wind_turbine_df['eia_id'])
wind_turbine_df.loc[ids_differ]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat


In [27]:
# rows that have no operational_year
lacks_operational_year = wind_turbine_df['operational_year'].isna()
wind_turbine_df.loc[lacks_operational_year]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
977,50532.0,50532.0,CA,Kern County,Historic Victory Garden (Wind Stream),,215.0,17.0,,-118.375799,35.074235
1021,50532.0,50532.0,CA,Kern County,Historic Victory Garden (Wind Stream),,215.0,17.0,,-118.37561,35.074281


In [28]:
# "Historic Victory Garden (Wind Stream)" has no operational_year in the data;
# set it to 1982 for every row of that project.
# project details:           http://www.windstreamproperties.com/victory-garden.html
# project operational date:  https://eerscmap.usgs.gov/uswtdb/viewer/#13.39/35.06889/-118.38267
is_victory_garden = wind_turbine_df['project_name'].eq('Historic Victory Garden (Wind Stream)')
wind_turbine_df.loc[is_victory_garden, 'operational_year'] = 1982

# recount the missing values per column (complete columns are omitted)
nulls_after_fill = wind_turbine_df.isna().sum()
nulls_after_fill[nulls_after_fill > 0].sort_values(ascending=False)

turbine_capacity (KW)       755
cumulative_capacity (MW)      2
dtype: int64

In [29]:
# rows that have no cumulative_capacity (MW)
lacks_cumulative_capacity = wind_turbine_df['cumulative_capacity (MW)'].isna()
wind_turbine_df.loc[lacks_cumulative_capacity]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
57947,62442.0,62442.0,MN,Freeborn County,"Lake Region Community Hybrid, LLC",2018.0,1.0,,,-93.307907,43.580891
59477,62642.0,62642.0,TX,Lubbock County,unknown Lubbock County,2015.0,6.0,,,-102.056847,33.591496


In [32]:
# For "Lake Region Community Hybrid, LLC" overwrite operational_year with 2020
# and turbine_capacity (KW) with 2500.
# project details: https://www.albertleatribune.com/2020/08/county-board-gives-go-ahead-for-new-wind-turbine-project/
# map viewer:      https://eerscmap.usgs.gov/uswtdb/viewer/#10.02/43.6746/-93.3486
is_lake_region = wind_turbine_df['project_name'] == 'Lake Region Community Hybrid, LLC'
wind_turbine_df.loc[is_lake_region, ['operational_year', 'turbine_capacity (KW)']] = [2020, 2500]

# display the updated rows of that project
wind_turbine_df[is_lake_region]

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat
57947,62442.0,62442.0,MN,Freeborn County,"Lake Region Community Hybrid, LLC",2020.0,1.0,,2500.0,-93.307907,43.580891


In [33]:
# project_turbine_capacity (MW): sum of turbine_capacity (KW) over all rows
# sharing a project_name, divided by 1000 to express it in MW
capacity_kw_by_project = wind_turbine_df.groupby('project_name')['turbine_capacity (KW)']
wind_turbine_df['project_turbine_capacity (MW)'] = capacity_kw_by_project.transform('sum') / 1000

# 2021_project_turbine_num: number of rows recorded for each project_name
names_by_project = wind_turbine_df.groupby('project_name')['project_name']
wind_turbine_df['2021_project_turbine_num'] = names_by_project.transform('count')

# preview the first 5 rows with the two new columns
wind_turbine_df.head()

Unnamed: 0,Plant Id,eia_id,state,county,project_name,operational_year,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),xlong,ylat,project_turbine_capacity (MW),2021_project_turbine_num
1,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.426483,64.571999,0.75,15
2,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.427277,64.571198,0.75,15
3,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43368,64.565158,0.75,15
4,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.425278,64.572205,0.75,15
5,90.0,90.0,AK,Nome Census Area,Nome,2008.0,18.0,0.9,50.0,-165.43338,64.563202,0.75,15


In [None]:
# rows belonging to eia_id 62642
is_eia_62642 = wind_turbine_df['eia_id'].eq(62642)
wind_turbine_df.loc[is_eia_62642]

In [None]:
# Convert cumulative_capacity (MW) to whole megawatts.
# A plain astype('int64') cannot be used here: the column still holds missing
# values (e.g. the "Lake Region Community Hybrid, LLC" row displayed earlier),
# which makes the cast raise, and it holds fractional values such as 0.9 that
# a plain integer cast would cut down to 0. Rounding first and then casting to
# the nullable 'Int64' dtype keeps missing entries as <NA> and maps 0.9 to 1.
wind_turbine_df['cumulative_capacity (MW)'] = (
    wind_turbine_df['cumulative_capacity (MW)'].round().astype('Int64')
)

In [None]:
# number of rows in which cumulative_capacity (MW) and turbine_capacity (KW)
# are both missing
cumulative_missing = wind_turbine_df['cumulative_capacity (MW)'].isna()
turbine_missing = wind_turbine_df['turbine_capacity (KW)'].isna()
len(wind_turbine_df[cumulative_missing & turbine_missing])


In [None]:
# rows in which turbine_capacity (KW) and cumulative_capacity (MW) are both missing
no_turbine_capacity = wind_turbine_df['turbine_capacity (KW)'].isna()
no_cumulative_capacity = wind_turbine_df['cumulative_capacity (MW)'].isna()
wind_turbine_df.loc[no_turbine_capacity & no_cumulative_capacity]

In [None]:
# filter eia_id = 62642.0

In [None]:
# (rows, columns) of the subset where turbine_capacity (KW) is missing but
# cumulative_capacity (MW) is present
kw_missing = wind_turbine_df['turbine_capacity (KW)'].isna()
mw_present = wind_turbine_df['cumulative_capacity (MW)'].notna()
wind_turbine_df.loc[kw_missing & mw_present].shape

In [None]:
# rows with eia_id 10597.0 whose project_turbine_num is 236.0
matches_eia_id = wind_turbine_df['eia_id'].eq(10597.0)
matches_turbine_num = wind_turbine_df['project_turbine_num'].eq(236.0)
wind_turbine_df.loc[matches_eia_id & matches_turbine_num]

In [None]:
# number of distinct project_turbine_num values reported under each eia_id,
# highest first; ids without any non-null value (count 0) are left out
distinct_turbine_nums = wind_turbine_df.groupby('eia_id')['project_turbine_num'].nunique()
distinct_turbine_nums[distinct_turbine_nums > 0].sort_values(ascending=False)


In [None]:
# distinct project / turbine-count / capacity combinations recorded for
# eia_id 57833.0
summary_columns = [
    'eia_id',
    'project_name',
    'project_turbine_num',
    'cumulative_capacity (MW)',
    'turbine_capacity (KW)',
]
wind_turbine_df.loc[wind_turbine_df['eia_id'] == 57833.0, summary_columns].drop_duplicates()

In [None]:
# number of distinct cumulative_capacity (MW) values reported under each
# eia_id, highest first; ids without any non-null value (count 0) are left out
distinct_capacities = wind_turbine_df.groupby('eia_id')['cumulative_capacity (MW)'].nunique()
distinct_capacities[distinct_capacities > 0].sort_values(ascending=False)

In [None]:
# composite key "<eia_id>_<project_name>" for every row
eia_id_text = wind_turbine_df['eia_id'].astype(str)
wind_turbine_df['eia_id_project_name'] = eia_id_text + '_' + wind_turbine_df['project_name']

# keys that carry more than one distinct cumulative_capacity (MW), highest first
capacities_per_key = wind_turbine_df.groupby('eia_id_project_name')['cumulative_capacity (MW)'].nunique()
capacities_per_key[capacities_per_key > 1].sort_values(ascending=False)

In [None]:
# distinct records stored under the key '57526.0_South Hurlburt'
detail_columns = [
    'eia_id_project_name', 'eia_id', 'project_name', 'project_turbine_num',
    'cumulative_capacity (MW)', 'turbine_capacity (KW)', 'operational_year',
]
is_south_hurlburt = wind_turbine_df['eia_id_project_name'] == '57526.0_South Hurlburt'
wind_turbine_df.loc[is_south_hurlburt, detail_columns].drop_duplicates()

In [None]:
# Display the column labels currently present in the working frame.
wind_turbine_df.columns

In [None]:
# Show the distinct '57526.0_South Hurlburt' records, largest cumulative
# capacity first, without pandas hiding any column of the result.
pd.set_option('display.max_columns', None)

# 'retrofit (y/n)' and 'retrofit_year' are not among the columns kept by the
# column-selection cell near the top of the notebook, so requesting them
# unconditionally raises KeyError. Ask only for the columns that are present,
# which works whether or not the retrofit fields were retained.
wanted_columns = [
    'eia_id',
    'project_name',
    'project_turbine_num',
    'cumulative_capacity (MW)',
    'retrofit (y/n)',
    'retrofit_year',
    'eia_id_project_name',
]
available_columns = [col for col in wanted_columns if col in wind_turbine_df.columns]

wind_turbine_df.loc[
    wind_turbine_df['eia_id_project_name'] == '57526.0_South Hurlburt',
    available_columns,
].drop_duplicates().sort_values(by=['cumulative_capacity (MW)'], ascending=False)

In [None]:
# every column label of the working frame as a plain Python list
wind_turbine_df.columns.tolist()