## Import libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Initial EDA

In [79]:
# define file name
filename = r'2023 Dataset\uswtdb_v5_3_20230113_rename.csv'

# read csv file
wind_turbine_df = pd.read_csv(filename)

# get information about the dataframe
wind_turbine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72669 entries, 0 to 72668
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   case_id                             72669 non-null  int64  
 1   faa_digital_obstacle_id             67310 non-null  object 
 2   faa_obstruction_evaluation_id       67537 non-null  object 
 3   usgs_id                             37904 non-null  float64
 4   eia_id                              68347 non-null  float64
 5   state                               72669 non-null  object 
 6   county                              72669 non-null  object 
 7   state_county_id                     72669 non-null  int64  
 8   project_name                        72669 non-null  object 
 9   operational_year                    72113 non-null  float64
 10  project_turbine_num                 72669 non-null  int64  
 11  cumulative_capacity (MW)            69255

In [80]:
# check for missing values
wind_turbine_df.isnull().sum().sort_values(ascending=False)[lambda x: x > 0]

retrofit_year                    65958
usgs_id                          34765
location_verification_date        7959
faa_digital_obstacle_id           5359
faa_obstruction_evaluation_id     5132
max_height (m)                    4899
hub_height (m)                    4898
rotor_swept area (m2)             4840
rotor_diameter (m)                4840
model_name                        4710
manufacturer                      4573
turbine_capacity (KW)             4402
eia_id                            4322
cumulative_capacity (MW)          3414
operational_year                   556
dtype: int64

In [81]:
# check if the number of 0 is equal to the number of missing values in retrofit_year
# count number of 0 in retrofit (y/n) given that retrofit_year is null
wind_turbine_df[wind_turbine_df['retrofit_year'].isnull()]['retrofit (y/n)'].value_counts()\
== wind_turbine_df.isnull().sum().sort_values(ascending=False)[lambda x: x > 0][0]

0    True
Name: retrofit (y/n), dtype: bool

retrofit_year is empty as no retrofitting was done to the respective turbines

usgs_id probably not useful, defined as: "Unique identifier for cross-reference to the 2014 USGS turbine dataset" 

location_verification_date probably not useful, defined as: "Date of image used to visually verify turbine location. Note if source of image is NAIP, the month and day were set to 01/01." 

faa_digital_obstacle_id and faa_obstruction_evaluation_id probably not useful

In [82]:
# count the number of missing values in max_height (m) for every model_name
wind_turbine_df[wind_turbine_df['max_height (m)'].isnull()]['model_name'].value_counts(dropna=False)

NaN           4576
180            176
V17             49
NPS-100         24
M65             23
250KW            8
NW100            7
M Sojets         7
NM44             5
M108             3
GE 2.3-116       2
NM52             2
75/L7            2
E-3120           2
108              2
NM54             1
Micon            1
V7               1
M530             1
EW 1.5S          1
V34              1
60               1
120              1
Micon 100        1
V27              1
Zond             1
Name: model_name, dtype: int64

In [83]:
# create a list of model_name that has missing values in max_height (m)
model_name_list = wind_turbine_df[wind_turbine_df['max_height (m)'].isnull()]['model_name'].unique()

# check for non empty values in max_height (m) for every model_name in model_name_list
for model_name in model_name_list:
    non_empty_values = wind_turbine_df.loc[(wind_turbine_df['model_name'] == model_name) & 
                                           (wind_turbine_df['max_height (m)'].notnull())]

# print the number of rows in non_empty_values
print(non_empty_values.shape[0])

0


In [84]:
# count the number of missing values in hub_height (m) for every model_name
wind_turbine_df[wind_turbine_df['hub_height (m)'].isnull()]['model_name'].value_counts(dropna=False)

NaN             4576
180              176
V17               49
NPS-100           23
M65               23
250KW              8
M Sojets           7
NW100              7
NM44               5
M108               3
75/L7              2
E-3120             2
108                2
NM52               2
GE 2.3-116         2
V34                1
V27                1
Micon 100          1
120                1
60                 1
EW 1.5S            1
Silver Eagle       1
V7                 1
NM54               1
M530               1
Zond               1
Name: model_name, dtype: int64

In [85]:
# create a list of model_name that has missing values in hub_height (m)
model_name_list = wind_turbine_df[wind_turbine_df['hub_height (m)'].isnull()]['model_name'].unique()

# check for non empty values in hub_height (m) for every model_name in model_name_list
for model_name in model_name_list:
    non_empty_values = wind_turbine_df.loc[(wind_turbine_df['model_name'] == model_name) & 
                                           (wind_turbine_df['hub_height (m)'].notnull())]

# print the number of rows in non_empty_values
print(non_empty_values.shape[0])

0


In [86]:
# count the number of missing values in model_name for every manufacturer
wind_turbine_df[wind_turbine_df['model_name'].isnull()]['manufacturer'].value_counts(dropna=False)

NaN          4563
Bonus         132
GE Wind         5
Vestas          3
Endurance       2
Bora            2
Entegrity       1
NEG Micon       1
Seaforth        1
Name: manufacturer, dtype: int64

In [87]:
# create a list of manufacturer_name that has missing values in model_name
manufacturer_name_list = wind_turbine_df[wind_turbine_df['model_name'].isnull()]['manufacturer'].unique()

# check for non empty values in model_name for every manufacturer in manufacturer_name_list
for manufacturer_name in manufacturer_name_list:
    non_empty_values = wind_turbine_df.loc[(wind_turbine_df['manufacturer'] == manufacturer_name) & 
                                           (wind_turbine_df['model_name'].notnull())]
    
# print the number of rows in non_empty_values
print(non_empty_values.shape[0])

11


In [76]:
# print non_empty_values
non_empty_values[['manufacturer', 'model_name']]

Unnamed: 0,manufacturer,model_name
32592,Seaforth,AOC15/65
32593,Seaforth,AOC15/65
32594,Seaforth,AOC15/65
32595,Seaforth,AOC15/65
32596,Seaforth,AOC15/65
32597,Seaforth,AOC15/65
32603,Seaforth,AOC15/65
32604,Seaforth,AOC15/65
32605,Seaforth,AOC15/65
71016,Seaforth,AOC15/65


In [88]:
# print unique values for model_name for manufacturer = 'Seaforth'
wind_turbine_df.loc[wind_turbine_df['manufacturer'] == 'Seaforth']['model_name'].value_counts(dropna=False)

AOC15/65    11
NaN          1
Name: model_name, dtype: int64

In [111]:
# replace missing values in model_name with the value in manufacturer = 'Seaforth' with model_name = 'AOC15/65'
wind_turbine_df.loc[(wind_turbine_df['manufacturer'] == 'Seaforth') &
                    (wind_turbine_df['model_name'].isnull()), 'model_name'] = 'AOC15/65'

# check the change in manufacturer = 'Seaforth'
wind_turbine_df.loc[wind_turbine_df['manufacturer'] == 'Seaforth']['model_name'].value_counts(dropna=False)

AOC15/65    12
Name: model_name, dtype: int64

In [78]:
# count the number of missing values in manufacturer for every model_name
wind_turbine_df[wind_turbine_df['manufacturer'].isnull()]['model_name'].value_counts(dropna=False)

NaN               4563
Danwind 23/160      10
Name: model_name, dtype: int64

In [106]:
# count the number of 'Danwind 23/160' owned by each manufacturer
wind_turbine_df[wind_turbine_df['model_name'] == 'Danwind 23/160']['manufacturer'].value_counts(dropna=False)

NaN    10
Name: manufacturer, dtype: int64

In [107]:
# count the number of missing values in turbine_capcity (KW) for every model
wind_turbine_df[wind_turbine_df['turbine_capacity (KW)'].isnull()]['model_name'].value_counts(dropna=False)

NaN      4401
NW100       1
Name: model_name, dtype: int64

In [108]:
# show the turbine_capacity (KW) for model_name = 'NW100'
wind_turbine_df[wind_turbine_df['model_name'] == 'NW100']['turbine_capacity (KW)'].value_counts(dropna=False)

100.0    58
NaN       1
Name: turbine_capacity (KW), dtype: int64

In [110]:
# replace the missing values in turbine_capacity (KW) with the value in model_name = 'NW100' with turbine_capacity (KW) = 100
wind_turbine_df.loc[(wind_turbine_df['model_name'] == 'NW100') &
                    (wind_turbine_df['turbine_capacity (KW)'].isnull()), 'turbine_capacity (KW)'] = 100

# check the change in turbine_capacity (KW) for model_name = 'NW100'
wind_turbine_df[wind_turbine_df['model_name'] == 'NW100']['turbine_capacity (KW)'].value_counts(dropna=False)

100.0    59
Name: turbine_capacity (KW), dtype: int64

eia_id probably not useful, defined as: "Plant ID from Energy Information Administration (EIA)" 

In [112]:
# obtain value counts for project_name
wind_turbine_df['project_name'].value_counts(dropna=False)

unknown Tehachapi Wind Resource Area 1    713
Stateline Wind Project                    454
Mesa Wind Farm                            432
Western Spirit                            377
Traverse Wind                             323
                                         ... 
Gaines Cavern Wind Project                  1
Mount St. Mary's Abbey                      1
GobNob                                      1
Golden Acorn Casino                         1
Scituate Wind                               1
Name: project_name, Length: 1736, dtype: int64

In [117]:
# check if the cumulative_capacity (KW) is equal across the same project_name
wind_turbine_df.groupby('project_name')['cumulative_capacity (MW)'].nunique().sort_values(ascending=False)[lambda x: x > 0]

project_name
McNeilus                4
South Hurlburt          4
Wilton Wind Farm        4
Crow Lake               3
Shepherds Flat          3
                       ..
Green Mountain Power    1
Gratiot County          1
Gratiot                 1
Grant Wind              1
Grant Plains            1
Name: cumulative_capacity (MW), Length: 1548, dtype: int64

In [122]:
# show the unique values for cumulative_capacity (MW) for project_name = 'McNeilus'
pd.DataFrame(wind_turbine_df[wind_turbine_df['project_name'] == 'McNeilus']['cumulative_capacity (MW)'].unique(), columns=['cumulative_capacity (MW)'])

Unnamed: 0,cumulative_capacity (MW)
0,9.0
1,22.8
2,9.9
3,1.65


In [154]:
# check if the turbine_num is equal across the same project_name
wind_turbine_df.groupby('project_name')['project_turbine_num'].nunique().sort_values(ascending=False)[lambda x: x > 0]

project_name
San Gorgonio Farms Wind Farm    5
McNeilus                        4
Twin Groves II                  3
Mars Hill                       3
Seven Mile Hill I & II          3
                               ..
Hidalgo                         1
Heritage Garden                 1
Heritage Flight                 1
Hereford 1                      1
unknown Yuma County             1
Name: project_turbine_num, Length: 1736, dtype: int64

In [125]:
# show the unique values for turbine number for project_name = 'McNeilus'
pd.DataFrame(wind_turbine_df[wind_turbine_df['project_name'] == 'McNeilus']['project_turbine_num'].unique(), columns=['project_turbine_num'])

Unnamed: 0,project_turbine_num
0,6
1,24
2,10
3,1


In [152]:
# show unique values for project_name equals to 'McNeilus'
wind_turbine_df[wind_turbine_df['project_name'] == 'McNeilus']\
[['eia_id','project_name', 'project_turbine_num', 'cumulative_capacity (MW)', 'turbine_capacity (KW)']]\
.drop_duplicates()

Unnamed: 0,eia_id,project_name,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW)
38306,56054.0,McNeilus,6,9.0,1500.0
38307,56054.0,McNeilus,24,22.8,950.0
38310,56307.0,McNeilus,6,9.9,1650.0
38317,56054.0,McNeilus,10,9.0,900.0
38319,56054.0,McNeilus,1,1.65,1650.0


In [155]:
# check if the project_turbine_num equals to the count of each eia_id
wind_turbine_df.groupby('eia_id')['project_turbine_num'].nunique()[lambda x: x > 0].sort_values(ascending=False)


eia_id
57833.0    6
57294.0    5
56608.0    4
7526.0     4
57291.0    4
          ..
57214.0    1
57239.0    1
57240.0    1
57244.0    1
65525.0    1
Name: project_turbine_num, Length: 1255, dtype: int64

In [149]:
# show the unique values for turbine number for eia_id = 57833.0
wind_turbine_df[wind_turbine_df['eia_id'] == 57833.0]\
    [['eia_id','project_name', 'project_turbine_num', 'cumulative_capacity (MW)', 'turbine_capacity (KW)']]\
    .drop_duplicates()

Unnamed: 0,eia_id,project_name,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW)
394,57833.0,Alta II,50,150.0,3000.0
487,57833.0,Alta IV,34,102.0,3000.0
556,57833.0,Alta V,56,168.0,3000.0
602,57833.0,Alta VI (partial),44,132.0,3000.0
645,57833.0,Alta VIII (partial),43,129.0,3000.0
42034,57833.0,Mustang Hills,6,18.0,3000.0


In [157]:
# check if the cumulative_capacity (MW) equals to the count of each eia_id
wind_turbine_df.groupby('eia_id')['cumulative_capacity (MW)'].nunique()[lambda x: x > 0].sort_values(ascending=False)

eia_id
57833.0    6
57294.0    5
6304.0     4
57291.0    4
56357.0    4
          ..
57239.0    1
57240.0    1
57244.0    1
57251.0    1
65525.0    1
Name: cumulative_capacity (MW), Length: 1254, dtype: int64

In [172]:
# concatenate the eia_id and project_name
wind_turbine_df['eia_id_project_name'] = wind_turbine_df['eia_id'].astype(str) + '_' + wind_turbine_df['project_name'] #+ '_' + wind_turbine_df['location_verification_date'].astype(str)

# show the unique values for eia_id_project_name
wind_turbine_df.groupby('eia_id_project_name')['cumulative_capacity (MW)'].nunique()[lambda x: x > 1].sort_values(ascending=False)

eia_id_project_name
57526.0_South Hurlburt                  4
56357.0_Wilton Wind Farm                4
56252.0_Century Expansion               3
56925.0_Crystal Lake II                 3
56843.0_Seven Mile Hill I & II          3
                                       ..
57195.0_Lower Snake River Phase I       2
50281.0_San Gorgonio Farms Wind Farm    2
57296.0_Carleton College                2
57332.0_Crossroads                      2
nan_unknown Riverside County            2
Name: cumulative_capacity (MW), Length: 66, dtype: int64

In [173]:
# show the unique values for eia_id_project_name equals to '56357.0_Wilton Wind Farm'
wind_turbine_df[wind_turbine_df['eia_id_project_name'] == '57526.0_South Hurlburt']\
[['eia_id_project_name','eia_id','project_name', 'project_turbine_num', 'cumulative_capacity (MW)', 'turbine_capacity (KW)', 'operational_year']]\
.drop_duplicates()

Unnamed: 0,eia_id_project_name,eia_id,project_name,project_turbine_num,cumulative_capacity (MW),turbine_capacity (KW),operational_year
58696,57526.0_South Hurlburt,57526.0,South Hurlburt,82,224.75,2750.0,2011.0
58697,57526.0_South Hurlburt,57526.0,South Hurlburt,23,63.25,2750.0,2012.0
58699,57526.0_South Hurlburt,57526.0,South Hurlburt,82,224.75,2500.0,2011.0
58757,57526.0_South Hurlburt,57526.0,South Hurlburt,116,290.0,2500.0,2012.0
58803,57526.0_South Hurlburt,57526.0,South Hurlburt,82,205.0,2500.0,2011.0


In [167]:
wind_turbine_df.columns

Index(['case_id', 'faa_digital_obstacle_id', 'faa_obstruction_evaluation_id',
       'usgs_id', 'eia_id', 'state', 'county', 'state_county_id',
       'project_name', 'operational_year', 'project_turbine_num',
       'cumulative_capacity (MW)', 'manufacturer', 'model_name',
       'turbine_capacity (KW)', 'hub_height (m)', 'rotor_diameter (m)',
       'rotor_swept area (m2)', 'max_height (m)', 'retrofit (y/n)',
       'retrofit_year', 'attribute_confidence_level (1/2/3)',
       'location_confidence_level (1/2/3)', 'location_verification_date',
       'image_source', 'xlong', 'ylat', 'eia_id_project_name'],
      dtype='object')

In [197]:
pd.set_option('display.max_columns', None)
wind_turbine_df[wind_turbine_df['eia_id_project_name'] == '57526.0_South Hurlburt']\
[['eia_id',
 'project_name',
 'project_turbine_num',
 'cumulative_capacity (MW)',
 'retrofit (y/n)',
 'retrofit_year',
 'eia_id_project_name']].drop_duplicates().sort_values(by=['cumulative_capacity (MW)'], ascending=False)

Unnamed: 0,eia_id,project_name,project_turbine_num,cumulative_capacity (MW),retrofit (y/n),retrofit_year,eia_id_project_name
58757,57526.0,South Hurlburt,116,290.0,0,,57526.0_South Hurlburt
58696,57526.0,South Hurlburt,82,224.75,1,2021.0,57526.0_South Hurlburt
58803,57526.0,South Hurlburt,82,205.0,1,2021.0,57526.0_South Hurlburt
58697,57526.0,South Hurlburt,23,63.25,1,2021.0,57526.0_South Hurlburt


In [176]:
# list of all the columns
list(wind_turbine_df.columns)

['case_id',
 'faa_digital_obstacle_id',
 'faa_obstruction_evaluation_id',
 'usgs_id',
 'eia_id',
 'state',
 'county',
 'state_county_id',
 'project_name',
 'operational_year',
 'project_turbine_num',
 'cumulative_capacity (MW)',
 'manufacturer',
 'model_name',
 'turbine_capacity (KW)',
 'hub_height (m)',
 'rotor_diameter (m)',
 'rotor_swept area (m2)',
 'max_height (m)',
 'retrofit (y/n)',
 'retrofit_year',
 'attribute_confidence_level (1/2/3)',
 'location_confidence_level (1/2/3)',
 'location_verification_date',
 'image_source',
 'xlong',
 'ylat',
 'eia_id_project_name']