## Import libraries

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preprocessing

In [3]:
# Path of the renamed uswtdb CSV, relative to the notebook's working directory.
# Forward slashes are accepted on Windows, macOS and Linux alike, whereas a
# backslash separator is only treated as a directory separator on Windows.
# Kept as a plain str because the save step derives the output name from it
# with str.replace.
filename = 'Use this dataset/uswtdb_v5_0_20220427_rename.csv'

# Read the file in a single pass (low_memory=False) so that each column's dtype
# is inferred from all of its values at once instead of chunk by chunk; chunked
# inference is what produces pandas' mixed-type DtypeWarning on read.
wind_turbine_df = pd.read_csv(filename, low_memory=False)

# (number of rows, number of columns) of the frame as loaded
wind_turbine_df.shape

  wind_turbine_df = pd.read_csv(filename)


(71784, 124)

In [4]:
# Column labels of the loaded frame (pandas abbreviates long listings with '...')
column_labels = wind_turbine_df.columns
column_labels

Index(['Plant Id', 'Combined Heat And\nPower Plant', 'Nuclear Unit Id',
       'Plant Name', 'Operator Name', 'Operator Id', 'Plant State',
       'Census Region', 'NERC Region', 'Reserved',
       ...
       'rotor_swept area (m2)', 'max_height (m)', 'retrofit (y/n)',
       'retrofit_year', 'attribute_confidence_level (1/2/3)',
       'location_confidence_level (1/2/3)', 'location_verification_date',
       'image_source', 'xlong', 'ylat'],
      dtype='object', length=124)

In [5]:
# Keep only the columns needed downstream: the plant/project identifiers, the
# turbine size attributes and the coordinates. Everything else (FAA/USGS ids,
# state and county, operational year, project turbine count, cumulative
# capacity, manufacturer and model, retrofit fields, confidence levels,
# verification date, image source, ...) is left out.
columns_to_keep = [
    'Plant Id', 'eia_id',
    'project_name',
    'turbine_capacity (KW)',
    'hub_height (m)',
    'rotor_diameter (m)', 'rotor_swept area (m2)', 'max_height (m)',
    'xlong', 'ylat',
]
wind_turbine_df = wind_turbine_df[columns_to_keep]

# Number of rows that exactly repeat an earlier row
wind_turbine_df.duplicated().sum()

1

In [6]:
# Display the rows flagged as repeats of an earlier row
duplicate_mask = wind_turbine_df.duplicated()
wind_turbine_df[duplicate_mask]

Unnamed: 0,Plant Id,eia_id,project_name,turbine_capacity (KW),hub_height (m),rotor_diameter (m),rotor_swept area (m2),max_height (m),xlong,ylat
63989,99999.0,,,,,,,,,


In [7]:
# Remove every row whose Plant Id equals 99999.0. Rows with a missing Plant Id
# compare as "not equal" and are therefore kept by this step.
is_plant_99999 = wind_turbine_df['Plant Id'] == 99999.0
wind_turbine_df = wind_turbine_df[~is_plant_99999]

# Confirm how many duplicated rows remain
wind_turbine_df.duplicated().sum()

0

In [8]:
# Missing-value count per column, restricted to columns that have at least one
# missing value, largest count first
null_counts = wind_turbine_df.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

Plant Id                 7794
eia_id                   6205
max_height (m)           5139
hub_height (m)           5138
rotor_diameter (m)       4893
rotor_swept area (m2)    4893
turbine_capacity (KW)    4439
project_name              116
xlong                     116
ylat                      116
dtype: int64

In [9]:
# Keep only the rows that have a Plant Id
wind_turbine_df = wind_turbine_df.dropna(subset=['Plant Id'])

# Re-count missing values per column (columns with none are omitted),
# largest count first
remaining_nulls = wind_turbine_df.isnull().sum()
remaining_nulls[remaining_nulls > 0].sort_values(ascending=False)

max_height (m)           1507
hub_height (m)           1506
rotor_diameter (m)       1270
rotor_swept area (m2)    1270
turbine_capacity (KW)     871
eia_id                    116
project_name              116
xlong                     116
ylat                      116
dtype: int64

In [10]:
# Display the rows in which eia_id is missing
eia_id_missing = wind_turbine_df['eia_id'].isnull()
wind_turbine_df.loc[eia_id_missing]

Unnamed: 0,Plant Id,eia_id,project_name,turbine_capacity (KW),hub_height (m),rotor_diameter (m),rotor_swept area (m2),max_height (m),xlong,ylat
0,1.0,,,,,,,,,
1433,52142.0,,,,,,,,,
1506,54299.0,,,,,,,,,
1507,54300.0,,,,,,,,,
1547,54647.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
63983,65059.0,,,,,,,,,
63984,65143.0,,,,,,,,,
63985,65478.0,,,,,,,,,
63986,65479.0,,,,,,,,,


In [11]:
# Keep only the rows that have an eia_id
wind_turbine_df = wind_turbine_df.dropna(subset=['eia_id'])

# Re-count missing values per column (columns with none are omitted),
# largest count first
remaining_nulls = wind_turbine_df.isnull().sum()
remaining_nulls[remaining_nulls > 0].sort_values(ascending=False)

max_height (m)           1391
hub_height (m)           1390
rotor_diameter (m)       1154
rotor_swept area (m2)    1154
turbine_capacity (KW)     755
dtype: int64

In [12]:
# List the rows where Plant Id and eia_id disagree; an empty result means the
# two columns hold the same value on every row.
ids_differ = wind_turbine_df['Plant Id'] != wind_turbine_df['eia_id']
wind_turbine_df.loc[ids_differ]

Unnamed: 0,Plant Id,eia_id,project_name,turbine_capacity (KW),hub_height (m),rotor_diameter (m),rotor_swept area (m2),max_height (m),xlong,ylat


In [14]:
# Set turbine_capacity (KW) = 2500 for project_name = "Lake Region Community Hybrid, LLC".
# Sources cited for the project details:
# https://www.albertleatribune.com/2020/08/county-board-gives-go-ahead-for-new-wind-turbine-project/ - proj details
# https://eerscmap.usgs.gov/uswtdb/viewer/#10.02/43.6746/-93.3486
# Only the capacity is written here; operational_year is not among the columns
# kept by the column-selection step, so it is not set.
lake_region_rows = wind_turbine_df['project_name'] == 'Lake Region Community Hybrid, LLC'

# Assign a scalar rather than a one-element list: a scalar is broadcast to
# however many rows match, while a list has to match the number of selected
# rows exactly.
wind_turbine_df.loc[lake_region_rows, 'turbine_capacity (KW)'] = 2500

# Display the affected row(s) to confirm the update
wind_turbine_df[lake_region_rows]

Unnamed: 0,Plant Id,eia_id,project_name,turbine_capacity (KW),hub_height (m),rotor_diameter (m),rotor_swept area (m2),max_height (m),xlong,ylat
57401,62442.0,62442.0,"Lake Region Community Hybrid, LLC",2500.0,,,,,-93.307907,43.580891


In [109]:
# Save the cleaned frame as CSV (without the index), deriving the output name
# from the input name by replacing '_rename.csv' with '_proj_capacity.csv'.
output_filename = filename.replace('_rename.csv', '_proj_capacity.csv')

# str.replace returns the name unchanged when '_rename.csv' is absent, in which
# case to_csv would overwrite the source dataset; stop instead of clobbering it.
if output_filename == filename:
    raise ValueError(
        f"Expected {filename!r} to contain '_rename.csv'; refusing to overwrite the input file."
    )

wind_turbine_df.to_csv(output_filename, index=False)