## Import libraries

In [1]:
from pathlib import Path

import pandas as pd

## Change wind turbine dataset column names

In [5]:
# location of the wind turbine dataset (csv)
wind_turbine_filename = r'Use this dataset/uswtdb_v5_0_20220427.csv'

# load the csv into a dataframe, decoding the file as Latin-1
wind_turbine_df = pd.read_csv(
    wind_turbine_filename,
    encoding='Latin-1',
)

# preview the first 5 rows of the wind turbine dataset
wind_turbine_df.head(5)

Unnamed: 0,case_id,faa_ors,faa_asn,usgs_pr_id,eia_id,t_state,t_county,t_fips,p_name,p_year,...,t_rsa,t_ttlh,retrofit,retrofit_year,t_conf_atr,t_conf_loc,t_img_date,t_img_srce,xlong,ylat
0,3072695,,,5143.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.36441,35.077435
1,3072704,,,5146.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.364197,35.077644
2,3072661,,,5149.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.363762,35.077908
3,3063269,19-028130,2016-WTE-5934-OE,,,IA,Story County,19169,30 MW Iowa DG Portfolio,2017.0,...,12271.85,150.0,0,,3,3,7/23/2017,Digital Globe,-93.632835,41.882477
4,3063321,19-028135,2014-WTE-4087-OE,,,IA,Hardin County,19083,30 MW Iowa DG Portfolio,2017.0,...,12271.85,150.0,0,,3,3,6/20/2017,Digital Globe,-93.367798,42.49794


In [8]:
# dimensions of the wind turbine dataset as (number of rows, number of columns)
wind_turbine_df.shape

(71666, 27)

In [10]:
# location of the data dictionary workbook (xlsx)
datadictionary_filename = 'data_documentation/windturbine_datadictionary_updated.xlsx'

# load the data dictionary into a dataframe
datadictionary_df = pd.read_excel(datadictionary_filename)

# preview the first 5 rows of the data dictionary
datadictionary_df.head(5)

Unnamed: 0,Key,Value Type,Key Description,Column Name,url
0,case_id,number (integer),Unique stable identification number.,case_id,
1,faa_ors,string,Unique identifier for cross-reference to the F...,faa_digital_obstacle_id,https://www.faa.gov/air_traffic/flight_info/ae...
2,faa_asn,string,Unique identifier for cross-reference to the F...,faa_obstruction_evaluation_id,https://www.faa.gov/air_traffic/obstruction_ev...
3,usgs_pr_id,number (integer),Unique identifier for cross-reference to the 2...,usgs_id,
4,t_state,string,State where turbine is located.,state,


In [11]:
# build the old-name -> new-name lookup from the data dictionary:
# index = 'Key' (current column name), values = 'Column Name' (descriptive name).
# set_index returns a new frame, so datadictionary_df itself is never reassigned and
# the cell stays safely re-runnable even if a later statement fails
column_name_map = datadictionary_df.set_index('Key')['Column Name']

# rename returns a new dataframe; columns without an entry in the lookup keep their original name
wind_turbine_df_rename = wind_turbine_df.rename(columns=column_name_map)

# print first 5 rows of the renamed wind turbine dataset
wind_turbine_df_rename.head()

Unnamed: 0,case_id,faa_digital_obstacle_id,faa_obstruction_evaluation_id,usgs_id,eia_id,state,county,state_county_id,project_name,operational_year,...,rotor_swept area (m2),max_height (m),retrofit (y/n),retrofit_year,attribute_confidence_level (1/2/3),location_confidence_level (1/2/3),location_verification_date,image_source,xlong,ylat
0,3072695,,,5143.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.36441,35.077435
1,3072704,,,5146.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.364197,35.077644
2,3072661,,,5149.0,52161.0,CA,Kern County,6029,251 Wind,1987.0,...,,,0,,2,3,5/8/2018,Digital Globe,-118.363762,35.077908
3,3063269,19-028130,2016-WTE-5934-OE,,,IA,Story County,19169,30 MW Iowa DG Portfolio,2017.0,...,12271.85,150.0,0,,3,3,7/23/2017,Digital Globe,-93.632835,41.882477
4,3063321,19-028135,2014-WTE-4087-OE,,,IA,Hardin County,19083,30 MW Iowa DG Portfolio,2017.0,...,12271.85,150.0,0,,3,3,6/20/2017,Digital Globe,-93.367798,42.49794


## Validate new column names

In [12]:
# lookup from each 'Key' in the data dictionary to its 'Column Name'
key_column = datadictionary_df.set_index('Key')['Column Name']

# comparison table with one row per column: original name next to the renamed name
rename_validation = pd.DataFrame({
    'wind_turbine_df': wind_turbine_df.columns,
    'wind_turbine_df_rename': wind_turbine_df_rename.columns,
})

# name the data dictionary assigns to each original column name
rename_validation['datadictionary'] = rename_validation['wind_turbine_df'].map(key_column)

# True where the renamed column carries the name given by the data dictionary
rename_validation['check'] = (
    rename_validation['datadictionary'] == rename_validation['wind_turbine_df_rename']
)

# report how many column names match / do not match the data dictionary
print(
    f'Total number of columns: {len(rename_validation)}\n'
    f'Number of columns equal: {int(rename_validation["check"].sum())}\n'
    f'Number of columns not equal: {int((~rename_validation["check"]).sum())}'
)

Total number of columns: 27
Number of columns equal: 27
Number of columns not equal: 0


## Deal with missing data

In [15]:
# number of missing values in each column of the wind turbine dataset
wind_turbine_df.isna().sum()

case_id              0
faa_ors           5501
faa_asn           5161
usgs_pr_id       33437
eia_id            6089
t_state              0
t_county             0
t_fips               0
p_name               0
p_year             563
p_tnum               0
p_cap             3335
t_manu            4493
t_model           4630
t_cap             4323
t_hh              5022
t_rd              4777
t_rsa             4777
t_ttlh            5023
retrofit             0
retrofit_year    65680
t_conf_atr           0
t_conf_loc           0
t_img_date        8294
t_img_srce           0
xlong                0
ylat                 0
dtype: int64

In [20]:
# percentage of rows with a missing eia_id, i.e. the share of the dataset that would be
# lost if those rows were dropped; the count is computed from the data instead of being
# copied by hand from an earlier output, so it stays correct when the dataset changes
int(wind_turbine_df['eia_id'].isnull().sum()) / len(wind_turbine_df) * 100

8.496358105656796

In [18]:
# keep only the rows whose eia_id is missing, then count the missing values
# per column within that subset
wind_turbine_df.loc[wind_turbine_df['eia_id'].isna()].isna().sum()

case_id             0
faa_ors          2203
faa_asn          1354
usgs_pr_id       4691
eia_id           6089
t_state             0
t_county            0
t_fips              0
p_name              0
p_year            561
p_tnum              0
p_cap            3332
t_manu           3404
t_model          3538
t_cap            3394
t_hh             3453
t_rd             3446
t_rsa            3446
t_ttlh           3453
retrofit            0
retrofit_year    6087
t_conf_atr          0
t_conf_loc          0
t_img_date         40
t_img_srce          0
xlong               0
ylat                0
dtype: int64

## Save to csv file

In [13]:
# save the renamed wind turbine dataset next to the source file as '<stem>_rename.csv'.
# the output name is derived with pathlib instead of str.split('.'), which would cut the
# path at the first dot anywhere in it (e.g. './data/file.csv' or 'uswtdb_v5.0.csv')
source_path = Path(wind_turbine_filename)
wind_turbine_df_rename.to_csv(source_path.with_name(source_path.stem + '_rename.csv'), index=False)