In [1]:
from pathlib import Path

import pandas as pd

# Folder holding the three per-country CSV exports. It is defined once so the
# notebook can be pointed at another copy of the data by editing a single line
# instead of every read_csv call.
DATA_DIR = Path('C:\\Users\\shahr\\Downloads')

# Load data from CSV files (one frame per country)
df_usa = pd.read_csv(DATA_DIR / 'database_USA.csv')
df_india = pd.read_csv(DATA_DIR / 'database_IND (1).csv')
df_aus = pd.read_csv(DATA_DIR / 'database_AUS.csv')


In [2]:
# Stack the three per-country frames into a single DataFrame; ignore_index
# gives the result a fresh 0..n-1 row index instead of repeating each file's own.
df = pd.concat(
    objs=[df_usa, df_india, df_aus],
    ignore_index=True,
)


In [3]:
# Preview the top five rows of the combined DataFrame
print(df.head(n=5))


  country              country_long                            name  \
0     USA  United States of America  10 Briggs Solar NG  LLC (East)   
1     USA  United States of America      100 Brook Hill Drive Solar   
2     USA  United States of America      1001 Ebenezer Church Solar   
3     USA  United States of America             1008 Matthews Solar   
4     USA  United States of America               1009 Yadkin Solar   

    gppd_idnr  capacity_mw  latitude  longitude primary_fuel other_fuel1  \
0  USA0062781          0.0   41.6327   -71.4963          NaN         NaN   
1  USA0063292          2.0   41.0930   -73.9828        Solar         NaN   
2  USA0063444          0.0   36.2838   -80.8434          NaN         NaN   
3  USA0063447          0.0   36.2167   -80.5525          NaN         NaN   
4  USA0063445          0.0   36.1475   -80.6561          NaN         NaN   

  other_fuel2  ... year_of_capacity_data  generation_gwh_2013  \
0         NaN  ...                2019.0           

In [4]:
# Quick data-quality overview, printed one report after another:
#   1. number of missing values in each column
#   2. dtype of each column
#   3. statistical summary from describe()
print(
    df.isna().sum(),
    df.dtypes,
    df.describe(),
    sep='\n',
)


country                         0
country_long                    0
name                            0
gppd_idnr                       0
capacity_mw                     6
latitude                       78
longitude                      78
primary_fuel                 2004
other_fuel1                 11670
other_fuel2                 13028
other_fuel3                 13206
commissioning_year           2644
owner                         593
source                          0
url                             0
geolocation_source             23
wepp_id                     13298
year_of_capacity_data         438
generation_gwh_2013          6128
generation_gwh_2014          5764
generation_gwh_2015          5305
generation_gwh_2016          4597
generation_gwh_2017          4006
generation_gwh_2018          3440
generation_gwh_2019          3520
generation_data_source       2463
estimated_generation_gwh    13298
dtype: int64
country                      object
country_long                 obje

In [5]:
# Define target variables: the two columns this notebook sets out to predict.
# NOTE(review): these Series are taken before the imputation / row-dropping
# cells that follow, so they presumably keep rows that are later removed from
# df -- confirm before pairing them with the cleaned frame.
target_primary_fuel, target_capacity_mw = df['primary_fuel'], df['capacity_mw']


In [6]:
# Example: Handling missing values by imputing with median.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df['capacity_mw']: that form acts on an intermediate
# Series, which pandas reports as chained assignment (FutureWarning in 2.x)
# and which no longer updates df once Copy-on-Write is in effect.
df['capacity_mw'] = df['capacity_mw'].fillna(df['capacity_mw'].median())


In [7]:
# Example: Remove rows with missing primary fuel information.
# inplace=True mutates df itself (nothing is returned), so the dropped rows
# are only recoverable by re-running the loading cells.
df.dropna(subset=['primary_fuel'], inplace=True)


In [8]:
# Example: derive an 'age' feature as the number of years between the
# reference year below and each plant's commissioning year. Rows with no
# commissioning year get a missing age.
current_year = 2024  # Update with current year
df['age'] = df['commissioning_year'].rsub(current_year)  # current_year - commissioning_year


In [9]:
# Example: keep only the columns considered relevant; df is narrowed to them.
# NOTE(review): 'age', added to df in the preceding cell, is not listed here
# and is therefore discarded -- confirm that is intended.
features = [
    'country',
    'capacity_mw',
    'primary_fuel',
    'commissioning_year',
    'owner',
]
df = df[features]


In [10]:
# Example: one-hot encode 'country' and 'primary_fuel', replacing each with
# one indicator column per category (e.g. country_USA, primary_fuel_Coal)
df = pd.get_dummies(data=df, columns=['country', 'primary_fuel'])


In [12]:
# List the column labels of df after the one-hot encoding step
print(df.columns)


Index(['capacity_mw', 'commissioning_year', 'owner', 'country_AUS',
       'country_IND', 'country_USA', 'primary_fuel_Biomass',
       'primary_fuel_Coal', 'primary_fuel_Cogeneration', 'primary_fuel_Gas',
       'primary_fuel_Geothermal', 'primary_fuel_Hydro', 'primary_fuel_Nuclear',
       'primary_fuel_Oil', 'primary_fuel_Other', 'primary_fuel_Petcoke',
       'primary_fuel_Solar', 'primary_fuel_Storage', 'primary_fuel_Waste',
       'primary_fuel_Wind'],
      dtype='object')


In [13]:
# Example: re-read the three source CSVs and rebuild the combined frame
# (if needed), replacing the df that the feature-selection and encoding cells
# above overwrote.
df_usa, df_india, df_aus = (
    pd.read_csv(csv_path)
    for csv_path in (
        'C:\\Users\\shahr\\Downloads\\database_USA.csv',
        'C:\\Users\\shahr\\Downloads\\database_IND (1).csv',
        'C:\\Users\\shahr\\Downloads\\database_AUS.csv',
    )
)

df = pd.concat(objs=[df_usa, df_india, df_aus], ignore_index=True)


In [14]:
# Example: one-hot encode 'country' and 'owner' into a new frame (df_encoded);
# df itself is left as loaded.
df_encoded = pd.get_dummies(data=df, columns=['country', 'owner'])


In [15]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column of df_encoded except the two prediction targets.
X = df_encoded.drop(columns=['primary_fuel', 'capacity_mw'])  # Features
y_primary_fuel = df_encoded['primary_fuel']  # Target: Primary Fuel
y_capacity_mw = df_encoded['capacity_mw']  # Target: Capacity (MW)

# df_encoded is derived from the freshly re-read CSVs, so both target columns
# still contain missing values. A row without a label cannot be used to fit or
# score a supervised model, so each task is split on its labelled rows only.
has_primary_fuel = y_primary_fuel.notna()
has_capacity_mw = y_capacity_mw.notna()

# Split data for Primary Fuel prediction (80% train / 20% test, fixed seed)
X_train_pf, X_test_pf, y_train_pf, y_test_pf = train_test_split(
    X.loc[has_primary_fuel],
    y_primary_fuel.loc[has_primary_fuel],
    test_size=0.2,
    random_state=42,
)

# Split data for Capacity (MW) prediction (80% train / 20% test, fixed seed)
X_train_cmw, X_test_cmw, y_train_cmw, y_test_cmw = train_test_split(
    X.loc[has_capacity_mw],
    y_capacity_mw.loc[has_capacity_mw],
    test_size=0.2,
    random_state=42,
)


In [19]:
from sklearn.model_selection import train_test_split

# Assuming df_encoded is your DataFrame with encoded features and targets.
# Feature matrix: every column of df_encoded except the two prediction targets.
X = df_encoded.drop(columns=['primary_fuel', 'capacity_mw'])  # Features
y_primary_fuel = df_encoded['primary_fuel']  # Target: Primary Fuel
y_capacity_mw = df_encoded['capacity_mw']  # Target: Capacity (MW)

# df_encoded is derived from the freshly re-read CSVs, so both target columns
# still contain missing values. A row without a label cannot be used to fit or
# score a supervised model, so each task is split on its labelled rows only.
has_primary_fuel = y_primary_fuel.notna()
has_capacity_mw = y_capacity_mw.notna()

# Split data for Primary Fuel prediction (80% train / 20% test, fixed seed)
X_train_pf, X_test_pf, y_train_pf, y_test_pf = train_test_split(
    X.loc[has_primary_fuel],
    y_primary_fuel.loc[has_primary_fuel],
    test_size=0.2,
    random_state=42,
)

# Split data for Capacity (MW) prediction (80% train / 20% test, fixed seed)
X_train_cmw, X_test_cmw, y_train_cmw, y_test_cmw = train_test_split(
    X.loc[has_capacity_mw],
    y_capacity_mw.loc[has_capacity_mw],
    test_size=0.2,
    random_state=42,
)


In [22]:
# Show the column labels of each of the four feature splits, one per line
# group, in the order: primary-fuel train/test, then capacity train/test.
print(
    X_train_pf.columns,
    X_test_pf.columns,
    X_train_cmw.columns,
    X_test_cmw.columns,
    sep='\n',
)


Index(['country_long', 'name', 'gppd_idnr', 'latitude', 'longitude',
       'other_fuel1', 'other_fuel2', 'other_fuel3', 'commissioning_year',
       'source',
       ...
       'owner_Zamil New Delhi Infrastructure Private Limited',
       'owner_Zapco Energy Tactics Corp', 'owner_Zeeland Farm Services',
       'owner_Zero Waste Energy Development Company LLC',
       'owner_Zion Energy LLC', 'owner_Zotos International',
       'owner_Zumbro Garden LLC', 'owner_Zumbro Solar LLC', 'owner_esVolta LP',
       'owner_iEnergy Wind Farms'],
      dtype='object', length=5903)
Index(['country_long', 'name', 'gppd_idnr', 'latitude', 'longitude',
       'other_fuel1', 'other_fuel2', 'other_fuel3', 'commissioning_year',
       'source',
       ...
       'owner_Zamil New Delhi Infrastructure Private Limited',
       'owner_Zapco Energy Tactics Corp', 'owner_Zeeland Farm Services',
       'owner_Zero Waste Energy Development Company LLC',
       'owner_Zion Energy LLC', 'owner_Zotos International

In [23]:
# Take independent copies of the four feature splits so that later changes to
# the *_encoded_* frames do not touch the original splits.
X_train_encoded_pf, X_test_encoded_pf, X_train_encoded_cmw, X_test_encoded_cmw = (
    split_frame.copy()
    for split_frame in (X_train_pf, X_test_pf, X_train_cmw, X_test_cmw)
)
