In [1]:
# Global Power Plant Database: predict primary fuel type (classification) and capacity in MW (regression)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_squared_error

# Step 1: Data Loading
# Directory holding the three database_*.csv files; adjust to your machine.
# A raw string keeps the backslash literal: "\D" is not a valid escape
# sequence and recent Python versions emit a warning for it. The resulting
# path value is the same as before.
data_path = r"D:\Datasets"

# Load datasets (one frame per CSV file)
df1 = pd.read_csv(data_path + "/database_IND.csv")
df2 = pd.read_csv(data_path + "/database_AUS.csv")
df3 = pd.read_csv(data_path + "/database_USA.csv")

# Step 2: Combine datasets
# Stack the frames row-wise; ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each file's own row labels.
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

# Step 3: Data Preprocessing
# Only rows where both targets are known can be used for supervised training.
# reset_index keeps row labels aligned with the imputed frame built below.
# combined_df itself is left untouched so this step can be re-run safely.
labelled_df = combined_df.dropna(subset=['primary_fuel', 'capacity_mw']).reset_index(drop=True)

# Take the targets from the RAW frame, before any encoding: get_dummies
# replaces a text column such as 'primary_fuel' with 'primary_fuel_<level>'
# indicator columns, so the original column no longer exists afterwards.
y_classification = labelled_df['primary_fuel']  # Target (fuel type)
y_regression = labelled_df['capacity_mw']  # Target (capacity)

# Columns without a single observed value carry no information, and
# SimpleImputer(strategy='mean') discards them from its output, which would
# leave the imputed array narrower than the list of column labels.
features_df = labelled_df.dropna(axis=1, how='all')

# One-hot encode the fuel type on its own so that its indicator columns can be
# removed from the classifier's features by name (otherwise the classifier
# would be handed its own answer).
# NOTE(review): get_dummies expands every remaining text column; any
# identifier-like column (one distinct value per row) becomes one column per
# row value - consider dropping such columns before encoding.
fuel_dummies = pd.get_dummies(features_df[['primary_fuel']], dtype=float)
other_encoded = pd.get_dummies(features_df.drop(columns=['primary_fuel']), dtype=float)
encoded_df = pd.concat([other_encoded, fuel_dummies], axis=1)

# Fill missing numeric values with the column mean.
# NOTE: the imputer is fitted on all rows before the train/test split, so the
# test rows contribute to the means used for filling.
imputer = SimpleImputer(strategy='mean')
X_filled = pd.DataFrame(
    imputer.fit_transform(encoded_df),
    columns=encoded_df.columns,
    index=encoded_df.index,
)

# Step 4: Feature Engineering
# No specific feature engineering is done in this example

# Step 5: Model Selection
# For fuel type prediction (classification): every column except the fuel indicators
X_classification = X_filled.drop(columns=fuel_dummies.columns)  # Features

X_train, X_test, y_train, y_test = train_test_split(X_classification, y_classification, test_size=0.2, random_state=42)

# For capacity prediction (regression): every column except the capacity itself
X_regression = X_filled.drop(columns=['capacity_mw'])  # Features

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_regression, y_regression, test_size=0.2, random_state=42)

# Step 6: Model Training
# Both forests use 100 trees and a fixed seed so repeated runs give the same
# models. fit() hands back the estimator, so building and training are chained.

# Fuel-type classifier
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Capacity regressor
reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_reg, y_train_reg)

# Step 7: Model Evaluation
# Predict on the held-out split of each task first, then score and report.
y_pred_classification = clf.predict(X_test)
y_pred_regression = reg.predict(X_test_reg)

# Share of test rows whose fuel type was predicted exactly
classification_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_classification)
# Average squared difference between true and predicted capacity_mw
regression_mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_regression)

print("Classification Accuracy:", classification_accuracy)
print("Regression Mean Squared Error:", regression_mse)

# Step 8: Prediction
# Use the trained models on new records that have the same raw columns as the
# loaded CSV files.
# Stand-in input so the cell runs end to end: a few rows from the first file.
# Replace this with a DataFrame of genuinely unseen records.
new_data = df1.head()

# Any raw column that is not itself a training column is treated as
# categorical and one-hot encoded (unknown or empty columns simply yield
# indicator columns that are discarded during alignment below).
categorical_cols = [col for col in new_data.columns if col not in X_filled.columns]
new_data_encoded = pd.get_dummies(new_data, columns=categorical_cols, dtype=float)

# Align to the exact columns the imputer was fitted on. Columns missing from
# the new data become NaN here and are mean-filled by the imputer.
new_data_aligned = new_data_encoded.reindex(columns=X_filled.columns)

# Exception: indicator columns for category levels that do not occur in the new
# data must be 0 ("not this level"), not the training-set mean.
# Matching relies on get_dummies naming indicators '<column>_<level>'.
for col in categorical_cols:
    absent_levels = [
        name for name in X_filled.columns
        if str(name).startswith(f"{col}_") and name not in new_data_encoded.columns
    ]
    if absent_levels:
        new_data_aligned[absent_levels] = 0.0

# Fill missing values with the means learned from the training data
new_data_filled = pd.DataFrame(
    imputer.transform(new_data_aligned),
    columns=X_filled.columns,
    index=new_data.index,
)

# Each model was trained on a different subset of columns, so give each one
# exactly the columns (and column order) it saw during fit.
fuel_prediction = clf.predict(new_data_filled[X_train.columns])
capacity_prediction = reg.predict(new_data_filled[X_train_reg.columns])
print("Fuel Type Prediction:", fuel_prediction)
print("Capacity Prediction (MW):", capacity_prediction)
