# In [None]:

import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Suppress all warnings for the remainder of the run.
warnings.filterwarnings('ignore')

# Download links (with ?raw=true) for the USA, AUS and INDIA CSV files.
usa_url, aus_url, india_url = (
    'https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Global%20Power%20Plant%20Database/USA.csv?raw=true',
    'https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Global%20Power%20Plant%20Database/AUS.csv?raw=true',
    'https://github.com/FlipRoboTechnologies/ML_-Datasets/blob/main/Global%20Power%20Plant%20Database/INDIA.csv?raw=true',
)

# Read the three files in the same order as the URLs above.
usa_data, aus_data, india_data = [
    pd.read_csv(csv_url) for csv_url in (usa_url, aus_url, india_url)
]

# Stack the frames row-wise and renumber the index from zero.
data = pd.concat([usa_data, aus_data, india_data], ignore_index=True)

# Textual overview of the combined frame: head, schema, statistics, gaps.
print("First few rows of the combined dataset:")
print(data.head())

print("\nBasic information about the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()
print("\nSummary statistics of the dataset:")
print(data.describe())

# Per-column count of null cells; only columns with at least one are printed.
missing_values = data.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_values[missing_values > 0])

# Bar chart of the number of rows per primary fuel value.
plt.figure(figsize=(12, 8))
# The column is bound to `x` by keyword: in current seaborn releases the first
# positional parameter of countplot is `data`, not `x`, so a bare positional
# Series is no longer interpreted as the variable to count.
sns.countplot(x=data['primary_fuel'])
plt.title('Distribution of Primary Fuel')
plt.show()

# Histogram of capacity_mw (30 bins) with a kernel-density curve overlaid,
# drawn on an explicitly created axes object.
capacity_fig, capacity_ax = plt.subplots(figsize=(12, 8))
sns.histplot(data['capacity_mw'], bins=30, kde=True, ax=capacity_ax)
capacity_ax.set_title('Distribution of Capacity (MW)')
plt.show()

# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(14, 10))
# Restrict to numeric columns first: the categorical columns are only
# label-encoded further down, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently skipped them).
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Explicit per-column replacements for missing values:
#   - capacity_mw             -> column median
#   - commissioning_year,
#     year_of_capacity_data   -> most frequent value
#   - descriptive text fields -> the placeholder 'Unknown'
fill_values = {
    'capacity_mw': data['capacity_mw'].median(),
    'commissioning_year': data['commissioning_year'].mode()[0],
    'owner': 'Unknown',
    'source': 'Unknown',
    'url': 'Unknown',
    'geolocation_source': 'Unknown',
    'wepp_id': 'Unknown',
    'year_of_capacity_data': data['year_of_capacity_data'].mode()[0],
}
# Missing reported (2013-2019) and estimated (2013-2017) generation figures
# are recorded as 0.
fill_values.update({f'generation_gwh_{year}': 0 for year in range(2013, 2020)})
fill_values.update({f'estimated_generation_gwh_{year}': 0 for year in range(2013, 2018)})
data.fillna(fill_values, inplace=True)

# Fallback for numeric columns that are not listed above: fill remaining gaps
# with the column median so no NaN reaches estimators that reject it (e.g.
# LogisticRegression). This is a no-op when those columns have no gaps;
# columns that are entirely empty have a NaN median and are left untouched.
data.fillna(data.median(numeric_only=True), inplace=True)

# Replace every categorical column with integer codes.
categorical_cols = ['country', 'country_long', 'name', 'primary_fuel', 'other_fuel1', 'other_fuel2', 'other_fuel3', 'owner', 'source', 'geolocation_source', 'generation_data_source']

# One fitted encoder is kept per column so codes can be mapped back to labels
# later, e.g. label_encoders['primary_fuel'].inverse_transform(predictions).
# Re-fitting a single shared encoder discarded every mapping but the last.
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    # Several of these columns (e.g. other_fuel1-3, generation_data_source)
    # get no explicit null replacement earlier in the script; give gaps the
    # same 'Unknown' placeholder and force str so the encoder never has to
    # sort a mix of NaN floats and strings.
    col_values = data[col].fillna('Unknown').astype(str)
    data[col] = label_encoder.fit_transform(col_values)
    label_encoders[col] = label_encoder

# Columns removed from the frame (in place) before modelling.
columns_to_drop = [
    'gppd_idnr',
    'url',
    'wepp_id',
    'estimated_generation_note_2013',
    'estimated_generation_note_2014',
    'estimated_generation_note_2015',
    'estimated_generation_note_2016',
    'estimated_generation_note_2017',
]
data.drop(columns=columns_to_drop, inplace=True)

# Feature matrix = everything except the two prediction targets.
target_columns = ['primary_fuel', 'capacity_mw']
X = data.drop(columns=target_columns)
y_fuel, y_capacity = (data[target] for target in target_columns)

# Shared settings for both hold-out splits: 20% test rows, fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Split for the primary-fuel classification task.
X_train_fuel, X_test_fuel, y_train_fuel, y_test_fuel = train_test_split(
    X, y_fuel, **split_options
)

# Split for the capacity regression task.
X_train_capacity, X_test_capacity, y_train_capacity, y_test_capacity = train_test_split(
    X, y_capacity, **split_options
)

# Candidate classifiers for primary_fuel, keyed by display name.
# Tree-based estimators get a fixed random_state so scores are reproducible
# from run to run, in line with the seeded train/test split.
models_fuel = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    # Logistic regression is sensitive to feature scale, so it is wrapped in a
    # pipeline that standardises the inputs first. Without scaling, failure to
    # converge would go unnoticed because warnings are silenced globally.
    'LogisticRegression': make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
}

# Candidate regressors for capacity_mw, keyed by display name.
models_capacity = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
}

# Fit each candidate classifier on the training split and report accuracy,
# the per-class report and the confusion matrix on the held-out rows.
print("\nPrimary Fuel Prediction:")
for name, model in models_fuel.items():
    y_pred_fuel = model.fit(X_train_fuel, y_train_fuel).predict(X_test_fuel)

    fuel_accuracy = accuracy_score(y_test_fuel, y_pred_fuel)
    fuel_report = classification_report(y_test_fuel, y_pred_fuel)
    fuel_confusion = confusion_matrix(y_test_fuel, y_pred_fuel)

    print(f'{name} - Accuracy Score:', fuel_accuracy)
    print(f'{name} - Classification Report:\n', fuel_report)
    print(f'{name} - Confusion Matrix:\n', fuel_confusion)

# Fit each candidate regressor on the training split and report its mean
# squared error on the held-out rows.
print("\nCapacity Prediction:")
for name, model in models_capacity.items():
    y_pred_capacity = model.fit(X_train_capacity, y_train_capacity).predict(X_test_capacity)
    capacity_mse = mean_squared_error(y_test_capacity, y_pred_capacity)
    print(f'{name} - Mean Squared Error:', capacity_mse)

# Random-forest hyper-parameter grid for the fuel classifier
# (3 x 3 x 3 x 3 = 81 combinations).
param_grid_fuel = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold cross-validated search scored by accuracy, using all
# available cores. The forest is seeded so the selected (and later saved)
# model is reproducible, consistent with the seeded train/test split.
grid_fuel = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_fuel, cv=5, scoring='accuracy', n_jobs=-1)
grid_fuel.fit(X_train_fuel, y_train_fuel)
# Best configuration, refit by GridSearchCV on the whole training split.
best_model_fuel = grid_fuel.best_estimator_


# Random-forest hyper-parameter grid for the capacity regressor
# (3 x 3 x 3 x 3 = 81 combinations).
param_grid_capacity = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold cross-validated search scored by negated MSE (higher is
# better), using all available cores. The forest is seeded so the selected
# (and later saved) model is reproducible, consistent with the seeded split.
grid_capacity = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_capacity, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_capacity.fit(X_train_capacity, y_train_capacity)
# Best configuration, refit by GridSearchCV on the whole training split.
best_model_capacity = grid_capacity.best_estimator_


# Score the tuned classifier on the held-out fuel rows.
y_pred_fuel_best = best_model_fuel.predict(X_test_fuel)
best_fuel_accuracy = accuracy_score(y_test_fuel, y_pred_fuel_best)
best_fuel_report = classification_report(y_test_fuel, y_pred_fuel_best)
best_fuel_confusion = confusion_matrix(y_test_fuel, y_pred_fuel_best)
print('\nBest Model for Primary Fuel Prediction - Accuracy Score:', best_fuel_accuracy)
print('Best Model for Primary Fuel Prediction - Classification Report:\n', best_fuel_report)
print('Best Model for Primary Fuel Prediction - Confusion Matrix:\n', best_fuel_confusion)

# Score the tuned regressor on the held-out capacity rows.
y_pred_capacity_best = best_model_capacity.predict(X_test_capacity)
best_capacity_mse = mean_squared_error(y_test_capacity, y_pred_capacity_best)
print('\nBest Model for Capacity Prediction - Mean Squared Error:', best_capacity_mse)

# Persist both tuned estimators to the working directory with joblib.
for fitted_model, model_path in (
    (best_model_fuel, 'best_model_primary_fuel.pkl'),
    (best_model_capacity, 'best_model_capacity_mw.pkl'),
):
    joblib.dump(fitted_model, model_path)

print("Models saved successfully.")
