In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Load the dataset
file_path = 'Merged_Dataset.csv'  # Update this path to the dataset location
df = pd.read_csv(file_path)


In [None]:
df['county'] = df['county'].str.strip("b'").astype('category')

In [6]:
for col in ['country', 'state', 'urban']:
    df[col] = (
        df[col]
        .str.strip("b'")          # remove leading b' and trailing '
        .astype('category')       # convert to pandas CategoricalDtype
    )

In [7]:
df.head

<bound method NDFrame.head of        latitude  longitude  elevation  timezone        country    state  \
0         31.33    -111.06  1482.8500        -7  United States  Arizona   
1         31.33    -111.02  1323.4500        -7  United States  Arizona   
2         31.33    -110.98  1266.8000        -7  United States  Arizona   
3         31.33    -110.94  1205.0500        -7  United States  Arizona   
4         31.33    -110.90  1220.1875        -7  United States  Arizona   
...         ...        ...        ...       ...            ...      ...   
18077     36.97    -109.22  1583.4400        -7  United States  Arizona   
18078     36.97    -109.18  1594.6000        -7  United States  Arizona   
18079     36.97    -109.14  1538.2400        -7  United States  Arizona   
18080     36.97    -109.10  1485.8500        -7  United States  Arizona   
18081     36.97    -109.06  1493.6800        -7  United States  Arizona   

           county    urban  population  landcover  ...  wind_mean_202

In [8]:
df.to_csv('cleaned_dataset.csv', index=False)

In [9]:
data = pd.read_csv('cleaned_dataset.csv')


In [12]:
import pandas as pd
import plotly.graph_objects as go

# Assuming 'data' is your DataFrame containing the dataset

# Prepare data for Solar Energy Production by Year (2020, 2021, 2022)
solar_energy_data = {
    'Year': ['2020', '2021', '2022'],
    'Solar_Energy': [
        data['Solar_Energy2022'].mean(),
        data['Solar_Energy2021'].mean(),
        data['Solar_Energy2020'].mean()
    ]
}

# Convert to DataFrame
solar_energy_df = pd.DataFrame(solar_energy_data)

# Create Line Chart using Plotly
fig = go.Figure()

fig.add_trace(go.Scatter(x=solar_energy_df['Year'], 
                         y=solar_energy_df['Solar_Energy'], 
                         mode='lines+markers', 
                         name='Solar Energy Production'))

fig.update_layout(
    title="Solar Energy Production by Year (2020-2022)",
    xaxis_title="Year",
    yaxis_title="Average Solar Energy Production (kWh)",
    template="plotly_dark"
)

# Display the chart
fig.show()


In [36]:
import pandas as pd
import plotly.graph_objects as go

# Filter data for Maricopa County
maricopa_data = data[data['county'] == 'Maricopa']

# Prepare data for Solar Energy Production by Year (2020, 2021, 2022) for Maricopa County
maricopa_solar_data = {
    'Year': ['2020', '2021', '2022'],
    'Solar_Energy': [
        maricopa_data['Solar_Energy2022'].mean(),
        maricopa_data['Solar_Energy2021'].mean(),
        maricopa_data['Solar_Energy2020'].mean()
    ]
}

# Convert to DataFrame
maricopa_solar_df = pd.DataFrame(maricopa_solar_data)

# Create Line Chart using Plotly
fig = go.Figure()

fig.add_trace(go.Scatter(x=maricopa_solar_df['Year'], 
                         y=maricopa_solar_df['Solar_Energy'], 
                         mode='lines+markers', 
                         name='Solar Energy Production'))

fig.update_layout(
    title="Solar Energy Production in Maricopa County by Year (2020-2022)",
    xaxis_title="Year",
    yaxis_title="Average Solar Energy Production (kWh)",
    template="plotly_dark"
)

# Display the chart
fig.show()


In [14]:
import plotly.express as px

# Assuming 'data' is your DataFrame containing the dataset
geo_map_fig = px.scatter_geo(data, 
                             lat='latitude', 
                             lon='longitude', 
                             color='Solar_Energy2022',
                             hover_name='county',
                             color_continuous_scale='Viridis', 
                             title="Solar Energy Production by Location (Latitude and Longitude) - 2022",
                             labels={'Solar_Energy2022': 'Solar Energy Production (kWh)'},
                             projection="natural earth")

geo_map_fig.update_geos(showcoastlines=True, coastlinecolor="Black", projection_type="albers usa")
geo_map_fig.update_layout(title_x=0.5)

# Display the geo map
geo_map_fig.show()


In [15]:
import plotly.express as px

# Group by county and calculate average solar energy production in 2022
county_avg_production = data.groupby('county')['Solar_Energy2022'].mean().reset_index()

# Create a bar chart
bar_chart_fig = px.bar(county_avg_production, 
                       x='county', 
                       y='Solar_Energy2022', 
                       title="Annual Solar Energy Production by County (2022)", 
                       labels={'Solar_Energy2022': 'Average Solar Energy Production (kWh)', 'county': 'County'})

bar_chart_fig.update_layout(xaxis_tickangle=-45)

# Display the bar chart
bar_chart_fig.show()


In [16]:
import plotly.express as px

# Features to compare with Solar Energy Production
features = ['ghi_mean2022', 'dni_mean2022', 'dhi_mean2022', 'temp_mean2022', 'wind_mean2022']
comparison_data = data[features + ['Solar_Energy2022']]

# Create a multi-line chart
multi_line_fig = px.line(comparison_data, 
                         x=comparison_data.index, 
                         y=features, 
                         title="Comparison of Different Features for Solar Energy Prediction",
                         labels={'value': 'Feature Value', 'variable': 'Feature'},
                         line_shape='linear')

multi_line_fig.update_layout(
    title="Comparison of Weather Features with Solar Energy Production",
    xaxis_title="Index",
    yaxis_title="Feature Value / Solar Energy Production (kWh)"
)

# Show the chart
multi_line_fig.show()


In [17]:
import plotly.express as px

# Scatter Plot for Solar Energy vs GHI
ghi_solar_fig = px.scatter(data, 
                           x='ghi_mean2022', 
                           y='Solar_Energy2022', 
                           title="Solar Energy vs GHI (Global Horizontal Irradiance)",
                           labels={'ghi_mean2022': 'Global Horizontal Irradiance (W/m²)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

# Show the scatter plot
ghi_solar_fig.show()


In [18]:
# Scatter Plot for Solar Energy vs Temperature
temp_solar_fig = px.scatter(data, 
                            x='temp_mean2022', 
                            y='Solar_Energy2022', 
                            title="Solar Energy Production vs Temperature",
                            labels={'temp_mean2022': 'Temperature (°C)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

# Show the scatter plot
temp_solar_fig.show()


In [19]:
# Scatter Plot for Solar Energy vs Wind Speed
wind_solar_fig = px.scatter(data, 
                             x='wind_mean2022', 
                             y='Solar_Energy2022', 
                             title="Solar Energy Production vs Wind Speed",
                             labels={'wind_mean2022': 'Wind Speed (m/s)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

# Show the scatter plot
wind_solar_fig.show()


In [20]:
# Scatter Plot for Solar Energy vs Surface Pressure
pressure_solar_fig = px.scatter(data, 
                                 x='sp_mean2022', 
                                 y='Solar_Energy2022', 
                                 title="Solar Energy vs Surface Pressure",
                                 labels={'sp_mean2022': 'Surface Pressure (hPa)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

# Show the scatter plot
pressure_solar_fig.show()


In [21]:
# Boxplot for Solar Energy Distribution vs Elevation
elevation_solar_fig = px.box(data, 
                              x='elevation', 
                              y='Solar_Energy2022', 
                              title="Solar Energy Distribution Across Different Elevation Levels",
                              labels={'elevation': 'Elevation (meters)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

# Show the boxplot
elevation_solar_fig.show()


In [24]:
# Scatter Plot for Solar Energy vs Population Density
data['population_density'] = data['population'] / data['landcover']  # Assuming 'landcover' corresponds to area

pop_density_solar_fig = px.scatter(data, 
                                   x='population_density', 
                                   y='Solar_Energy2022', 
                                   title="Solar Energy vs Population Density",
                                   labels={'population_density': 'Population Density (people/km²)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

pop_density_solar_fig.show()


In [25]:
# Bar Chart for Solar Energy Production in Urban vs Non-Urban Areas
urban_solar_fig = px.box(data, 
                         x='urban', 
                         y='Solar_Energy2022', 
                         title="Solar Energy Production in Urban vs Non-Urban Areas",
                         labels={'urban': 'Urban/Non-Urban', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

urban_solar_fig.show()


In [34]:
# Create wind speed categories
wind_speed_bins = [0, 5, 10, 15, 20, 30]
wind_speed_labels = ['0-5', '5-10', '10-15', '15-20', '20-30']
data['wind_speed_category'] = pd.cut(data['wind_mean2022'], bins=wind_speed_bins, labels=wind_speed_labels)

# Categorical scatter plot
wind_speed_category_fig = px.scatter(data, 
                                     x='wind_speed_category', 
                                     y='Solar_Energy2022', 
                                     title="Solar Energy Production vs Wind Speed (Categorical)",
                                     labels={'wind_speed_category': 'Wind Speed (Category)', 'Solar_Energy2022': 'Solar Energy Production (kWh)'})

wind_speed_category_fig.show()
