#Objective:
###This notebook replicates the methodology from the research paper "Renewable Energy Evaluation using Data Mining Techniques". We'll use the K-Means clustering algorithm to analyze weather data and identify months with the highest potential for solar energy generation, based on temperature patterns<p>

In [1]:
#libraries
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import matplotlib.pyplot as plt

####Data Augmentation through LLM

In [2]:

data = {
    'City': ['Chennai']*12 + ['Kanyakumari']*12 + ['Madurai']*12,
    'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] * 3,
    'Max_Mean_Temp_C': [
        # Chennai Temperatures
        29.3, 31.1, 33.2, 35.1, 37.1, 36.9, 35.0, 34.4, 34.0, 31.9, 29.9, 28.9,
        # Kanyakumari Temperatures
        30.4, 31.2, 32.1, 32.5, 32.0, 29.9, 29.5, 29.6, 29.9, 30.1, 30.0, 30.1,
        # Madurai Temperatures
        30.6, 33.2, 35.8, 37.2, 37.7, 36.8, 35.9, 35.3, 34.4, 32.6, 30.5, 29.7
    ],
    'Min_Mean_Temp_C': [
        # Chennai Minimums
        21.2, 22.4, 24.5, 26.7, 28.0, 27.5, 26.3, 25.7, 25.5, 24.5, 23.0, 21.9,
        # Kanyakumari Minimums
        23.5, 23.8, 24.5, 25.1, 25.4, 24.5, 24.1, 24.0, 24.0, 24.1, 23.9, 23.6,
        # Madurai Minimums
        21.5, 22.8, 24.7, 26.3, 27.1, 26.8, 26.2, 25.8, 25.3, 24.4, 23.1, 21.9
    ]
}
df =pd.DataFrame(data)
df.head()

Unnamed: 0,City,Month,Max_Mean_Temp_C,Min_Mean_Temp_C
0,Chennai,Jan,29.3,21.2
1,Chennai,Feb,31.1,22.4
2,Chennai,Mar,33.2,24.5
3,Chennai,Apr,35.1,26.7
4,Chennai,May,37.1,28.0


####Data preprocessing : for attribute reduction of temperature


In [3]:
features = df[['Max_Mean_Temp_C', 'Min_Mean_Temp_C']]

In [4]:
#model training
kmeans = KMeans(n_clusters=2,random_state=42,n_init='auto')
kmeans.fit(features)


0,1,2
,n_clusters,2
,init,'k-means++'
,n_init,'auto'
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


In [5]:
df['cluster'] = kmeans.labels_
print("## K-Means Clustering Results ##\n")

cluster_0 = df[df['cluster'] == 0]
cluster_1 = df[df['cluster'] == 1]

print("Cluster 0: Moderate Temperature Months")
print(cluster_0[['City', 'Month', 'Max_Mean_Temp_C']].sort_values(by='Max_Mean_Temp_C'))
print("\n-------------------------------------------------\n")
print("Cluster 1: High Temperature / High Solar Potential Months ")
print(cluster_1[['City', 'Month', 'Max_Mean_Temp_C']].sort_values(by='Max_Mean_Temp_C'))
print("\n-------------------------------------------------\n")

## K-Means Clustering Results ##

Cluster 0: Moderate Temperature Months
           City Month  Max_Mean_Temp_C
11      Chennai   Dec             28.9
0       Chennai   Jan             29.3
18  Kanyakumari   Jul             29.5
19  Kanyakumari   Aug             29.6
35      Madurai   Dec             29.7
20  Kanyakumari   Sep             29.9
10      Chennai   Nov             29.9
17  Kanyakumari   Jun             29.9
22  Kanyakumari   Nov             30.0
23  Kanyakumari   Dec             30.1
21  Kanyakumari   Oct             30.1
12  Kanyakumari   Jan             30.4
34      Madurai   Nov             30.5
24      Madurai   Jan             30.6
1       Chennai   Feb             31.1
13  Kanyakumari   Feb             31.2
9       Chennai   Oct             31.9
16  Kanyakumari   May             32.0
14  Kanyakumari   Mar             32.1
15  Kanyakumari   Apr             32.5
33      Madurai   Oct             32.6
2       Chennai   Mar             33.2
25      Madurai   Feb         

## Model's findings align with the paper's conclusions?

In [6]:
high_temp_months = sorted(cluster_1['Month'].unique())
print(f"CONCLUSION: The model successfully grouped the hottest months into a distinct cluster.")
print(f"The months in the high-potential cluster are: {high_temp_months}.")

print("This aligns with the paper's finding that months like 'May' and 'June'"
"are optimal for solar radiation[cite: 176, 178].")

CONCLUSION: The model successfully grouped the hottest months into a distinct cluster.
The months in the high-potential cluster are: ['Apr', 'Aug', 'Jul', 'Jun', 'Mar', 'May', 'Sep'].
This aligns with the paper's finding that months like 'May' and 'June'are optimal for solar radiation[cite: 176, 178].


##visualization

In [7]:

df['cluster_label'] = df['cluster'].map({0: 'Moderate Potential', 1: 'High Potential ☀️'})

fig = px.scatter(
    df,
    x='Max_Mean_Temp_C',
    y='Min_Mean_Temp_C',
    color='cluster_label',        
    symbol='City',
    hover_data=['Month'],
    title='<b>K-Means Clustering of Monthly Weather Data</b>',
    labels={
        "Max_Mean_Temp_C": "Maximum Mean Temperature (°C)",
        "Min_Mean_Temp_C": "Minimum Mean Temperature (°C)"
    },
    color_discrete_map={
        'High Potential ☀️': 'gold',
        'Moderate Potential': 'royalblue'
    },
    template='plotly_white'
)

fig.update_layout(
    title_font_size=22,
    legend_title_text='<b>Cluster Group</b>'
)
fig.show()
