In [1]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown
# for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below -- confirm that is intended.
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [3]:
# Read the CSV (path is relative to the working directory) into a DataFrame
csv_path = 'weatherAUS.csv'
data = pd.read_csv(csv_path)

In [4]:
# Discard every row whose 'RainTomorrow' value is missing; all other
# columns may still contain gaps after this step.
target_column = 'RainTomorrow'
data = data.dropna(axis=0, subset=[target_column])

In [5]:
# Columns used as clustering inputs. The order matters: it fixes the column
# order of the scaled matrix and of the cluster-centre table built later.
features = [
    'MinTemp', 'MaxTemp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]


In [6]:
# Impute gaps in the feature columns: each missing entry is replaced by the
# mean of its own column (means are computed before any value is filled).
column_means = data[features].mean()
data[features] = data[features].fillna(column_means)

In [7]:
# Replace the 'RainToday' column with integer codes from a LabelEncoder.
# NOTE(review): missing values in 'RainToday' are not handled before
# encoding -- confirm how the installed scikit-learn treats them.
le = LabelEncoder()
rain_today_codes = le.fit_transform(data['RainToday'])
data['RainToday'] = rain_today_codes

In [8]:
# Standardize the feature columns: learn the scaling parameters from the
# data, then apply them to the same rows. `scaler` is kept so values can
# later be mapped back to the original units.
scaler = StandardScaler()
scaler.fit(data[features])
data_scaled = scaler.transform(data[features])

In [9]:
# Project the scaled features onto two principal components; the projection
# is used only for plotting, not for clustering.
# random_state is pinned (same seed as the KMeans cells) because, depending
# on the scikit-learn version, the default solver choice may be a randomized
# SVD, which would otherwise give slightly different coordinates per run.
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(data_scaled)

In [None]:
# Determine the optimal number of clusters using silhouette score.
# For each candidate k in [2, max_clusters], fit K-Means on the scaled
# features and record the mean silhouette coefficient; silhouette_scores[i]
# corresponds to k = i + 2.
max_clusters = 10
# The silhouette coefficient needs pairwise distances between samples, so
# scoring every row for every k becomes very expensive on large inputs.
# Score a fixed-size, seeded random subset instead (all rows are still used
# to fit K-Means itself).
silhouette_sample_size = 10000
silhouette_scores = []

for k in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(data_scaled)
    silhouette_avg = silhouette_score(
        data_scaled,
        cluster_labels,
        # Never request more rows than exist.
        sample_size=min(silhouette_sample_size, data_scaled.shape[0]),
        random_state=42,
    )
    silhouette_scores.append(silhouette_avg)

In [None]:
# The scores list starts at k=2, so shift the index of the best score by 2
# to recover the corresponding number of clusters.
optimal_k = 2 + np.asarray(silhouette_scores).argmax()

In [None]:
# Refit K-Means on the scaled features with the selected number of clusters
# and keep the per-row cluster assignment.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(data_scaled)
cluster_labels = kmeans.labels_

In [None]:
# Scatter the rows in the 2-D PCA plane, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    data_pca[:, 0],
    data_pca[:, 1],
    c=cluster_labels,
    cmap='viridis',
    alpha=0.5,
)
ax.set_title(f'K-Means Clustering (k={optimal_k})')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

In [None]:
# Display cluster centers in the original feature units.
# K-Means was fitted on the standardized feature matrix (one column per
# entry in `features`), so its centers live in that scaled space. They are
# mapped back with the scaler's inverse transform. The PCA inverse transform
# is not applicable here: it expects 2-component input, whereas the centers
# have one value per feature.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers_df = pd.DataFrame(cluster_centers, columns=features)
print("\nCluster Centers:")
print(cluster_centers_df)