In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the input CSV into `df`; later cells read its Crop_QueryType, Month and Place columns.
df = pd.read_csv(filepath_or_buffer='D:/Objective 1 code/classification/crop_queryTypes.csv')

In [None]:
# Turn each comma-separated Crop_QueryType string into a list of tokens.
# NOTE(review): tokens are not whitespace-stripped, so "a, b" yields " b" -- confirm
# the CSV has no spaces after the commas.
crop_querytype_strings = df['Crop_QueryType']
df['Crop_QueryType_List'] = crop_querytype_strings.str.split(pat=',')


In [None]:
# Step 2: One-hot encode the token lists (one indicator column per distinct token)
mlb = MultiLabelBinarizer()
crop_querytype_matrix = mlb.fit_transform(df['Crop_QueryType_List'])

# Wrap the indicator matrix in a labelled DataFrame and carry Month / Place
# over from the source rows (aligned on the row index).
crop_querytype_df = pd.DataFrame(
    crop_querytype_matrix,
    columns=mlb.classes_,
).assign(
    Month=df['Month'],
    Place=df['Place'],
)

In [None]:
# Sum the indicator columns within every (Place, Month) pair, giving one row of
# Crop_QueryType frequencies per pair with Place and Month as ordinary columns.
grouping_keys = ['Place', 'Month']
place_month_features = crop_querytype_df.groupby(grouping_keys).sum().reset_index()

In [None]:
# Preview the first rows using the notebook's rich display instead of printing the frame
df.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Keep only the query-type count columns as clustering features.
# 'Cluster' is written to place_month_features at the end of this cell and 'State'
# may be present if later analysis cells have already run; both are excluded here
# (and ignored when absent) so re-running this cell never feeds old labels or
# state names back into the clustering.
non_feature_columns = ['Place', 'Month', 'Cluster', 'State']
place_features = place_month_features.drop(columns=non_feature_columns, errors='ignore')

# Standardize every feature column before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(place_features)

# Agglomerative (hierarchical) clustering into a fixed number of clusters
n_clusters = 5
agglo = AgglomerativeClustering(n_clusters=n_clusters)

# Fit the model and get one cluster label per (Place, Month) row
labels = agglo.fit_predict(X_scaled)

# Attach the labels to the aggregated frame for the analysis cells below
place_month_features['Cluster'] = labels

In [None]:
# Step 4: Analyze Clusters
def analyze_clusters(cluster_data, crop_querytype_df, top_n=10):
    """Summarize each cluster by its most frequent Crop_QueryTypes and its places.

    Parameters
    ----------
    cluster_data : pandas.DataFrame
        Must contain 'Place', 'Month' and 'Cluster' columns; every other column
        is summed per cluster as a Crop_QueryType count.
    crop_querytype_df : object
        Not used by this function; accepted so existing calls keep working.
    top_n : int, default 10
        Number of highest-count Crop_QueryTypes to keep per cluster.

    Returns
    -------
    dict
        Maps each cluster label to
        ``{'Top_Crop_QueryTypes': {name: count}, 'Places': [unique places]}``.
    """
    meta_columns = ['Place', 'Month', 'Cluster']

    def summarize(label):
        # Rows belonging to this cluster
        members = cluster_data.loc[cluster_data['Cluster'] == label]
        # Total count per Crop_QueryType, highest first
        totals = members.drop(columns=meta_columns).sum()
        ranked = totals.sort_values(ascending=False)
        return {
            'Top_Crop_QueryTypes': ranked.head(top_n).to_dict(),
            'Places': members['Place'].unique().tolist(),
        }

    return {label: summarize(label) for label in cluster_data['Cluster'].unique()}

In [None]:
# Summarize every cluster: its 10 most frequent Crop_QueryTypes and its places
cluster_results = analyze_clusters(
    cluster_data=place_month_features,
    crop_querytype_df=crop_querytype_df,
    top_n=10,
)

In [None]:

# Step 5: Print, for every cluster, the names of its top Crop_QueryTypes
# (counts are not shown) followed by the places assigned to it.
for cluster, result in cluster_results.items():
    print(f"Cluster {cluster}:")
    print("  Top 10 Crop_QueryTypes:")
    for crop_querytype in result['Top_Crop_QueryTypes']:
        print(crop_querytype)
    print("\n")
    print("  Places in the cluster:")
    print(f"    {result['Places']}")
    print("\n")

In [None]:
from collections import Counter


def analyze_clusters(cluster_data, crop_querytype_df, top_n=5, top_states=25):
    """Build the per-month and per-cluster summaries that are exported to CSV.

    Parameters
    ----------
    cluster_data : pandas.DataFrame
        Must contain 'Place' (strings of the form "StateName_DistrictName"),
        'Month' and 'Cluster' columns; every other column is summed as a
        Crop_QueryType count. The frame is not modified.
    crop_querytype_df : object
        Not used by this function; accepted so existing calls keep working.
    top_n : int, default 5
        Number of highest-count Crop_QueryTypes listed per (Month, Cluster).
    top_states : int or None, default 25
        Maximum number of ``(state, row_count)`` pairs kept per cluster
        (passed to ``Counter.most_common``).

    Returns
    -------
    tuple of (list of dict, list of dict)
        ``crop_querytype_results``: one dict per (Month, Cluster) pair with keys
        'Month', 'Cluster' and 'Crop_QueryType' (the top names joined by ", ").
        ``place_results``: one dict per cluster with keys 'Cluster',
        'Most_Common_State' (list of ``(state, row_count)`` pairs, most frequent
        first) and 'Places' (unique places joined by ", ").
    """
    # State is the part of Place before the first underscore. It is kept in a
    # separate Series so the caller's DataFrame does not gain a 'State' column.
    states = cluster_data['Place'].str.split('_').str[0]

    # Columns that are not Crop_QueryType counts. 'State' is listed in case the
    # incoming frame already carries it; missing names are ignored.
    non_feature_columns = ['Place', 'Month', 'Cluster', 'State']

    # Per cluster: state frequencies and the places it contains (not month-wise)
    place_results = []
    for cluster in cluster_data['Cluster'].unique():
        in_cluster = cluster_data['Cluster'] == cluster
        places_in_cluster = cluster_data.loc[in_cluster, 'Place'].unique().tolist()
        place_results.append({
            'Cluster': int(cluster),
            'Most_Common_State': Counter(states[in_cluster]).most_common(top_states),
            'Places': ', '.join(places_in_cluster)
        })

    # Per (Month, Cluster): names of the top Crop_QueryTypes, without counts
    crop_querytype_results = []
    for month in cluster_data['Month'].unique():
        month_subset = cluster_data[cluster_data['Month'] == month]
        for cluster in month_subset['Cluster'].unique():
            cluster_subset = month_subset[month_subset['Cluster'] == cluster]

            totals = cluster_subset.drop(columns=non_feature_columns, errors='ignore').sum()
            top_crop_querytypes = totals.sort_values(ascending=False).head(top_n)
            if top_crop_querytypes.empty:
                continue

            # Emit a single row per pair; the row holds the joined names, so
            # repeating it once per name would only create exact duplicates.
            crop_querytype_results.append({
                'Month': int(month),
                'Cluster': int(cluster),
                'Crop_QueryType': ', '.join(top_crop_querytypes.index.tolist())
            })

    return crop_querytype_results, place_results

In [None]:
# Build the two result lists (per Month/Cluster query types, per Cluster places)
crop_querytype_results, place_results = analyze_clusters(
    cluster_data=place_month_features,
    crop_querytype_df=crop_querytype_df,
)

In [None]:
# First CSV: Month, Cluster, Crop_QueryType.
# A dedicated name is used so the one-hot feature matrix held in crop_querytype_df
# is not overwritten; earlier cells still need it when they are re-run.
crop_querytype_summary_df = pd.DataFrame(crop_querytype_results).drop_duplicates()
print(crop_querytype_summary_df)
crop_querytype_summary_df.to_csv('D:/Objective 1 code/crop_querytype.csv', index=False)

In [None]:
# Second CSV: one row per cluster with its places and state frequencies
place_df = pd.DataFrame(data=place_results)
place_df.to_csv(path_or_buf="D:/Objective 1 code/place_results.csv", index=False)

In [None]:
# Step 4: Calculate Mode Cluster for Each Place
def calculate_mode_cluster(cluster_data):
    """Return the most frequent cluster label for every place.

    Parameters
    ----------
    cluster_data : pandas.DataFrame
        Must contain 'Place' and 'Cluster' columns.

    Returns
    -------
    list of dict
        One ``{'Place': place, 'Mode_Cluster': int}`` entry per unique place,
        in order of first appearance.
    """
    def dominant_cluster(place):
        # Cluster labels of all rows recorded for this place
        labels_for_place = cluster_data.loc[cluster_data['Place'] == place, 'Cluster']
        return int(Counter(labels_for_place).most_common(1)[0][0])

    return [
        {'Place': place, 'Mode_Cluster': dominant_cluster(place)}
        for place in cluster_data['Place'].unique()
    ]


In [None]:
# Most frequent cluster per place, collected into a two-column table
mode_cluster_results = calculate_mode_cluster(cluster_data=place_month_features)
cluster_places = pd.DataFrame(data=mode_cluster_results)
print(cluster_places)

In [None]:
# Write Place / Mode_Cluster without the row index, matching the other CSV exports
cluster_places.to_csv("D:/Objective 1 code/places_clusters.csv", index=False)

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score


def _within_cluster_sum_of_squares(features, cluster_labels):
    """Return the summed squared distance of every row to its cluster centroid."""
    features = np.asarray(features, dtype=float)
    cluster_labels = np.asarray(cluster_labels)
    total = 0.0
    for label in np.unique(cluster_labels):
        members = features[cluster_labels == label]
        total += ((members - members.mean(axis=0)) ** 2).sum()
    return float(total)


# Step 1: Extract features and cluster labels
# Evaluate in the standardized feature space the clustering was fitted on
X = X_scaled
labels = place_month_features['Cluster']  # Cluster labels

# Step 2: Calculate evaluation metrics
# Silhouette Score
silhouette_avg = silhouette_score(X, labels)
print(f"Silhouette Score: {silhouette_avg}")

# Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(X, labels)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")

# Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(X, labels)
print(f"Davies-Bouldin Index: {davies_bouldin}")

# Inertia: no KMeans model is fitted in this notebook and AgglomerativeClustering
# does not expose inertia_, so the within-cluster sum of squares is computed directly.
inertia = _within_cluster_sum_of_squares(X, labels)
print(f"Inertia: {inertia}")