In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objs as go
import random as rd
import plotly.express as px

Before we get started, we import the dataset to be analysed and learn a bit about it.
Since we have the water quality data for the year 2014, we'll focus on the 2014 production here.

In [2]:
# Load the full crop-production table, then keep only the 2014 records.
df = pd.read_csv("apy.csv")
df_2014 = df.loc[df["Crop_Year"].eq(2014)]
df_2014.head()

FileNotFoundError: [Errno 2] File b'apy.csv' does not exist: b'apy.csv'

In [None]:
# List the distinct season labels present in the 2014 data; the per-season
# sections below filter on these labels.
df_2014["Season"].unique()

We'll cluster separately based on the seasons above

First, let's define a few helper functions we'll call throughout the notebook

## K Means Clustering

In [None]:
def k_means(k, dfa, n_iter):
    """Partition the rows of ``dfa`` into ``k`` clusters with Lloyd's k-means.

    The sample and feature counts are taken from ``dfa`` itself, so the
    function no longer depends on notebook-level ``m`` / ``n`` variables.

    Parameters
    ----------
    k : int
        Number of clusters (at least 1).
    dfa : array-like of shape (m, n)
        One sample per row, one feature per column.
    n_iter : int
        Number of assign/update rounds to run.

    Returns
    -------
    Output : dict
        Maps cluster id ``1..k`` to an array of shape ``(count, n)`` holding
        the member rows (in their original order) from the last assignment.
    centroids : numpy.ndarray of shape (n, k)
        One centroid per column.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``dfa`` has no rows.
    """
    dfa = np.asarray(dfa, dtype=float)
    n_samples, n_features = dfa.shape
    if k < 1:
        raise ValueError("k must be at least 1")
    if n_samples == 0:
        raise ValueError("dfa must contain at least one row")

    # Seed with distinct rows when possible: duplicated starting centroids
    # immediately produce an empty cluster.
    if k <= n_samples:
        seed_rows = rd.sample(range(n_samples), k)
    else:
        seed_rows = [rd.randrange(n_samples) for _ in range(k)]
    centroids = dfa[seed_rows].T.copy()  # shape (n_features, k)

    def assign(centers):
        # Squared Euclidean distance of every sample to every centroid: (m, k).
        sq_dist = ((dfa[:, None, :] - centers.T[None, :, :]) ** 2).sum(axis=2)
        nearest = sq_dist.argmin(axis=1)
        return {c + 1: dfa[nearest == c] for c in range(k)}

    Output = None
    for _ in range(n_iter):
        Output = assign(centroids)
        for c in range(k):
            members = Output[c + 1]
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would turn it into NaN and poison later iterations.
            if len(members):
                centroids[:, c] = members.mean(axis=0)

    if Output is None:
        # n_iter <= 0: report the assignment against the initial centroids.
        Output = assign(centroids)

    return Output, centroids

In [None]:
def calc_wss(k, output, centroids):
    """Return the total within-cluster cost over clusters ``1..k``.

    Parameters
    ----------
    k : int
        Number of clusters.
    output : dict
        Maps cluster id ``1..k`` to an array with one member row per row.
    centroids : numpy.ndarray of shape (n_features, k)
        One centroid per column.

    Returns
    -------
    The sum of ``cost`` over every cluster (0 when ``k`` is 0).
    """
    # The accumulator must live outside the loop; resetting it per cluster
    # would report only the last cluster's cost.
    total = 0
    for cluster_id in range(1, k + 1):
        total += cost(cluster_id, centroids, output)

    return total

def cost(n, centroids, output):
    """Return the summed Euclidean distance from cluster ``n``'s members to its centroid.

    ``n`` is the 1-based cluster id. Distances are plain (unsquared) norms.
    """
    center = centroids.T[n-1]
    return sum(np.linalg.norm(center - point) for point in output[n])

In [None]:
def make_elbow_graph(dfa, k_values=range(1, 5), n_iter=300, scores=None):
    """Compute the clustering cost for each candidate ``k`` (elbow-method data).

    Despite the name, nothing is plotted here; the caller plots the scores.

    Parameters
    ----------
    dfa : array-like
        Feature matrix passed to ``k_means``.
    k_values : iterable of int, optional
        Cluster counts to evaluate (default ``1..4``).
    n_iter : int, optional
        Iterations handed to ``k_means`` (default 300).
    scores : dict, optional
        Dictionary to fill. When omitted, the module-level ``k_scores`` dict
        is used, which must already exist.

    Returns
    -------
    dict
        The filled dictionary, mapping ``k`` to its cost.
    """
    if scores is None:
        scores = k_scores  # legacy behaviour: fill the notebook-level dict
    for k in k_values:
        output, centroids = k_means(k, dfa, n_iter)
        scores[k] = calc_wss(k, output, centroids)
    return scores

## DBSCAN

In [None]:
def dbscan(D, eps, MinPts):
    """Label every point of ``D`` with a DBSCAN cluster id.

    Returns a list with one entry per point: a positive cluster id, or -1 for
    a point whose neighbourhood held fewer than ``MinPts`` points and that no
    cluster picked up afterwards. 0 is used internally for "not yet visited".
    """
    n_points = len(D)
    labels = [0] * n_points
    cluster_id = 0

    for idx in range(n_points):
        # Only points that have not been visited yet can start a cluster.
        if labels[idx] == 0:
            seeds = regionQuery(D, idx, eps)
            if len(seeds) >= MinPts:
                cluster_id += 1
                growCluster(D, labels, idx, seeds, cluster_id, eps, MinPts)
            else:
                labels[idx] = -1

    return labels

In [None]:
def growCluster(D, labels, P, NeighbourPts, C, eps, MinPts):
    """Expand cluster ``C`` from seed point ``P``, updating ``labels`` in place.

    ``NeighbourPts`` is used as a work queue and is extended in place with the
    neighbourhood of every newly claimed point that has at least ``MinPts``
    neighbours.
    """
    labels[P] = C
    cursor = 0
    while cursor < len(NeighbourPts):
        candidate = NeighbourPts[cursor]
        cursor += 1
        state = labels[candidate]
        if state == -1:
            # Previously marked -1: claim it, but do not expand from it.
            labels[candidate] = C
            continue
        if state != 0:
            # Already belongs to a cluster.
            continue
        labels[candidate] = C
        reachable = regionQuery(D, candidate, eps)
        if len(reachable) >= MinPts:
            NeighbourPts.extend(reachable)

In [None]:
def regionQuery(D, P, eps):
    """Return the indices of all points strictly closer than ``eps`` to point ``P``.

    Distance is the Euclidean norm; ``P`` itself is included whenever ``eps > 0``.
    """
    origin = D[P]
    return [
        idx
        for idx in range(len(D))
        if np.linalg.norm(origin - D[idx]) < eps
    ]

## Other functions

In [None]:
def make_dfa(df):
    """Aggregate ``df`` per state and build the normalised feature matrix.

    Returns ``(dfa, df_1)``: ``df_1`` is the per-state sum with an added
    "Prod/Area" column, and ``dfa`` is the normalised array of what remains
    after dropping "State_Name", "Area" and "Production".
    """
    df_1 = df.groupby('State_Name', as_index=False).sum()
    df_1["Prod/Area"] = df_1["Production"] / df_1["Area"]
    features = df_1.drop(columns=["State_Name", "Area", "Production"])
    # Only the normalised data is needed here; mean and std are discarded.
    dfa, _mu, _sigma = normalize_data(np.array(features))
    return dfa, df_1

In [None]:
def plot_graph(k, dfa):
    """Run k-means with ``k`` clusters and plot the first feature of each cluster.

    Each cluster's first-feature values are drawn as stars against their
    position within the cluster. Returns the cluster dictionary produced by
    ``k_means`` (cluster id ``1..k`` -> member rows).
    """
    output, _ = k_means(k, dfa, 300)
    # The original list was missing a comma ('green' 'yellow' concatenates to
    # 'greenyellow'), leaving only three colours.
    color = ['red', 'blue', 'green', 'yellow']
    for i in range(k):
        plt.plot(
            output[i+1][:, 0],
            "*",
            c=color[i % len(color)],       # cycle so any k is supported
            label='cluster' + str(i + 1),  # labels generated for any k
        )
    plt.xlabel('0')
    plt.legend()
    plt.show()
    return output

In [None]:
def make_clusters_k(k, output, dfa, df):
    """Translate k-means clusters back into lists of state names.

    Parameters
    ----------
    k : int
        Number of clusters in ``output``.
    output : dict
        Maps cluster id ``1..k`` to an array of member rows.
    dfa : array-like of shape (m, n)
        Feature matrix whose row ``z`` corresponds to ``df.iloc[z]``.
    df : pandas.DataFrame
        Frame with a "State_Name" column, aligned row-for-row with ``dfa``.

    Returns
    -------
    dict
        Maps cluster id to the list of state names in that cluster. Keys 1 and
        2 are always present.

    Notes
    -----
    The function is self-contained: it no longer reads the notebook-level
    ``m`` variable or appends to module-level ``cluster_1`` / ``cluster_2``
    lists, so stale entries from a previous call cannot leak in.
    """
    clusters = {c: [] for c in range(1, max(k, 2) + 1)}
    for z in range(len(dfa)):
        row = np.asarray(dfa[z])
        for c in range(1, k + 1):
            members = np.asarray(output[c])
            # Exact row equality: a zero *sum* of differences is not enough
            # once rows have more than one feature.
            if len(members) and np.any(np.all(members == row, axis=1)):
                clusters[c].append(df.iloc[z]["State_Name"])
                break  # each row belongs to exactly one cluster

    return clusters

In [None]:
def make_clusters_k3(k, output, dfa, df):
    """Translate k-means clusters (typically ``k == 3``) into lists of state names.

    Parameters
    ----------
    k : int
        Number of clusters in ``output``.
    output : dict
        Maps cluster id ``1..k`` to an array of member rows.
    dfa : array-like of shape (m, n)
        Feature matrix whose row ``z`` corresponds to ``df.iloc[z]``.
    df : pandas.DataFrame
        Frame with a "State_Name" column, aligned row-for-row with ``dfa``.

    Returns
    -------
    dict
        Maps cluster id to the list of state names in that cluster. Keys 1 and
        2 are always present; key 3 (and above) exists whenever ``k`` asks for
        it, which the original two-key dictionary did not provide.

    Notes
    -----
    The function is self-contained: it does not read the notebook-level ``m``
    variable or module-level ``cluster_1`` / ``cluster_2`` / ``cluster_3`` lists.
    """
    clusters = {c: [] for c in range(1, max(k, 2) + 1)}
    for z in range(len(dfa)):
        row = np.asarray(dfa[z])
        for c in range(1, k + 1):
            members = np.asarray(output[c])
            # Exact row equality: a zero *sum* of differences is not enough
            # once rows have more than one feature.
            if len(members) and np.any(np.all(members == row, axis=1)):
                clusters[c].append(df.iloc[z]["State_Name"])
                break  # each row belongs to exactly one cluster

    return clusters

In [None]:
def make_map(clusters, season):
    """Plot the clustered states on a geographic scatter map.

    Parameters
    ----------
    clusters : dict
        Maps a cluster id to a list of state names.
    season : object
        Label inserted into the figure title via ``str(season)``.

    Reads "states.csv" (columns "State", "Latitude", "Longitude" are used),
    tags each state with its cluster id by case-insensitive name match, drops
    rows with any missing value, and shows the figure.
    """
    df_loc = pd.read_csv("states.csv")

    # Build a lower-cased name -> cluster id lookup. Iterating the ids in
    # sorted order keeps "highest id wins" for a name listed more than once.
    cluster_of = {}
    for cluster_id in sorted(clusters):
        for name in clusters[cluster_id]:
            cluster_of[name.lower()] = cluster_id

    # DataFrame.set_value no longer exists in current pandas; a vectorised
    # map assigns the whole column at once. Unmatched states become NaN.
    df_loc["Cluster_db"] = df_loc["State"].str.lower().map(cluster_of)

    df_loc = df_loc.dropna()
    fig = px.scatter_geo(df_loc,
                       lat="Latitude",
                       lon="Longitude",
                       color="Cluster_db")

    fig.update_layout(title="States "+str(season)+ " o/p") 

    fig.show()

In [None]:
def normalize_data(data):
    """Standardise ``data`` column-wise to zero mean and unit variance.

    Parameters
    ----------
    data : numpy.ndarray
        Samples along axis 0.

    Returns
    -------
    data_norm : numpy.ndarray
        ``(data - mu) / sigma``; a column with zero standard deviation is
        returned as all zeros instead of NaN/inf.
    mu : numpy.ndarray
        Per-column mean.
    sigma : numpy.ndarray
        Per-column (population) standard deviation, reported unmodified.
    """
    mu = np.mean(data, 0)
    sigma = np.std(data, 0)
    # Guard against division by zero for constant columns.
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    data_norm = (data - mu) / safe_sigma
    return data_norm, mu, sigma

# Kharif Season

In [None]:
# Select the Kharif rows. Comparing the stripped label avoids depending on the
# exact width of the trailing padding stored in the "Season" column.
df_2014_kharif = df_2014[df_2014["Season"].str.strip() == "Kharif"]
df_2014_kharif.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_kharif = df_2014_kharif.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_kharif

In [None]:
# Aggregate per state: dfa_kharif is the normalised feature matrix,
# df_kharif the per-state frame it was derived from.
dfa_kharif, df_kharif = make_dfa(df_2014_kharif)
dfa_kharif

In [None]:
# Row count (m) and feature count (n) of the Kharif matrix, kept as notebook-level variables.
m, n = dfa_kharif.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_kharif)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_kharif)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_kharif, df_kharif)
clusters

In [None]:
# Show the k-means clusters for the Kharif season on the map of states.
make_map(clusters, "Kharif")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=3 and MinPts=0.15.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_kharif, 3, 0.15)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_kharif.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the Kharif season on the map of states.
make_map(clusters_db, "Kharif")

The results given by both K means and DBSCAN are the same, so we can proceed with either result

# Rabi Season

In [None]:
# Select the Rabi rows. Comparing the stripped label avoids depending on the
# exact width of the trailing padding stored in the "Season" column.
df_2014_rabi = df_2014[df_2014["Season"].str.strip() == "Rabi"]
df_2014_rabi.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_rabi = df_2014_rabi.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_rabi

In [None]:
# Aggregate per state: dfa_rabi is the normalised feature matrix,
# df_rabi the per-state frame it was derived from.
dfa_rabi, df_rabi = make_dfa(df_2014_rabi)
dfa_rabi

In [None]:
# Row count (m) and feature count (n) of the Rabi matrix, kept as notebook-level variables.
m, n = dfa_rabi.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_rabi)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_rabi)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_rabi, df_rabi)
clusters

In [None]:
# Show the k-means clusters for the Rabi season on the map of states.
make_map(clusters, "Rabi")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=3 and MinPts=0.15.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_rabi, 3, 0.15)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_rabi.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the Rabi season on the map of states.
make_map(clusters_db, "Rabi")

Even here, we get the same result from both K means and DBSCAN

# Autumn Season

In [None]:
# Select the Autumn rows. Comparing the stripped label avoids depending on the
# exact width of the trailing padding stored in the "Season" column.
df_2014_aut = df_2014[df_2014["Season"].str.strip() == "Autumn"]
df_2014_aut.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_aut = df_2014_aut.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_aut

In [None]:
# Aggregate per state: dfa_aut is the normalised feature matrix,
# df_aut the per-state frame it was derived from (displayed here).
dfa_aut, df_aut = make_dfa(df_2014_aut)
df_aut

In [None]:
# Row count (m) and feature count (n) of the Autumn matrix, kept as notebook-level variables.
m, n = dfa_aut.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_aut)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_aut)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_aut, df_aut)
clusters

In [None]:
# Show the k-means clusters for the Autumn season on the map of states.
make_map(clusters, "Autumn")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=1 and MinPts=0.08.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_aut, 1, 0.08)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_aut.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the Autumn season on the map of states.
make_map(clusters_db, "Autumn")

Even here, we get the same result from both K means and DBSCAN

# Summer Season

In [None]:
# Select the Summer rows. Comparing the stripped label avoids depending on the
# exact width of the trailing padding stored in the "Season" column.
df_2014_sum = df_2014[df_2014["Season"].str.strip() == "Summer"]
df_2014_sum.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_sum = df_2014_sum.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_sum

In [None]:
# Aggregate per state: dfa_sum is the normalised feature matrix,
# df_sum the per-state frame it was derived from.
dfa_sum, df_sum = make_dfa(df_2014_sum)
dfa_sum

In [None]:
# Row count (m) and feature count (n) of the Summer matrix, kept as notebook-level variables.
m, n = dfa_sum.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_sum)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_sum)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_sum, df_sum)
clusters

In [None]:
# Show the k-means clusters for the Summer season on the map of states.
make_map(clusters, "Summer")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=1 and MinPts=0.1.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_sum, 1, 0.1)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_sum.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the Summer season on the map of states.
make_map(clusters_db, "Summer")

K-means and DBSCAN give the same result for the Summer season.

# Winter Season

In [None]:
# Select the Winter rows. Comparing the stripped label avoids depending on the
# exact width of the trailing padding stored in the "Season" column.
df_2014_win = df_2014[df_2014["Season"].str.strip() == "Winter"]
df_2014_win.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_win = df_2014_win.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_win

In [None]:
# Aggregate per state: dfa_win is the normalised feature matrix,
# df_win the per-state frame it was derived from (displayed here).
dfa_win, df_win = make_dfa(df_2014_win)
df_win

In [None]:
# Row count (m) and feature count (n) of the Winter matrix, kept as notebook-level variables.
m, n = dfa_win.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_win)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_win)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_win, df_win)
clusters

In [None]:
# Show the k-means clusters for the Winter season on the map of states.
make_map(clusters, "Winter")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=1 and MinPts=0.08.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_win, 1, 0.08)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_win.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the Winter season on the map of states.
make_map(clusters_db, "Winter")

Here DBSCAN appears to do a better job, so we'll choose that over the output of K means

# Whole Year

In [None]:
# Select the Whole Year rows. Comparing the stripped label avoids depending on
# the exact width of the trailing padding stored in the "Season" column.
df_2014_year = df_2014[df_2014["Season"].str.strip() == "Whole Year"]
df_2014_year.head()

In [None]:
# Discard the district, year, season and crop columns before aggregating per state.
df_2014_year = df_2014_year.drop(columns=["District_Name", "Crop_Year", "Season", "Crop"])
df_2014_year

In [None]:
# Aggregate per state: dfa_year is the normalised feature matrix,
# df_year the per-state frame it was derived from (displayed here).
dfa_year, df_year = make_dfa(df_2014_year)
df_year

In [None]:
# Row count (m) and feature count (n) of the Whole Year matrix, kept as notebook-level variables.
m, n = dfa_year.shape

In [None]:
# Reset the score table, fill it for each candidate k, then plot cost against k.
k_scores = {}
make_elbow_graph(dfa_year)
sns.lineplot(x=list(k_scores), y=list(k_scores.values()))

In [None]:
# Run k-means with k=2, plot the clusters, and keep the cluster arrays for the next step.
output = plot_graph(2, dfa_year)

In [None]:
# Start from empty index lists, then resolve the k-means clusters to state names.
cluster_1, cluster_2 = [], []
clusters = make_clusters_k(2, output, dfa_year, df_year)
clusters

In [None]:
# Show the k-means clusters for the whole-year data on the map of states.
make_map(clusters, "Whole Year")

In [None]:
# dbscan's parameters are (D, eps, MinPts), so this call runs with eps=2 and MinPts=0.06.
# NOTE(review): an integer radius with a fractional point count looks swapped — confirm intent.
cluster_labels = dbscan(dfa_year, 2, 0.06)
cluster_labels

In [None]:
# Group state names by DBSCAN label. Keys 1 and 2 are pre-seeded as before;
# further labels get a bucket on demand instead of raising KeyError.
clusters_db = {1: [], 2: []}
for i, x in enumerate(cluster_labels):
    if x == -1:
        continue  # -1 marks a noise point, which belongs to no cluster
    clusters_db.setdefault(x, []).append(df_year.iloc[i]["State_Name"])
clusters_db

In [None]:
# Show the DBSCAN clusters for the whole-year data on the map of states.
make_map(clusters_db, "Whole Year")

K-means and DBSCAN produce the same output for the whole-year data.