In [None]:
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv("Ground_Water_Data\\ground_water_quality_in_assam-2014.csv",
                encoding = "unicode_escape")
data.head(2)


In [None]:
values = data.drop(["STATION CODE","LOCATIONS","STATE"],axis =1)
values.head()

In [None]:
corr = values.corr()
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(20,20))
g = sns.heatmap(corr,
            vmin=-1,
            cmap='coolwarm',
            annot=True,
            mask = mask);
plt.savefig("corr.png")

As is obvious from the given correlation graph, we see that the features are quiet independent of each other, barring the ones that desciribe the same variable. By performing PCA, we will be able to condense the information given to us into fewer meaningful variables.

Now, we'll be performing PCA on all of the data.

# Data Preprocessing - Principal Component Analysis
Principal component analysis (PCA) is a statistical procedure which converts a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components..<br><br>
In this notebook, we use PCA to reduce the dimensions of the ground water dataset.<br>
We start by importing the necessary libraries and loading the dataset into the notebook.

In [1]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv("Ground_Water_Data\ground_water_quality_in_kerala-2014.csv",
                encoding = "unicode_escape")
df.head()

Unnamed: 0,STATION CODE,LOCATIONS,STATE,TEMPERATURE ºC : Min,TEMPERATURE ºC : Max,TEMPERATURE ºC : Mean,pH : Min : 6.5-8.5,pH : Max : 6.5-8.5,pH : Mean : 6.5-8.5,CONDUCTIVITY (µmhos/cm) : Min,...,B.O.D. (mg/l) : Mean : < 3 mg/l,NITRATE- N+ NITRITE-N (mg/l) : Min,NITRATE- N+ NITRITE-N (mg/l) : Max,NITRATE- N+ NITRITE-N (mg/l) : Mean,FECAL COLIFORM (MPN/100ml) : Min : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Max : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Mean : < 2500 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Min : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Max : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Mean : < 5000 MPN/100ml
0,19,"WELL AT ELOOR, KERALA",Kerala,27.4,27.5,27.5,5.2,7.2,6.2,137,...,1.0,1.96,3.31,2.6,79,220,150,110,630,370
1,22,"WELL AT CHUNGAPALLY, KERALA",Kerala,24.0,26.0,25.0,5.7,7.7,6.7,103,...,0.4,0.42,1.3,0.9,17,350,184,22,430,226
2,35,"WELL AT PUNALUR, KERALA",Kerala,24.0,24.0,24.0,4.8,7.2,6.0,170,...,0.6,0.81,0.84,0.8,43,280,162,110,430,270
3,1581,"PAPPANAMKODE, THIRUVANANTHAPURAM, KERALA",Kerala,25.0,26.5,25.8,6.0,6.8,6.4,296,...,1.3,0.0,2.3,1.2,79,110,95,220,350,285
4,1582,"NEDUMANGAD, THIRUVANANTHAPURAM, KERALA",Kerala,25.0,26.5,25.8,7.0,7.4,7.2,150,...,4.0,1.2,4.2,2.7,43,94,69,180,240,210


Only features on which it is meaningful to use PCA on are selected.

In [8]:
data = df.drop(['STATION CODE','LOCATIONS','STATE'], axis=1)
data.head()


Unnamed: 0,TEMPERATURE ºC : Min,TEMPERATURE ºC : Max,TEMPERATURE ºC : Mean,pH : Min : 6.5-8.5,pH : Max : 6.5-8.5,pH : Mean : 6.5-8.5,CONDUCTIVITY (µmhos/cm) : Min,CONDUCTIVITY (µmhos/cm) : Max,CONDUCTIVITY (µmhos/cm) : Mean,B.O.D. (mg/l) : Min : < 3 mg/l,...,B.O.D. (mg/l) : Mean : < 3 mg/l,NITRATE- N+ NITRITE-N (mg/l) : Min,NITRATE- N+ NITRITE-N (mg/l) : Max,NITRATE- N+ NITRITE-N (mg/l) : Mean,FECAL COLIFORM (MPN/100ml) : Min : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Max : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Mean : < 2500 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Min : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Max : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Mean : < 5000 MPN/100ml
0,27.4,27.5,27.5,5.2,7.2,6.2,137,180,159,0.8,...,1.0,1.96,3.31,2.6,79,220,150,110,630,370
1,24.0,26.0,25.0,5.7,7.7,6.7,103,125,114,0.2,...,0.4,0.42,1.3,0.9,17,350,184,22,430,226
2,24.0,24.0,24.0,4.8,7.2,6.0,170,220,195,0.6,...,0.6,0.81,0.84,0.8,43,280,162,110,430,270
3,25.0,26.5,25.8,6.0,6.8,6.4,296,412,354,1.0,...,1.3,0.0,2.3,1.2,79,110,95,220,350,285
4,25.0,26.5,25.8,7.0,7.4,7.2,150,170,160,1.0,...,4.0,1.2,4.2,2.7,43,94,69,180,240,210


##  Normalising the data -

In [9]:
#we implement normalisation using the z metric using the value (x-u)/sigma
def normalize_data(data):
    mu = np.mean(data, 0)
    sigma = np.std(data, 0)
    data_norm = (data-mu)/sigma
    return data_norm, mu, sigma

In [10]:

data_norm, mu, sigma = normalize_data(data)
data_norm.head()


Unnamed: 0,TEMPERATURE ºC : Min,TEMPERATURE ºC : Max,TEMPERATURE ºC : Mean,pH : Min : 6.5-8.5,pH : Max : 6.5-8.5,pH : Mean : 6.5-8.5,CONDUCTIVITY (µmhos/cm) : Min,CONDUCTIVITY (µmhos/cm) : Max,CONDUCTIVITY (µmhos/cm) : Mean,B.O.D. (mg/l) : Min : < 3 mg/l,...,B.O.D. (mg/l) : Mean : < 3 mg/l,NITRATE- N+ NITRITE-N (mg/l) : Min,NITRATE- N+ NITRITE-N (mg/l) : Max,NITRATE- N+ NITRITE-N (mg/l) : Mean,FECAL COLIFORM (MPN/100ml) : Min : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Max : < 2500 MPN/100ml,FECAL COLIFORM (MPN/100ml) : Mean : < 2500 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Min : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Max : < 5000 MPN/100ml,TOTAL COLIFORM (MPN/100ml) : Mean : < 5000 MPN/100ml
0,0.272599,-0.221863,0.031817,-0.852026,0.314506,-0.414531,-0.481964,-0.507795,-0.508453,0.254982,...,-0.184101,0.211863,-0.142137,-0.043102,0.042153,-0.15607,-0.111568,-0.318263,0.065824,-0.04157
1,-1.734561,-1.007629,-1.391577,-0.349094,0.999967,0.214668,-0.724098,-0.750344,-0.759723,-1.422965,...,-0.252015,-0.801208,-0.825714,-0.902175,-0.477486,0.221506,0.045715,-0.670423,-0.233175,-0.376871
2,-1.734561,-2.055316,-1.960935,-1.254372,0.314506,-0.66621,-0.246952,-0.331396,-0.307437,-0.304334,...,-0.229377,-0.544651,-0.982155,-0.952708,-0.259573,0.018195,-0.056056,-0.318263,-0.233175,-0.274418
3,-1.14422,-0.745707,-0.936091,-0.047335,-0.233863,-0.162851,0.650369,0.515318,0.580385,0.814298,...,-0.150144,-1.0775,-0.485626,-0.750574,0.042153,-0.475558,-0.365996,0.121938,-0.352775,-0.239491
4,-1.14422,-0.745707,-0.936091,0.95853,0.588691,0.843866,-0.389384,-0.551895,-0.502869,0.814298,...,0.15547,-0.288094,0.160542,0.007431,-0.259573,-0.522029,-0.486271,-0.038135,-0.517224,-0.414127


## PCA comprises of the following  steps - 
1) Calculate the covariance matrix<br> 
2) Obtain the eigenvectors and eigenvalues corresponding to the covariance matrix<br>
3) Using the eigenvectors correspongind to the k largest eigenvaleus to construct the principal components <br>
4) Project the data onto the principal components to get the compressed dataset<br>

## Step 1: Calculate the covariance matrix -

In [11]:
cov_mat = np.cov(data_norm.transpose())
print(cov_mat.shape)
#print(cov_mat)

(21, 21)


## Step 2: Compute eigenvectors and eigenvalues of the covariance matrix - 

In [12]:
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print(eig_vecs.shape)
#print(eig_vals)
#print(eig_vecs)

(21, 21)


## Step 3: Construct the principal components

To choose the number of eigenvectors we are using, we  can utilize a scree plot.<br>
For the other datasets, eigenvectors corresponding to eigenvalues greater than will be chosen.

In [13]:
index = np.arange(1,eig_vals.shape[0]+1)
import seaborn as sns
sns.pointplot(x=index, y=eig_vals).set_title("Scree Plot")

Text(0.5, 1.0, 'Scree Plot')

An elbow is seen at 6  components on the x-axis. Using 6 components, the explained variance is:

In [14]:
print(np.sum(eig_vals[0:6]/np.sum(eig_vals)))

0.901280265173478


In [15]:
print(np.sum(eig_vals>1)*1)

6


Making a matrix of only the first 6 eigenvectors

In [16]:
eig_vecs = eig_vecs[:,0:6]
eig_vecs.shape

(21, 6)

## Step 5: Project the data onto the new compressed space

In [17]:
Y_pca = data_norm.dot(eig_vecs)
print(data_norm.shape)

(34, 21)


As we can see, we have transformed the original dataset which was 34x21 to one which is 34x6!

For convenience the following function will be used to perform PCA on the data of the remaining states -

In [18]:
 def pca(data):
    data_norm,mu,sigma = normalize_data(data)
    cov_mat = np.cov(data_norm.transpose())
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    n = np.sum(eig_vals>1)*1

    
    eig_vecs = eig_vecs[:,0:n]
    Y_pca = data_norm.dot(eig_vecs)
    return Y_pca

In [19]:
pca(data_norm)

Unnamed: 0,0,1,2,3,4,5
0,-0.002517,0.886812,-0.159971,-0.239243,-0.446805,-0.183853
1,1.144131,1.972697,2.223529,0.453587,-0.450619,0.082444
2,1.123785,2.709839,2.107992,0.269,1.16308,0.725423
3,1.31399,0.353769,1.175282,0.564365,1.516171,0.582125
4,1.303213,0.598026,1.014326,-0.149687,0.588747,-1.76633
5,2.283585,5.015117,0.815366,0.00726,1.610989,2.504142
6,1.074414,-1.125584,1.303536,1.923,-2.046681,-0.704203
7,0.252157,-2.95861,-0.445071,-1.32963,0.600681,-1.814888
8,0.761869,0.324315,-1.832895,-2.582913,0.366297,0.836919
9,-1.029713,-0.977665,-1.565111,-3.989032,0.058247,-1.54417


Now, to perform PCA on the entire dataset

In [22]:
data = pd.read_csv("Ground_water_pre_processed\filled_groundwater.csv")

FileNotFoundError: [Errno 2] File b'Ground_water_pre_processed\x0cilled_groundwater.csv' does not exist: b'Ground_water_pre_processed\x0cilled_groundwater.csv'