In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import probplot
from scipy.stats import zscore

  import pandas.util.testing as tm


In [3]:
# Load the raw samples from the CSV export.
# NOTE(review): the displayed column names contain "Âµmhos", which suggests the
# file may really be UTF-8 rather than something needing "unicode_escape" --
# confirm against the source file before changing the encoding.
df = pd.read_csv("waterQ1.csv", encoding="unicode_escape")

In [4]:
# Calculating Water Quality Index of each sample
# Keep only the non-object (numeric) columns, then drop year and Temp because
# they are not used for computing WQI. The non-mutating drop() builds a new
# frame instead of modifying the select_dtypes() result in place, which is what
# raised pandas' "value is trying to be set on a copy" warning.
# NOTE(review): the remaining columns are read by position, so their order is
# assumed to match wi / si / vIdeal below -- confirm against the CSV header.
df_num_final = df.select_dtypes(exclude="object").drop(columns=["year", "Temp"])

# Weight vector (wi): one weight per parameter
wi = np.array([0.2213, 0.2604, 0.0022, 0.4426, 0.0492, 0.0221, 0.0022])

# Standard values of parameters (si)
si = np.array([10, 8.5, 1000, 5, 45, 100, 1000])

# Ideal values of parameters (vIdeal)
vIdeal = np.array([14.6, 7, 0, 0, 0, 0, 0])

def calc_wqi(sample, weights=None, standards=None, ideals=None):
    """Compute the weighted Water Quality Index (WQI) of a single sample.

    For each parameter ``i`` the quality rating is

        q_i = 100 * (v_i - ideal_i) / (standard_i - ideal_i)

    and the result is ``sum(q_i * w_i)``.

    Parameters
    ----------
    sample : sequence, numpy array or pandas Series
        Observed values, read strictly by position. Only the first
        ``len(weights)`` entries are used; extra trailing entries are ignored.
    weights, standards, ideals : array-like, optional
        Per-parameter weight, recommended standard value and ideal value.
        Each defaults to the module-level ``wi``, ``si`` and ``vIdeal``.

    Returns
    -------
    float
        The WQI of the sample.

    Raises
    ------
    IndexError
        If ``sample`` holds fewer values than there are weights.
    """
    # Fall back to the notebook-level vectors only when no override is given.
    weights = np.asarray(wi if weights is None else weights, dtype=float)
    standards = np.asarray(si if standards is None else standards, dtype=float)
    ideals = np.asarray(vIdeal if ideals is None else ideals, dtype=float)

    num_col = weights.shape[0]

    # Converting to an array reads the values by position whatever labels a
    # Series row carries; integer keys on a label-indexed Series (sample[0])
    # depend on a positional fallback that pandas has deprecated.
    observed = np.atleast_1d(np.asarray(sample, dtype=float))
    if observed.shape[0] < num_col:
        raise IndexError(
            f"sample has {observed.shape[0]} values but {num_col} are required"
        )
    observed = observed[:num_col]

    # Quality rating of every parameter, scaled to a 0-100 style score.
    q_values = (observed - ideals) / (standards - ideals)
    q_values = q_values * 100

    return float(np.sum(q_values * weights))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
# Computing WQI for the whole dataset
def calc_wqi_for_df(df):
    """Return a list holding the WQI of every row of ``df``, in row order.

    Each row is taken by position with ``iloc`` and handed to ``calc_wqi``.
    """
    row_count = df.shape[0]
    return [calc_wqi(df.iloc[row_pos, :]) for row_pos in range(row_count)]

In [6]:
# Compute the WQI of every sample, convert the ordinary list to a numpy array
# and shape it as a single column (n_samples, 1).
wqi_arr = np.array(calc_wqi_for_df(df_num_final)).reshape(-1, 1)

# Resetting index values of the dataframes.
# reset_index() without drop=True keeps the previous index as an extra column.
wqi_arr_df = pd.DataFrame(wqi_arr, columns=["WQI"]).reset_index()
df = df.reset_index()

In [7]:
# Put the WQI values next to the original columns, then discard the "index"
# column so only the data columns plus WQI remain.
df_wqi = pd.concat(
    [df, pd.DataFrame(wqi_arr, columns=["WQI"])],
    axis="columns",
).drop(columns="index")
df_wqi.shape

(1815, 13)

In [8]:
# Removing the samples with negative WQI
# Build a boolean mask first, turn it into the matching index labels, and drop
# those rows from df_wqi in place.
negative_wqi_mask = (df_wqi["WQI"] < 0).to_numpy()
df_neg_indices = df_wqi.index[negative_wqi_mask]
df_wqi.drop(index=df_neg_indices, inplace=True)

In [9]:
def _classify_wqi(x):
    """Map a WQI value to its quality label.

    Thresholds are contiguous: ``x <= 50`` is "Good Quality", ``50 < x <= 75``
    is "Poor Quality" and everything else is "Unsuitable". Values strictly
    between 50 and 51 previously matched neither of the first two ranges and
    were labelled "Unsuitable".
    """
    if x <= 50:
        return "Good Quality"
    if x <= 75:
        return "Poor Quality"
    # Also reached for NaN, because both comparisons above are False for NaN.
    return "Unsuitable"


df_wqi["WQI clf"] = df_wqi["WQI"].apply(_classify_wqi)

In [10]:
# Show the last five rows to eyeball the WQI and class columns.
df_wqi.tail(5)

Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (Âµmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year,WQI,WQI clf
1659,43,"MUVATTAPUZHA AT VETTIKKATTUMUKKU, KERALA",KERALA,27.0,7.2,6.76,59.8,1.692,0.449,127.0,357.0,2005.0,396.417246,0
1681,1092,"KALU AT ATALE VILLAGE, MAHARASHTRA",MAHARASHTRA,27.0,5.1,7.45,5168.8,9.875,1.81,7.0,210.0,2005.0,40539.125577,0
1755,17,"PERIYAR AT ALWAYE, KERALA",KERALA,28.0,6.4,1579.0,5.8,1.7,0.513,0.513,959.0,2003.0,29.92421,2
1756,18,"PERIYAR AT KALADY, KERALA",KERALA,27.0,6.8,62.0,7.5,1.5,0.221,0.221,892.0,2003.0,46.743616,2
1758,20,"CHALIYAR AT KOOLIMADU, KERALA",KERALA,28.0,6.9,62.0,6.8,0.4,0.21,0.21,802.0,2003.0,32.540965,2


In [11]:
# Save the frame, including the WQI columns, to disk; index=False leaves the
# row index out of the file.
df_wqi.to_csv("waterQ1(withWQI).csv", index=False)

In [12]:
# Read the exported file back. to_csv() wrote it with pandas' default UTF-8
# encoding, so it is decoded as UTF-8 here. "unicode_escape" would decode every
# byte as Latin-1 and interpret backslash sequences, so non-ASCII text (such as
# the conductivity column name) would not round-trip unchanged.
df2 = pd.read_csv("waterQ1(withWQI).csv", encoding="utf-8")