In [3]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from scipy.stats import probplot
from scipy.stats import zscore

In [4]:
df = pd.read_csv("../waters_datasets/additional_datasets/wtrqlty_2008.csv", encoding= 'unicode_escape')
df["STATION CODE"]=df["STATION CODE"].astype(object)
df.shape


KeyError: 'STATION CODE'

In [195]:
# Converting object data type to numeric
def convert_to_numeric(df):
    num_col = df.shape[1]
    # Start from index 3
    for index in range(3, num_col):
        col_name = df.iloc[:, index].name
        df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
    return df

df = convert_to_numeric(df)
df.dtypes

STATION CODE     object
LOCATIONS        object
STATE            object
Temp            float64
DO              float64
PH              float64
Conductivity    float64
BOD             float64
NI              float64
Fec_col         float64
Tot_col         float64
dtype: object

In [196]:
# Replacing string NAN values with actual NAN value (np.nan)
def convert_to_nan(df):
    n_col = df.shape[1]
    for index in range(n_col):
        df.iloc[:, index]  = df.iloc[:, index].replace("NAN", np.nan)
    return df

df = convert_to_nan(df)
# Checking for missing values
df.isnull().sum().sort_values()


LOCATIONS       143
STATE           143
STATION CODE    144
PH              175
Conductivity    208
Temp            232
BOD             335
NI              402
Tot_col         421
Fec_col         527
DO              674
dtype: int64

In [197]:
df_num = df.select_dtypes(exclude="object")
df_num_col = df_num.columns
imputer = SimpleImputer(strategy="median")

df_num = imputer.fit_transform(df_num)
df_num = pd.DataFrame(df_num, columns=df_num_col)


df_cat = df.select_dtypes(include="object")
df_Concat= pd.concat([df_cat,df_num],axis=1)

df_null = df_Concat[(df_Concat["STATION CODE"].isnull()) & (df_Concat["LOCATIONS"].isnull()) & (df_Concat["STATE"].isnull())]
df_null_indices = df_null.index
df_Concat.drop(df_null_indices, axis=0, inplace=True)


df_Concat.to_csv("Mer.csv",index = False)
df_Concat.dtypes

STATION CODE     object
LOCATIONS        object
STATE            object
Temp            float64
DO              float64
PH              float64
Conductivity    float64
BOD             float64
NI              float64
Fec_col         float64
Tot_col         float64
dtype: object

In [198]:

ndf=pd.read_csv("Mer.csv")
ndf.dtypes

STATION CODE    float64
LOCATIONS        object
STATE            object
Temp            float64
DO              float64
PH              float64
Conductivity    float64
BOD             float64
NI              float64
Fec_col         float64
Tot_col         float64
dtype: object

In [199]:
ndf["STATION CODE"]=ndf["STATION CODE"].astype(object)

# Using Z-Score Normalization to detect outliers
df_num = ndf.select_dtypes(exclude="object")

df_num_norm = df_num.apply(zscore, axis=0)


def indices_of_greater_than_3(df_norm):
    indices_arr = []
    n_col = df_norm.shape[1]
    for index in range(n_col):
        col_index = df_norm.iloc[: ,index]
        greater_than_3 = df_norm[col_index > 3]
        greater_than_3_index = greater_than_3.index
        indices_arr.extend(greater_than_3_index)
    return indices_arr

indices_arr = indices_of_greater_than_3(df_num_norm)
print("Number of outliers using Z-Score method-",len(indices_arr))

Number of outliers using Z-Score method- 118


In [200]:
ndf.drop(indices_arr, axis=0, inplace=True)
ndf.shape

(1600, 11)

In [201]:

# Calculating Water Quality Index of each sample
df_num = ndf.select_dtypes(exclude="object")
# Dropping year and Temp attribute because they are not used for computing WQI
df_num.drop(["Temp"], axis=1, inplace=True)

# Weight Vector(wi)
wi = np.array([0.2213, 0.2604, 0.0022, 0.4426, 0.0492, 0.0221, 0.0022])

# Standard values of parameters(si)
si = np.array([10, 8.5, 1000, 5, 45, 100, 1000])

# Ideal values of paramters(vIdeal)
vIdeal = np.array([14.6, 7, 0, 0, 0, 0, 0])

def calc_wqi(sample):
    wqi_sample = 0
    wqi_w=0
    num_col = 7
    for index in range(num_col):
        v_index = sample[index] # Obeserved value of sample at index
        v_index_ideal = vIdeal[index] # Ideal value of obeserved value
        w_index = wi[index] # weight of corresponding parameter of obeserved value
        
        std_index = si[index] # Standard value recommended for obeserved value
        q_index = (v_index - v_index_ideal) / (std_index - v_index_ideal)
        q_index = q_index * 100 # Final qi value of obeserved value
        wqi_sample += ((q_index*w_index))
        wqi_w+=w_index
        
    return wqi_sample/wqi_w

def calc_wqi_for_df(df):
    wqi_arr = []
    for index in range(df.shape[0]):
        index_row = df.iloc[index, :]
        wqi_row = calc_wqi(index_row)
        wqi_arr.append(wqi_row)
    return wqi_arr

wqi_arr = calc_wqi_for_df(df_num)
# Converting oridnary array to numpy array
wqi_arr = np.array(wqi_arr)
wqi_arr = np.reshape(wqi_arr, (-1, 1))

# Resetting index values of the dataframes
wqi_arr_df = pd.DataFrame(wqi_arr, columns=["WQI"]).reset_index()
df = ndf.reset_index()

df_wqi = pd.concat([df, pd.DataFrame(wqi_arr, columns=["WQI"])], axis=1)
df_wqi.drop("index", axis=1, inplace=True)
df_wqi.shape

# Removing the samples with negative WQI
df_neg_indices = df_wqi[(df_wqi["WQI"] < 0)].index
df_wqi.drop(df_neg_indices, axis=0, inplace=True)
df_wqi.tail()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,DO,PH,Conductivity,BOD,NI,Fec_col,Tot_col,WQI
1595,1498.0,YAMUNA AT BATESWAR,UTTAR PRADESH,24.4,8.8,8.0,1019.0,12.2,0.79,83727.0,694091.0,2156.634717
1596,1127.0,YAMUNA AT ETAWAH,UTTAR PRADESH,23.2,9.6,8.0,1006.0,11.2,0.79,80018.0,824455.0,2090.642341
1597,1499.0,YAMUNA AT JUHIKA B/C WITH CHANBAL- ETAWAH,UTTAR PRADESH,23.3,10.5,8.0,623.0,4.2,0.79,193909.0,1349364.0,4656.735379
1598,2283.0,YAMUNA AT HAMIRPUR,UTTAR PRADESH,25.3,6.6,7.8,509.0,2.0,0.79,49050.0,782500.0,1326.43231
1599,1069.0,YAMUNA AT ALLAHABAD DOWNSTREAM (BALUA GHAT),UTTAR PRADESH,25.5,7.9,8.2,563.0,2.2,2.33,2250.0,3925.0,123.506333


In [206]:
df_null = df_wqi[(df_wqi["STATION CODE"].isnull())]
df_null_indices = df_null.index
df_wqi.drop(df_null_indices, axis=0, inplace=True)

df_wqi["WQI clf"] = df_wqi["WQI"].apply(lambda x: (3 if (x <= 25)  
                                        else(2 if (26<=x<=50) 
                                        else(1 if (51<=x<=75) 
                                        else 0))))


df_wqi.tail()

df_wqi.to_csv("Final.csv")

In [207]:
df_wqi.shape


(1598, 13)