In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',None)
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sklearn.metrics
import math
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

: 

In [None]:
# Read the three yearly ground water quality CSV files (2018, 2019, 2020)
# into one DataFrame per year.
data2018, data2019, data2020 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ground_water_quality_2018_post.csv",
        "ground_water_quality_2019_post.csv",
        "ground_water_quality_2020_post.csv",
    )
)

: 

In [None]:
# Display the (rows, columns) dimensions of the 2018 frame.
data2018.shape

: 

In [None]:
# Number of missing values in each column of the 2018 frame.
data2018.isna().sum()

: 

In [None]:
# Number of missing values in each column of the 2019 frame.
data2019.isna().sum()

: 

In [None]:
# Number of missing values in each column of the 2020 frame.
data2020.isna().sum()

: 

In [None]:
# Remove the "Unnamed: 8" column from the 2020 frame.
data2020 = data2020.drop(columns=["Unnamed: 8"])

: 

In [None]:
# Old 2019 column header -> new column header.
column_name_mappings = {
    'CO_-2 ': 'CO3',
    'HCO_ - ': 'HCO3',
    'Cl -': 'Cl',
    'F -': 'F',
    'NO3- ': 'NO3 ',
    'SO4-2': 'SO4',
    'Na+': 'Na',
    'K+': 'K',
    'Ca+2': 'Ca',
    'Mg+2': 'Mg',
    'EC': 'E.C',
}
data2019 = data2019.rename(columns=column_name_mappings)

: 

In [None]:
# Print the column names, non-null counts and dtypes of the 2019 frame.
data2019.info()

: 

In [None]:
def _median_gwl_per_district(frame):
    """Return the median of the 'gwl' column for each 'district' in `frame`."""
    return frame.groupby('district')['gwl'].median()


# One Series of district medians per year, indexed by district.
median_groundwater_by_district_2018 = _median_gwl_per_district(data2018)
median_groundwater_by_district_2019 = _median_gwl_per_district(data2019)
median_groundwater_by_district_2020 = _median_gwl_per_district(data2020)

: 

In [None]:
def impute_missing_with_median(row, district_medians=None):
    """Return the row's 'gwl', or its district median when 'gwl' is missing.

    Parameters
    ----------
    row : pandas.Series
        One record containing at least the 'gwl' and 'district' fields.
    district_medians : pandas.Series, optional
        Median 'gwl' indexed by district for the year the row belongs to.
        When omitted, the 2019 medians are used so that existing
        one-argument calls keep their previous behavior.

    Returns
    -------
    The original 'gwl' value when present; otherwise the median for the
    row's district, or NaN when that district has no entry in
    `district_medians`.
    """
    if district_medians is None:
        district_medians = median_groundwater_by_district_2019
    if pd.isnull(row['gwl']):
        # .get avoids a KeyError for a district with no median entry.
        return district_medians.get(row['district'], np.nan)
    return row['gwl']


# Fill each year from its OWN district medians. Previously the 2019 medians
# were used for all three frames.
data2018['gwl'] = data2018.apply(
    impute_missing_with_median,
    axis=1,
    district_medians=median_groundwater_by_district_2018,
)
data2019['gwl'] = data2019.apply(
    impute_missing_with_median,
    axis=1,
    district_medians=median_groundwater_by_district_2019,
)
data2020['gwl'] = data2020.apply(
    impute_missing_with_median,
    axis=1,
    district_medians=median_groundwater_by_district_2020,
)

: 

In [None]:
# Rows of the 2019 frame with no CO3 value, counted per district.
co3_is_missing = data2019['CO3'].isna()
missing2019 = data2019.loc[co3_is_missing]
missing2019['district'].value_counts()

: 

In [None]:
# Take an explicit copy of the pH / CO3 columns: df_missing is written to with
# .loc further down, and assigning into a slice of data2019 is ambiguous
# (pandas emits SettingWithCopyWarning for it).
df_missing = data2019[['pH', 'CO3']].copy()

# Rows where both pH and CO3 are present.
df_subset = df_missing.dropna(subset=['CO3', 'pH'])

: 

In [None]:
# Inputs for the CO3-from-pH regression: pH as an (n, 1) feature array and
# CO3 as the 1-D target.
X_train = df_subset['pH'].to_numpy().reshape(-1, 1)
y_train = df_subset['CO3'].to_numpy()

: 

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on X_train / y_train (fit returns the estimator).
model = LinearRegression().fit(X_train, y_train)
model

: 

In [None]:
# Rows that need a CO3 estimate: CO3 is missing AND pH is available.
# Rows without pH are skipped because the model cannot take NaN as input.
co3_fill_mask = df_missing['CO3'].isnull() & df_missing['pH'].notnull()
X_missing_values = df_missing.loc[co3_fill_mask, 'pH'].values.reshape(-1, 1)

if len(X_missing_values) > 0:
    # Predict the missing values
    predicted_missing_values = model.predict(X_missing_values)

    # Write the predictions into exactly the rows they were computed for
    df_missing.loc[co3_fill_mask, 'CO3'] = predicted_missing_values
else:
    # Nothing to fill; predict() would raise on an empty array.
    predicted_missing_values = np.array([])

: 

In [None]:
# Copy the CO3 column from df_missing back into the 2019 frame
# (column assignment from a Series aligns on the row index).
data2019['CO3']=df_missing['CO3']

: 

In [None]:
# Re-check: rows of the 2019 frame that still have no CO3 value, per district.
co3_is_missing = data2019['CO3'].isna()
missing2019 = data2019.loc[co3_is_missing]
missing2019['district'].value_counts()

: 

In [None]:
# Stack the three yearly frames. ignore_index is not set, so every frame keeps
# its own row labels and labels repeat across years in the result.
yearly_frames = [data2018, data2019, data2020]
water_data = pd.concat(yearly_frames)

# Save the combined frame without the index column.
water_data.to_csv('cleaned_ground_water_data.csv', index=False)

: 

In [None]:
# water_data is a concat of the yearly frames with their original row labels,
# so label 261 can match one row PER YEAR. A plain .loc[261, 'pH'] = 8.05
# would overwrite all of them; restrict the manual correction to the row(s) at
# label 261 whose pH is present but not parseable as a number.
# NOTE(review): assumes the 8.05 correction targets a non-numeric pH entry
# (the float cast below suggests so) — confirm against the raw CSVs.
ph_as_number = pd.to_numeric(water_data['pH'], errors='coerce')
ph_unparseable = (ph_as_number.isna() & water_data['pH'].notna()).to_numpy()
needs_manual_fix = ph_unparseable & (water_data.index == 261)
water_data.loc[needs_manual_fix, 'pH'] = 8.05

# Cast the whole column to float; raises if any other entry is non-numeric.
water_data['pH'] = water_data['pH'].astype(float)

: 

In [None]:
# Columns removed from water_data before modelling.
Columns_to_drop=['sno', 'district', 'mandal', 'village', 'lat_gis', 'long_gis','season','RSC  meq  / L','Classification.1']

# drop(..., inplace=True) returns None, so the previous
# `Water_data = water_data.drop(..., inplace=True)` bound None to a
# differently-cased name. Rebind water_data to the returned frame instead.
water_data = water_data.drop(columns=Columns_to_drop)

: 

In [None]:
# Preview the first rows of the combined frame.
water_data.head()

: 

In [None]:
# Number of rows per value of the 'Classification' column.
water_data['Classification'].value_counts()

: 

In [None]:
# Discard rows whose 'Classification' is one of the excluded labels.
excluded_classes = ['C3S4', 'C2S2', 'OG', 'O.G']
is_excluded = water_data['Classification'].isin(excluded_classes)
water_data = water_data.loc[~is_excluded]

: 

In [None]:
# Stratified 70/30 train/test split: 'Classification' is the target and every
# other column is a feature.
y = water_data['Classification']
X = water_data.drop(columns='Classification')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
    stratify=y,
)

: 

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

: 

In [None]:
# Project the scaled features onto 12 principal components; the projection is
# fitted on the training split and reused for the test split.
n_pca_components = 12
pca = PCA(n_components=n_pca_components)
X_train_pca = pca.fit_transform(X_train_normalized)
X_test_pca = pca.transform(X_test_normalized)

: 

In [None]:
# Fraction of the variance explained by each retained principal component.
pca.explained_variance_ratio_

: 

In [None]:
# Total fraction of the variance explained by the retained components.
sum(pca.explained_variance_ratio_)

: 

In [None]:
# Map the class labels to integer codes; the mapping is learned from the
# training labels and reused for the test labels.
encoder = LabelEncoder().fit(y_train)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

: 

In [None]:
# Final model: an XGBoost classifier trained on the PCA features.
xgb_params = dict(
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)
best_model = xgb.XGBClassifier(**xgb_params)
best_model.fit(X_train_pca, y_train_encoded)

# Score the fitted model on the held-out test split.
y_test_pred = best_model.predict(X_test_pca)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
print(f"\nFinal Model - Test Accuracy: {test_accuracy:.4f}")

: 

In [None]:
import pickle
# Serialize the trained classifier to disk. Only best_model is written; the
# scaler, PCA and label encoder are not part of this file.
with open('WaterQualityClassification.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

: 

In [None]:
# encoder.classes_ holds the class labels; a label's position in that array
# is its integer code.
encoded_classes = encoder.classes_

for i in range(len(encoded_classes)):
    class_label = encoded_classes[i]
    print(f"Class Label: {class_label} - Encoded Value: {i}")

: 

In [None]:
def prediction(data):
    """Classify one water sample, print its class with a usage note, and return the class.

    The sample is passed through the module-level `scaler` and `pca`, scored
    by the model stored in 'WaterQualityClassification.pkl', and the integer
    prediction is decoded with the module-level `encoder`.

    Parameters
    ----------
    data : sequence of numbers
        Feature values of a single sample. They are reshaped to one row and
        handed to `scaler.transform`, so their count and order must match
        what the scaler was fitted on.

    Returns
    -------
    str
        The predicted class label. (Earlier versions returned None and only
        printed; the printed line is still produced.)
    """
    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # that this notebook wrote itself.
    with open('WaterQualityClassification.pkl', 'rb') as file:
        loaded_model = pickle.load(file)

    new_data=np.array(data)
    new_data_normalised=scaler.transform(new_data.reshape(1,-1))
    new_data_pca=pca.transform(new_data_normalised)
    predicted_class = loaded_model.predict(new_data_pca)

    encoded_class=predicted_class
    original_class=encoder.inverse_transform(encoded_class)
    Class_Description={'C1S1':"Low salinity and low sodium waters are good for irrigation and can be used with most crops with no restriction on use on most of the soils. ",
                   'C2S1':"Medium salinity and low sodium waters are good for irrigation and can be used on all most all soils with little danger of development of harmful levels of exchangeable sodium if a moderate amount of leaching occurs. Crops can be grown without any special consideration for salinity control. ",
                   'C3S1':"The high salinity and low sodium waters require good drainage. Crops with good salt tolerance should be selected.",
                  'C3S2':"The high salinity and medium sodium waters require good drainage and can be used on coarse - textured or organic soils having good permeability. ",
                  'C3S3':"These high salinity and high sodium waters require special soil management, good drainage, high leaching and organic matter additions. Gypsum amendments make feasible the use of these waters. ",
                  'C4S1':"Very high salinity and low sodium waters are not suitable for irrigation unless the soil must be permeable and drainage must be adequate. Irrigation waters must be applied in excess to provide considerable leaching. Salt tolerant crops must be selected. ",
                  'C4S2':"Very high salinity and medium sodium waters are not suitable for irrigation on fine textured soils and low leaching conditions and can be used for irrigation on coarse textured or organic soils having good permeability. ",
                  'C4S3':"Very high salinity and high sodium waters produce harmful levels of exchangeable sodium in most soils and will require special soil management, good drainage, high leaching, and organic matter additions. The Gypsum amendment makes feasible the use of these waters. ",
                       'C4S4':"Very high salinity and very high sodium waters are generally unsuitable for irrigation purposes. These are sodium chloride types of water and can cause sodium hazards. It can be used on coarse-textured soils with very good drainage for very high salt tolerant crops. Gypsum amendments make feasible the use of these waters. "
                  }

    predicted_label = str(original_class[0])
    if predicted_label in Class_Description:
        print(predicted_label+" "+Class_Description[predicted_label])
    else:
        # Previously a label without a description produced no output at all.
        print(predicted_label+" (no description available for this class)")
    return predicted_label

: 

In [None]:
# One hand-written sample (16 feature values) run through prediction().
test_data = [
    5.66, 8.02, 420, 275.3,
    0.0, 153.3, 12, 0.55,
    11.200, 37, 25, 1.0,
    33.0, 20.5, 160.3, 0.938,
]
prediction(test_data)

: 

In [None]:
# Test-set metrics for the XGBoost predictions in y_test_pred.
# `acc_lr` keeps its historical name so later references still work, but the
# score belongs to the XGBoost model, not a logistic regression.
acc_lr = accuracy_score(y_test_encoded, y_test_pred)
conf = confusion_matrix(y_test_encoded, y_test_pred)
clf_report = classification_report(y_test_encoded, y_test_pred)
print(f"Accuracy Score of XGBoost Classifier is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

: 

: 