In [1]:
# Let's import the necessary libraries now; we will import others as needed.
import pandas as pd
import numpy as np
import warnings

In [2]:
## Let's disable warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the dataset from the CSV file into a pandas DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
data = pd.read_csv('water_potability.csv')

### Find Shape of Our Dataset (Number of Rows And Number of Columns)

In [4]:
# Check the shape of the dataset, i.e. the number of rows and columns
# available in it (shown as a (rows, columns) tuple).
data.shape

(3276, 10)

### Get Information About Our Dataset: Total Number of Rows, Total Number of Columns, Datatypes of Each Column, and Memory Requirement

In [5]:
# Summarise the frame: number of entries, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [6]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


### Check Null Values In The Dataset

In [7]:
# Count the missing values in each column
data.isnull().sum()
# The result lists the total number of missing values for each column in the DataFrame:
# ph: 491 missing values
# Sulfate: 781 missing values
# Trihalomethanes: 162 missing values
# The rest of the columns have no missing values.

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [8]:
# Share of missing values per column, expressed as a percentage of all rows
per = data.isnull().sum().div(len(data)).mul(100)
print(per)
# Reading the printed percentages, only three columns have gaps:
#
# ph: approximately 14.99% missing values
# Sulfate: approximately 23.84% missing values
# Trihalomethanes: approximately 4.95% missing values

ph                 14.987790
Hardness            0.000000
Solids              0.000000
Chloramines         0.000000
Sulfate            23.840049
Conductivity        0.000000
Organic_carbon      0.000000
Trihalomethanes     4.945055
Turbidity           0.000000
Potability          0.000000
dtype: float64


In [9]:
# Split the dataset into the target variable (y) and the features (X).
y = data['Potability']  # dependent variable
X = data.drop('Potability', axis=1)  # independent variables
# Keeping the two apart is the usual preparation for machine learning tasks:
# X holds the input features and y holds the target variable to be predicted.

### Splitting The Dataset Into The Training Set And Test Set

In [10]:
# We split the data so that we can evaluate the performance of machine learning models on unseen data.
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [12]:
# Now let's build a machine learning pipeline.
# A pipeline is a tool that allows us to streamline and simplify our machine learning workflow.
# In simple terms,
# a pipeline in machine learning is like a pipe that connects different stages together,
# allowing data to flow through each stage in sequence.
# This means the output of one stage becomes the input of the next stage in the pipeline.


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [15]:
# We build a scikit-learn pipeline that consists of two steps:

# Imputer: This step fills missing values in the dataset. 
# It uses SimpleImputer with the strategy of replacing missing values with the mean of each column.

# Scaler: This step scales the features to have zero mean 
# and unit variance. It uses StandardScaler for standardization.

In [16]:
# Preprocessing chain applied to the features, in order:
#   1. 'imputer' - replace missing values with the column mean
#   2. 'scalar'  - standardise the imputed values with StandardScaler
# The step name 'scalar' is kept as-is because step names are part of the
# pipeline's interface (e.g. for parameter access).
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scalar', StandardScaler()),
])

In [17]:
# Let's define a ColumnTransformer, which is used to apply different transformations
# to different columns of a dataset. In this case,
# we apply the pipeline (pipe) defined earlier to all columns.

In [18]:
# Route every column through `pipe`: the single transformer entry is named
# "columns" and uses slice(None) as its column selector.
preprocessor = ColumnTransformer([("columns", pipe, slice(None))])

# ("columns", pipe, slice(None)): This tuple specifies the transformation to apply to the columns.

# "columns": This is the name or identifier for this transformation.
# pipe: This is the pipeline defined earlier, which consists of the imputer and scaler steps.
# slice(None): This indicates that the transformation should be applied to all columns.

### Training the model

In [19]:
from sklearn.ensemble import RandomForestClassifier


In [20]:
# Random forests are stochastic, so fix the seed to make training (and the
# reported accuracy) reproducible under Restart & Run All. 42 matches the
# seed already used for train_test_split.
# NOTE: previously recorded outputs came from an unseeded run, so the
# accuracy may differ slightly the first time this is re-executed.
classifier = RandomForestClassifier(random_state=42)


In [21]:
# Full model: the preprocessing step ("pre") followed by the classifier
# step ("model"), chained in a single Pipeline.
model = Pipeline([("pre", preprocessor), ("model", classifier)])
# ("pre", preprocessor): This tuple represents the preprocessing step.

# "pre": This is the name or identifier for the preprocessing step.
# preprocessor: This is the preprocessor defined earlier, which includes imputation and scaling.
# ("model", classifier): This tuple represents the classifier step.

# "model": This is the name or identifier for the classifier step.
# classifier: This is the classifier object
# (in this case, RandomForestClassifier) that you want to include in the pipeline.

In [22]:
# Fit the whole pipeline (preprocessing + classifier) on the training data (X_train and y_train).
model.fit(X_train,y_train)

In [23]:
# Predict labels for the held-out test features
y_pred = model.predict(X_test) # unseen samples

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): describe() shows a Potability mean of about 0.39 on the full
# dataset, so always predicting 0 would already score roughly 0.61 there;
# read this accuracy against that baseline.
accuracy_score(y_test,y_pred)

0.698170731707317

In [26]:
# One hand-written observation used to smoke-test the fitted pipeline.
# The columns are listed in the same order as the training features (see the
# data.info() output). The ColumnTransformer was configured with the
# positional selector slice(None), so matching the order means the prediction
# does not depend on scikit-learn realigning DataFrame columns by name.
sample = pd.DataFrame({
    'ph': 3.71608,
    'Hardness': 204.89045,
    'Solids': 20791.318981,
    'Chloramines': 7.300212,
    'Sulfate': 368.516441,
    'Conductivity': 564.308654,
    'Organic_carbon': 10.379783,
    'Trihalomethanes': 86.99097,
    'Turbidity': 2.963135,
}, index=[0])

In [27]:
# Predict the class of the sample; predict() returns an array with one label per row.
result = model.predict(sample)
# Look at the first (and only) prediction explicitly instead of relying on the
# truth value of an array comparison, which raises a ValueError as soon as the
# input holds more than one row.
if result[0] == 1:
    print("Water is Consumable")
else:
    print("Water is not consumable")

Water is not consumable


# Save The Model

In [28]:
import pickle

In [29]:
# Serialise the fitted pipeline (preprocessing + classifier) to disk with pickle.
# The file is written to the current working directory in binary mode.
with open('randomforest_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [30]:
# Reload the pickled pipeline from disk and rebind `model` to the restored object
# (in an application this load would be done once at startup).
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with open('randomforest_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [31]:
# Round-trip check: run the reloaded pipeline on the same sample
model.predict(sample)

array([0], dtype=int64)