In [1]:
# Importing Libraries
import pandas as pd  # Dataframe Manipulation  
import numpy as np  # Array/lists Handlings
import matplotlib.pyplot as plt  # Data Visualization
import seaborn as sns  # For data visualization
from pandas.api.types import is_numeric_dtype
import joblib 

In [2]:
# Importing Dataset
df = pd.read_csv("Water data.csv")


In [3]:
# Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719998 entries, 0 to 719997
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   pH                      706067 non-null  float64
 1   Iron                    715157 non-null  float64
 2   Nitrate                 707167 non-null  float64
 3   Chloride                698816 non-null  float64
 4   Lead                    716787 non-null  float64
 5   Zinc                    701055 non-null  float64
 6   Color                   719321 non-null  object 
 7   Turbidity               714055 non-null  float64
 8   Fluoride                697080 non-null  float64
 9   Copper                  696023 non-null  float64
 10  Odor                    698473 non-null  float64
 11  Sulfate                 696240 non-null  float64
 12  Conductivity            700183 non-null  float64
 13  Chlorine                713008 non-null  float64
 14  Manganese           

In [4]:
# Describing the dataset
df.describe()
df.isna().sum()
df.dtypes


pH                        float64
Iron                      float64
Nitrate                   float64
Chloride                  float64
Lead                      float64
Zinc                      float64
Color                      object
Turbidity                 float64
Fluoride                  float64
Copper                    float64
Odor                      float64
Sulfate                   float64
Conductivity              float64
Chlorine                  float64
Manganese                 float64
Total Dissolved Solids    float64
Source                     object
Water Temperature         float64
Air Temperature           float64
Month                      object
Day                       float64
Time of Day               float64
Target                      int64
dtype: object

In [5]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder


In [6]:
# Initialize encoder
encoder = LabelEncoder()

In [7]:
# Apply encoding
df["Source"] = encoder.fit_transform(df["Source"])  
df["Color"] = encoder.fit_transform(df["Color"])  
df["Month"] = encoder.fit_transform(df["Month"]) 


In [8]:
# Check unique values after encoding
print(df["Source"].unique())
print(df["Month"].unique())
print(df["Color"].unique())

[8 2 4 1 5 6 0 3 7]
[ 4  9  0  6  7 11  8  5  1 10  2  3 12]
[0 1 2 3 4 5]


In [9]:
# Handling Missing values
df.isna().sum()

pH                        13931
Iron                       4841
Nitrate                   12831
Chloride                  21182
Lead                       3211
Zinc                      18943
Color                         0
Turbidity                  5943
Fluoride                  22918
Copper                    23975
Odor                      21525
Sulfate                   23758
Conductivity              19815
Chlorine                   6990
Manganese                 13277
Total Dissolved Solids      199
Source                        0
Water Temperature         20352
Air Temperature            3613
Month                         0
Day                       12146
Time of Day               14012
Target                        0
dtype: int64

In [10]:
# Data Cleaning
numeric_columns = []
for i in df.columns:
    if is_numeric_dtype(df[i]):
        numeric_columns.append(i)

In [11]:
for i in numeric_columns:
    if -0.5 < df[i].skew() < 0.5:
        df.fillna(df[i].mean(), inplace=True)
    else:
        df.fillna(df[i].median(), inplace=True)

In [12]:

print(numeric_columns)
df.isna().sum()
df.head()

['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Color', 'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Source', 'Water Temperature', 'Air Temperature', 'Month', 'Day', 'Time of Day', 'Target']


Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,8.33,0.0,8.61,122.8,0.0,3.43,0,0.02,0.61,0.14,...,3.71,0.0,332.12,8,7.448288,43.49,4,29.0,4.0,0
1,6.92,0.0,3.73,227.03,0.0,1.25,1,0.02,0.62,0.44,...,3.29,0.0,284.64,2,15.35,71.22,9,26.0,16.0,0
2,5.44,0.02,3.82,231.0,0.0,0.53,2,0.32,0.42,0.43,...,3.56,0.07,570.05,4,11.64,44.89,4,31.0,8.0,0
3,7.96,0.14,8.22,178.13,0.0,4.03,3,0.17,0.21,0.24,...,3.52,0.02,100.04,1,10.09,60.84,0,1.0,21.0,0
4,8.09,0.0,9.93,186.54,0.0,3.81,2,0.0,0.22,0.62,...,3.18,0.0,168.08,5,15.25,69.34,6,29.0,7.0,0


In [13]:
# Exporting the preprocessed data to a new CSV file
df.to_csv("cleaned_data.csv", index=False)