## Setup

In [1]:
from dotenv import load_dotenv
import matplotlib.pyplot as plt

load_dotenv()  # Defines KAGGLE_USERNAME and KAGGLE_KEY
%matplotlib inline
print('Setup complete')

Setup complete


## Download Data

In [4]:
import kaggle
import os

api = kaggle.api
# Kaggle requires an authentication token to access the API.
# Go to your Kaggle account settings to generate an API token, which will be downloaded as a kaggle.json file.
# Create a .env file and define the following environment variables...
# KAGGLE_USERNAME="your_username"
# KAGGLE_KEY="your_key"

# Check for the CSV itself rather than only the ./data directory: a directory
# that exists but is empty or incomplete (e.g. after an interrupted download)
# would otherwise cause the download to be skipped and the load step to fail.
if not os.path.isfile('./data/water_potability.csv'):
    print('Downloading dataset...')
    # unzip=True extracts the downloaded archive into ./data
    api.dataset_download_files('nayanack/water-probability', path='./data', unzip=True)
    print('Complete')
else:
    print('Dataset already downloaded')

Dowloading dataset...
Dataset URL: https://www.kaggle.com/datasets/nayanack/water-probability
Complete


## Load Data

In [11]:
import pandas as pd

# Path to the potability CSV inside ./data (note: despite its name, DATASET_DIR
# holds a file path, not a directory).
# Forward slashes are used deliberately: they work on Windows, macOS and Linux,
# whereas '.\data\water_potability.csv' is Windows-only and contains the invalid
# escape sequences '\d' and '\w', which newer Python versions warn about.
DATASET_DIR = './data/water_potability.csv'
potability_data = pd.read_csv(DATASET_DIR)
# Preview the first rows to sanity-check the load
potability_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [12]:
# Show the data type of each column
potability_data.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

In [13]:
# Row count comes from the first element of the frame's shape
n = potability_data.shape[0]
print(f'Number of entries: {n}')
print(f'Dataframe shape: {potability_data.shape}')

Number of entries: 3276
Dataframe shape: (3276, 10)


## Clean Data

In [14]:
# Per-column tally of missing (NaN) entries
potability_data.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [15]:
# Keep only the rows in which every column has a value
potability_data = potability_data.dropna(axis=0, how='any')
n = potability_data.shape[0]
print(f'Number of rows: {n}')
print(f'Dataframe shape: {potability_data.shape}')
# Preview the first five remaining rows
potability_data.head(5)

Number of rows: 2011
Dataframe shape: (2011, 10)


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0
5,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
6,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
7,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0


## Split Data

In [17]:
# Train-Test split 80-20
# Separate features from labels
from sklearn.model_selection import train_test_split

# Labels are the Potability column; features are every other column
y = potability_data['Potability']
X = potability_data.drop(columns=['Potability'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
train_n, test_n = len(X_train), len(X_test)

# Report the size of each partition
print(f'Training rows: {train_n}')
print(f'Test rows: {test_n}')
print(f'Train X data shape: {X_train.shape}')
print(f'Train y data shape: {y_train.shape}')
print(f'Test X data shape: {X_test.shape}')
print(f'Test y data shape: {y_test.shape}')
X_train.head()

Training rows: 1608
Test rows: 403
Train X data shape: (1608, 9)
Train y data shape: (1608,)
Test X data shape: (403, 9)
Test y data shape: (403,)


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
2155,4.981672,220.950558,20988.087806,7.985668,232.548814,373.624878,12.235175,45.95003,4.475148
3045,5.622807,194.463239,16119.34019,6.907987,336.648207,408.459081,16.364724,76.631186,3.992957
1550,8.416087,208.326022,28234.906857,7.784123,297.330528,444.560108,12.645176,78.756533,4.930084
1824,6.980727,200.467226,30862.784459,6.196276,391.666878,422.489596,9.066902,48.250468,5.437619
3244,5.836105,277.065713,17711.487774,3.458192,400.167599,456.732862,17.552294,72.059866,3.738991
