In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Load the dataset
data = pd.read_csv('water_potability.csv')

# Handle missing values (fill with mean or drop as per requirement)
data = data.fillna(data.mean())

# Feature scaling (excluding the target 'Potability')
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data.drop('Potability', axis=1))

# Define X and y
X = scaled_data
y = data['Potability'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)
rf_regressor.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# Evaluate the model
train_score = r2_score(y_train, y_train_pred)  # R-squared score on the training set
test_score = r2_score(y_test, y_test_pred)     # R-squared score on the test set
mse_test = mean_squared_error(y_test, y_test_pred)

# Convert predictions to binary for accuracy score calculation
y_test_pred_binary = np.round(y_test_pred)
accuracy = accuracy_score(y_test, y_test_pred_binary)

# Output results
print("Training Score (R-squared):", train_score)
print("Test Score (R-squared):", test_score)
print("Mean Squared Error on Test Set:", mse_test)
print("Accuracy (rounded binary predictions):", accuracy)


Training Score (R-squared): 0.8754186584256344
Test Score (R-squared): 0.11120509310838789
Mean Squared Error on Test Set: 0.20762560975609753
Accuracy (rounded binary predictions): 0.6798780487804879


In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import gradio as gr

# Load the dataset and prepare the model
data = pd.read_csv('water_potability.csv')
X = data.drop(columns="Potability")
y = data["Potability"]

pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")), 
    ("classifier", RandomForestClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

def predict_potability(ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity):
    input_data = pd.DataFrame([[ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity]], columns=X.columns)
    prediction = pipeline.predict(input_data)[0]
    return "Potable" if prediction == 1 else "Not Potable"

input_components = [
    gr.inputs.Number(label="pH", default=None),
    gr.inputs.Number(label="Hardness", default=None),
    gr.inputs.Number(label="Solids", default=None),
    gr.inputs.Number(label="Chloramines", default=None),
    gr.inputs.Number(label="Sulfate", default=None),
    gr.inputs.Number(label="Conductivity", default=None),
    gr.inputs.Number(label="Organic Carbon", default=None),
    gr.inputs.Number(label="Trihalomethanes", default=None),
    gr.inputs.Number(label="Turbidity", default=None)
]

interface = gr.Interface(
    fn=predict_potability,
    inputs=input_components,
    outputs="text",
    title="Water Potability Prediction",
    description="Enter water quality metrics to predict whether the water is potable."
)

interface.launch(share=True)


ModuleNotFoundError: No module named 'gradio'

In [13]:
pip install gradio




In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gradio as gr

# Load the dataset and prepare the model
data = pd.read_csv('water_potability.csv')
X = data.drop(columns="Potability")
y = data["Potability"]

# Define a pipeline with mean imputation and Random Forest Classifier with adjusted parameters
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")), 
    ("classifier", RandomForestClassifier(
        n_estimators=100,        # Reduced number of trees
        max_depth=10,            # Limiting depth of each tree
        max_features="sqrt",     # Consider sqrt(n_features) at each split
        random_state=42
    ))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Calculate training and testing accuracy
train_accuracy = accuracy_score(y_train, pipeline.predict(X_train))
test_accuracy = accuracy_score(y_test, pipeline.predict(X_test))
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

# Define the prediction function for Gradio
def predict_potability(ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity):
    input_data = pd.DataFrame([[ph, hardness, solids, chloramines, sulfate, conductivity, organic_carbon, trihalomethanes, turbidity]], columns=X.columns)
    prediction = pipeline.predict(input_data)[0]
    return "Potable" if prediction == 1 else "Not Potable"

# Updated input components
input_components = [
    gr.Number(label="pH"),
    gr.Number(label="Hardness"),
    gr.Number(label="Solids"),
    gr.Number(label="Chloramines"),
    gr.Number(label="Sulfate"),
    gr.Number(label="Conductivity"),
    gr.Number(label="Organic Carbon"),
    gr.Number(label="Trihalomethanes"),
    gr.Number(label="Turbidity")
]

# Define the Gradio interface
interface = gr.Interface(
    fn=predict_potability,
    inputs=input_components,
    outputs="text",
    title="Water Potability Prediction",
    description=f"Enter water quality metrics to predict whether the water is potable.\nTraining Accuracy: {train_accuracy:.2f}\nTesting Accuracy: {test_accuracy:.2f}"
)

interface.launch(share=True)


Training Accuracy: 0.83
Testing Accuracy: 0.68
Running on local URL:  http://127.0.0.1:7865
Running on public URL: https://00c588313b0b7af193.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


