# Data Loading & Preprocessing

In [107]:
import os

import pandas as pd

# Expected columns of the water-potability dataset; the last one is the target.
names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
         'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']
# Every column except the trailing target is a predictor.
feature_names = names[:-1]

# The CSV location can be overridden with the WATER_POTABILITY_CSV environment
# variable so the notebook also runs outside the original author's machine;
# the default is the original path.
DATA_PATH = os.environ.get(
    "WATER_POTABILITY_CSV",
    "C:/Users/Usuario/Desktop/QFM/S3/project/water_potability.csv",
)
data = pd.read_csv(DATA_PATH)

# Fail early if the file does not have the layout the rest of the notebook
# relies on (feature_names is later paired positionally with per-column scores).
if list(data.columns) != names:
    raise ValueError(
        f"Unexpected columns in {DATA_PATH!r}: {list(data.columns)}"
    )

data[:5]

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [108]:
# Print per-column dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


  #### Remark: Some columns contain missing values. We replace each missing value with the mean of its column instead of dropping the affected rows, because those rows make up a large portion of our data and dropping them would lose information.

In [109]:
# Per-column means; pandas skips NaN entries when averaging.
# NOTE(review): the means are computed over all rows, i.e. before any
# train/test split, so held-out rows influence the imputed values. Consider
# computing them on the training rows only.
column_means = data.mean(axis=0)
# fillna with a Series fills each column with the value stored under its label.
df = data.fillna(value=column_means)
df.head(5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [110]:
# Separate the target column from the predictors of the imputed frame.
y = df['Potability']
X = df.drop(columns=['Potability'])

# Feature Selection

In [111]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each predictor and the target; the fixed seed
# keeps the estimated scores reproducible between runs.
MI_score = mutual_info_classif(X, y, random_state=0)
# Pair each score with the column it was computed from. Taking the labels from
# X.columns (instead of a separately maintained name list) guarantees that the
# labels always match the columns that were actually scored.
features = list(zip(X.columns, MI_score))
features

[('ph', 0.0),
 ('Hardness', 0.0265597472598873),
 ('Solids', 0.0010659025462331773),
 ('Chloramines', 0.0),
 ('Sulfate', 0.005699551188196095),
 ('Conductivity', 0.007077236475107718),
 ('Organic_carbon', 0.004013875739670603),
 ('Trihalomethanes', 0.0),
 ('Turbidity', 0.003075177105070992)]

In [112]:
# The four predictors with the highest mutual-information scores in the
# output of the previous cell.
Selected_features = [
    'Hardness',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
]
Selected_df = df.loc[:, Selected_features]
Selected_df.head(5)

Unnamed: 0,Hardness,Sulfate,Conductivity,Organic_carbon
0,204.890455,368.516441,564.308654,10.379783
1,129.422921,333.775777,592.885359,15.180013
2,224.236259,333.775777,418.606213,16.868637
3,214.373394,356.886136,363.266516,18.436524
4,181.101509,310.135738,398.410813,11.558279


### Scaling 

In [113]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows do not leak into preprocessing.
# train_test_split chooses rows from the number of samples and the seed alone,
# so using the same test_size and random_state as the train/test split cell
# below selects exactly the rows that later end up in X_train.
# Keep these two values in sync with that cell.
train_rows = train_test_split(Selected_df, test_size=0.33, random_state=42)[0]
scaler = StandardScaler().fit(train_rows)
# transform() returns a NumPy array, the same type fit_transform() produced.
Selected_df = scaler.transform(Selected_df)

In [114]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    Selected_df,
    y,
    test_size=0.33,
    random_state=42,
)

# Models

### Logistic Regression 

In [115]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# n_jobs is not passed: LogisticRegression only parallelizes over classes, and
# with a binary target there is a single fit, so a hard-coded worker count
# brings no benefit.
model_lg = LogisticRegression(max_iter=120, random_state=0)
model_lg.fit(X_train, y_train)
pred_lg = model_lg.predict(X_test)
lg = accuracy_score(y_test, pred_lg)
cm1 = confusion_matrix(y_test, pred_lg)
print("accuracy :",lg)
# NOTE(review): in the recorded run this model predicts class 0 for every test
# row (recall 1.00 / 0.00), so its accuracy equals the majority-class rate.
# zero_division=0 reports the undefined precision of the never-predicted class
# as 0 without emitting UndefinedMetricWarning.
print(classification_report(y_test, pred_lg, zero_division=0))

accuracy : 0.6284658040665434
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       680
           1       0.00      0.00      0.00       402

    accuracy                           0.63      1082
   macro avg       0.31      0.50      0.39      1082
weighted avg       0.39      0.63      0.49      1082



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [116]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree with a fixed seed; fit() returns the estimator itself.
model_dt = DecisionTreeClassifier(random_state=42, max_depth=4).fit(X_train, y_train)
pred_dt = model_dt.predict(X_test)
# Test-set accuracy and confusion matrix of the tree's predictions.
dt, cm2 = accuracy_score(y_test, pred_dt), confusion_matrix(y_test, pred_dt)
print("accuracy :", dt)
print(classification_report(y_test, pred_dt))

accuracy : 0.634011090573013
              precision    recall  f1-score   support

           0       0.64      0.95      0.77       680
           1       0.54      0.10      0.17       402

    accuracy                           0.63      1082
   macro avg       0.59      0.53      0.47      1082
weighted avg       0.60      0.63      0.54      1082



### Random Forest

In [117]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): a float min_samples_leaf is interpreted as a fraction of the
# training samples, so every leaf must hold at least 16% of them. That allows
# only a handful of leaves per tree; in the recorded run the forest predicts
# class 0 for every test row. Confirm that this constraint is intended.
model_rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=0.16, random_state=42)
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)
rf = accuracy_score(y_test, pred_rf)
cm3 = confusion_matrix(y_test, pred_rf)
print("accuracy :",rf)
# zero_division=0 reports the undefined precision of a never-predicted class
# as 0 without emitting UndefinedMetricWarning.
print(classification_report(y_test, pred_rf, zero_division=0))

accuracy : 0.6284658040665434
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       680
           1       0.00      0.00      0.00       402

    accuracy                           0.63      1082
   macro avg       0.31      0.50      0.39      1082
weighted avg       0.39      0.63      0.49      1082



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Pickling the Model

In [118]:
import pickle

# Save the trained decision tree to a file for the deployment step.
with open('model_dt.pkl', 'wb') as file:
    pickle.dump(model_dt, file)

# The tree was trained on standardized features, so its split thresholds only
# make sense for inputs passed through the same fitted scaler. Persist the
# scaler as well; at prediction time, apply scaler.transform() to the inputs
# (columns in the order of Selected_features) before calling model.predict().
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Only unpickle these files from a trusted location: loading a pickle can
# execute arbitrary code.

# Deployment 

# index.html