# Project 1
## Author: Will Brown
## Description
Analysis of water potability dataset found on [Kaggle](https://www.kaggle.com/datasets/adityakadiwal/water-potability)

## Begin Code

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Import dataset

In [None]:
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IndividualProject1/water_potability.csv")

In [None]:
dataset.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [None]:
dataset.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [None]:
dataset.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

## Impute missing values

In [29]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')
dataset.ph = imputer.fit_transform(dataset[['ph']])
dataset.Sulfate = imputer.fit_transform(dataset[['Sulfate']])
dataset.Trihalomethanes = imputer.fit_transform(dataset[['Trihalomethanes']])

In [30]:
dataset.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

## Assign X and y

In [31]:
X = dataset.drop('Potability', axis = 1)
y = dataset.Potability

## Normalize data

In [33]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

## Split data

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Testing models

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Perform randomized search CV

In [36]:
model_params = {
    'Logistic_Regression': {
        'model': LogisticRegression(),
        'params': {
            'penalty': ['none', 'l1', 'l2', 'elasticnet'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        } 
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10, 20, 100],
            'kernel': ['rbf', 'linear', 'poly']
        }
    },
    'Decision_Tree': {
        'model': DecisionTreeClassifier(),
        'params': {
          'criterion': ['gini', 'entropy'],
          'max_depth': [2, 3, 4, 5]
        }
    },
    'Random_Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'max_depth': [2, 3, 4, 5]
        }
    }
}

In [37]:
from sklearn.model_selection import RandomizedSearchCV
scores = []
for model_name, model_param in model_params.items():
  clf = RandomizedSearchCV(model_param['model'], model_param['params'])
  clf.fit(X_train, y_train)
  scores.append({
      'model': model_name,
      'best_score': clf.best_score_,
      'best_params': clf.best_params_
  })

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

-----------------------------------

### Model scores

In [38]:
scores_df = pd.DataFrame(scores)

In [44]:
scores_df

Unnamed: 0,model,best_score,best_params
0,Logistic_Regression,0.605725,"{'solver': 'liblinear', 'penalty': 'l1'}"
1,SVC,0.668321,"{'kernel': 'rbf', 'C': 10}"
2,Decision_Tree,0.633206,"{'max_depth': 5, 'criterion': 'gini'}"
3,Random_Forest,0.64084,"{'max_depth': 5, 'criterion': 'gini'}"


### Classification report

In [40]:
svc = SVC(kernel='rbf', C=10)
svc.fit(X_train, y_train)

SVC(C=10)

In [41]:
svc.score(X_test, y_test)

0.663109756097561

In [42]:
y_pred = svc.predict(X_test)

In [43]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[323  89]
 [132 112]]
              precision    recall  f1-score   support

           0       0.71      0.78      0.75       412
           1       0.56      0.46      0.50       244

    accuracy                           0.66       656
   macro avg       0.63      0.62      0.62       656
weighted avg       0.65      0.66      0.66       656

