In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by
# pandas and scikit-learn calls — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Function that will help to know the percentage of null values per column

def porc_null_col(dataframe):
    """Return the percentage of null values in each column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        One ``[column_label, percentage]`` pair per column, in column order.
        ``percentage`` is the share of null entries in that column on a
        0-100 scale. A frame with zero rows reports ``0.0`` for every column
        instead of the ``NaN`` that a 0/0 division would produce.
    """
    n_rows = dataframe.shape[0]
    if n_rows == 0:
        # No rows means no nulls; skip the division to avoid 0/0.
        return [[col, 0.0] for col in dataframe]
    return [[col, (dataframe[col].isnull().sum() / n_rows) * 100] for col in dataframe]

In [None]:
# Load the water-potability CSV into the working DataFrame `df`.
df = pd.read_csv("../input/water-potability/water_potability.csv")

In [None]:
# Preview the first rows of the loaded data.
df.head()

# Feature Engineering

In [None]:
# Summary statistics for the columns of df.
df.describe()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Absolute number of missing values in each column.
df.isna().sum()

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

Vemos cuántos datos faltantes (en porcentaje con relación al total) tenemos en las columnas que presentan datos nulos.

In [None]:
# Percentage of null values per column, via the helper defined earlier.
porc_null_col(df)

Al no ser muy elevado el porcentaje, podemos aplicar algún tratamiento de "parchado" (imputación).
<br>
Ya que la columna 'Trihalomethanes' tiene un porcentaje de valores nulos menor al 5%, podemos eliminar las filas con esos datos faltantes.


In [None]:
# Drop the rows whose 'Trihalomethanes' value is missing (mutates df in place).
df.dropna(subset = ["Trihalomethanes"], inplace=True)

In [None]:
# Re-check the null percentages after dropping rows.
porc_null_col(df)

In [None]:
# (rows, columns) of df after the dropna step.
df.shape

Tratamiento para la columna ph

In [None]:
# Histogram of the 'ph' column.
df.hist(column='ph')

In [None]:
# Box plot of the 'ph' column; the trailing ';' hides the returned Axes repr.
df.boxplot(column='ph', return_type='axes');

In [None]:
# Fill the missing 'ph' values with the column mean.
# Assign the result back to the column instead of calling
# `df['ph'].fillna(..., inplace=True)`: that form is chained assignment, which
# is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
# NOTE(review): the mean is computed on the whole dataset, before the
# train/test split performed later in the notebook.
df['ph'] = df['ph'].fillna(df['ph'].mean())

In [None]:
# Re-check the null percentages after filling 'ph'.
porc_null_col(df)

Tratamiento para la columna de "Sulfate"

In [None]:
# Histogram of the 'Sulfate' column.
df.hist(column='Sulfate')

In [None]:
# Box plot of the 'Sulfate' column; the trailing ';' hides the returned Axes repr.
df.boxplot(column='Sulfate', return_type='axes');

In [None]:
# Fill the missing 'Sulfate' values with the column median.
# Assign the result back to the column instead of calling
# `df['Sulfate'].fillna(..., inplace=True)`: that form is chained assignment,
# which is deprecated in pandas 2.x and leaves df unchanged under Copy-on-Write.
# NOTE(review): the median is computed on the whole dataset, before the
# train/test split performed later in the notebook.
df['Sulfate'] = df['Sulfate'].fillna(df['Sulfate'].median())

In [None]:
# Re-check the null percentages after filling 'Sulfate'.
porc_null_col(df)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Potability' target as consecutive integer class labels and
# write the encoded values back into df. `le` keeps the fitted encoder.
le = LabelEncoder()
encoded_target = le.fit_transform(df['Potability'])
df['Potability'] = encoded_target

In [None]:
# Feature matrix: every column except the target. Target vector: 'Potability'.
X = df.drop(columns=['Potability'])
y = df['Potability']

In [None]:
# Display the target Series.
y


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Sanity check: feature-matrix shape, then the number of target values.
for size_info in (X.shape, len(y)):
    print(size_info)

In [None]:
# Hold out 20% of the rows as the test set.
# random_state pins the shuffle so every run (and every model below) sees the
# same split; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training features only, then apply the
# same fitted scaler to both the training and the test features.
Scaler_X = StandardScaler().fit(X_train)
X_train = Scaler_X.transform(X_train)
X_test = Scaler_X.transform(X_test)

# Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# Grid-search the random-forest hyperparameters, ranking candidates by precision.
#  - max_depth must be a positive integer or None; the former 0.5 entry is not
#    a valid depth, so it is replaced by None (unlimited depth, the default).
#  - random_state is fixed on the estimator instead of being searched: it is a
#    seed, not a hyperparameter.
#  - n_jobs only controls parallelism and never changes the fitted model, so
#    it is removed from the grid and the search itself is parallelised.
rfc_clf = RandomForestClassifier(random_state=42)
params = {'n_estimators': [25, 50, 100, 150, 200, 500],
          'max_depth': [1, 5, 10, None]}
grid_search_cv = GridSearchCV(rfc_clf, params, scoring='precision', n_jobs=-1)
grid_search_cv.fit(X_train, y_train)

In [None]:
# Report the winning estimator and its parameter combination, one per line.
print(grid_search_cv.best_estimator_, grid_search_cv.best_params_, sep='\n')

In [None]:
# Evaluate the best random forest found by the grid search.
rfc_clf = grid_search_cv.best_estimator_
rfc_clf.fit(X_train, y_train)
rfc_clf_pred = rfc_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall in the report.
print('Classification report:')
print(classification_report(y_test, rfc_clf_pred))

# Mean score over 10 folds of the training data (estimator's default scorer).
score_rfc = cross_val_score(rfc_clf, X_train, y_train, cv=10).mean()

print('Accuracy:', accuracy_score(y_test, rfc_clf_pred))
print('cross val score:', score_rfc)

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Use GridSearchCV to find the best SVC hyperparameters.
# `degree` is only used by the polynomial kernel, so it is searched in a
# separate sub-grid; crossing it with the other kernels would refit identical
# models three times each.
svc_C_grid = [0.75, 0.85, 0.95, 1]
params = [
    {'C': svc_C_grid, 'kernel': ['linear', 'rbf', 'sigmoid']},
    {'C': svc_C_grid, 'kernel': ['poly'], 'degree': [3, 4, 5]},
]

svc_clf = SVC(random_state=42)
grid_search_cv = GridSearchCV(svc_clf, params)
grid_search_cv.fit(X_train, y_train)

In [None]:
# Report the winning estimator and its parameter combination, one per line.
print(grid_search_cv.best_estimator_, grid_search_cv.best_params_, sep='\n')

In [None]:
# Evaluate the best SVC found by the grid search.
svc_clf = grid_search_cv.best_estimator_
svc_clf.fit(X_train, y_train)
svc_pred = svc_clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall in the report.
print('Classification report:')
print(classification_report(y_test, svc_pred))

# Mean score over 10 folds of the training data (estimator's default scorer).
score_svc = cross_val_score(svc_clf, X_train, y_train, cv=10).mean()
print('Accuracy:', accuracy_score(y_test, svc_pred))
print('cross val score:', score_svc)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings and predict the test set.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall in the report.
print('Classification report:')
print(classification_report(y_test, lr_pred))

# Mean score over 10 folds of the training data (estimator's default scorer).
score_lr = cross_val_score(lr, X_train, y_train, cv=10).mean()
print('cross val score:', score_lr)
print('Accuracy:', accuracy_score(y_test, lr_pred))

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosting classifier and predict the test set.
# random_state is fixed so the fitted model and its scores are reproducible.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision and recall in the report.
print('Classification report:')
print(classification_report(y_test, gbc_pred))

# Mean accuracy over 10 folds of the training data.
score_gbc = cross_val_score(gbc, X_train, y_train, scoring='accuracy', cv=10).mean()
print('Accuracy:', accuracy_score(y_test, gbc_pred))
print('cross val score:', score_gbc)

In [None]:
# Leaderboard: one row per model with its mean cross-validation score,
# displayed from best to worst.
model_names = [
    'Random Forest Classifier',
    'Logistic Regression',
    'Gradient Boost Classifier',
    'Support Vector Classifier',
]
cv_means = [score_rfc, score_lr, score_gbc, score_svc]
models = pd.DataFrame({'Models': model_names, 'Score': cv_means})
models.sort_values(by='Score', ascending=False)