In [1]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMClassifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, auc
import numpy as np
from sklearn.model_selection import cross_val_score
import time as time
import joblib

In [2]:
# Load the river dataset from a CSV file resolved relative to the current working directory.
df = pd.read_csv('river_data.csv')
# Last expression of the cell: displays the dtype pandas inferred for each column.
df.dtypes

S. No.        int64
River        object
As          float64
Cd          float64
Ni          float64
Fe          float64
Pb          float64
Zn          float64
Toxicity      int64
dtype: object

In [3]:
# Names of the columns that hold at least one missing value
# (an empty Index means the frame has no missing values).
df.loc[:, df.isna().any(axis=0)].columns

Index([], dtype='object')

##### Inspect the data and drop identifier columns (no label encoding is applied; `River` is dropped)

In [4]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,S. No.,River,As,Cd,Ni,Fe,Pb,Zn,Toxicity
0,1,Parwati,0.896,0.083,2.070,0.048,2.386,0.008,0
1,2,Chittar,1.470,0.483,6.098,0.029,1.062,0.008,0
2,3,Barak,2.899,0.087,1.282,0.060,1.544,0.015,1
3,4,Yamuna,5.130,0.217,4.345,0.098,2.091,0.020,1
4,5,Banas,2.270,0.197,2.684,0.082,1.668,0.011,1
...,...,...,...,...,...,...,...,...,...
195,196,Wainganga,2.188,0.191,3.693,0.115,2.006,0.010,1
196,197,Tel,2.170,0.283,6.793,0.056,2.435,0.010,1
197,198,Teesta,7.390,0.104,2.455,0.712,2.284,0.033,1
198,199,Mahi,2.282,0.174,2.930,0.240,3.110,0.010,1


In [5]:
# Number of distinct values in the River column.
df['River'].unique().size

134

In [6]:
# (rows, columns) of the frame before any column is dropped.
df.shape

(200, 9)

In [7]:
# Remove the two identifier columns ('S. No.' is a row counter, 'River' is a
# name) so that only the numeric measurements and the Toxicity target remain.
#
# The original call passed axis=True, which only selected the column axis
# because True == 1; `columns=` states the intent explicitly.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['S. No.', 'River'], errors='ignore')
df

Unnamed: 0,As,Cd,Ni,Fe,Pb,Zn,Toxicity
0,0.896,0.083,2.070,0.048,2.386,0.008,0
1,1.470,0.483,6.098,0.029,1.062,0.008,0
2,2.899,0.087,1.282,0.060,1.544,0.015,1
3,5.130,0.217,4.345,0.098,2.091,0.020,1
4,2.270,0.197,2.684,0.082,1.668,0.011,1
...,...,...,...,...,...,...,...
195,2.188,0.191,3.693,0.115,2.006,0.010,1
196,2.170,0.283,6.793,0.056,2.435,0.010,1
197,7.390,0.104,2.455,0.712,2.284,0.033,1
198,2.282,0.174,2.930,0.240,3.110,0.010,1


In [8]:
# Target vector: the Toxicity column.
labels = df['Toxicity']
# Design matrix: every remaining column except the target.
features = df.drop(columns=['Toxicity'])

In [9]:
# Display the feature matrix (all columns of df except Toxicity).
features

Unnamed: 0,As,Cd,Ni,Fe,Pb,Zn
0,0.896,0.083,2.070,0.048,2.386,0.008
1,1.470,0.483,6.098,0.029,1.062,0.008
2,2.899,0.087,1.282,0.060,1.544,0.015
3,5.130,0.217,4.345,0.098,2.091,0.020
4,2.270,0.197,2.684,0.082,1.668,0.011
...,...,...,...,...,...,...
195,2.188,0.191,3.693,0.115,2.006,0.010
196,2.170,0.283,6.793,0.056,2.435,0.010
197,7.390,0.104,2.455,0.712,2.284,0.033
198,2.282,0.174,2.930,0.240,3.110,0.010


In [10]:
# Display the target Series (the Toxicity column of df).
labels

0      0
1      0
2      1
3      1
4      1
      ..
195    1
196    1
197    1
198    1
199    1
Name: Toxicity, Length: 200, dtype: int64

### ML models: LightGBM, XGBoost, Random Forest, Decision Tree, Multinomial Naive Bayes, Gaussian Naive Bayes, SVM, Ridge Classifier and Logistic Regression (9 in total)

#### 1. Using LightGBM

In [34]:
# LightGBM hold-out evaluation: 80/20 split, fit, predict, score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=40
)

lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train)

y_pred = lgb_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(lgb_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(lgb_model, 'lgb_river_model')

[1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0
 1 1 1]
0.95
61.00034713745117 ms


['lgb_river_model']

#### 2. Using XGBoost Classifier

In [35]:
# XGBoost hold-out evaluation: 80/20 split, fit, predict, score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=40
)

xg_model = xgb.XGBClassifier(n_estimators=30, gamma=2)
xg_model.fit(X_train, y_train)

y_pred = xg_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(xg_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(xg_model, 'xgb_river_model')

[1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0
 1 1 1]
0.975
42.11854934692383 ms




['xgb_river_model']

#### 3. Using RandomForest Classifier

In [36]:
# Random-forest hold-out evaluation: 80/20 split, fit, predict, score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for short runs.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# random_state pins the forest's bootstrap sampling and feature sub-sampling.
# Without it every run trains a different forest, so neither the printed
# score nor the saved model can be reproduced.
rf_model = RandomForestClassifier(n_estimators=42, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(rf_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(rf_model, 'rf_river_model')

[0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
0.975
85.51406860351562 ms


['rf_river_model']

#### 4. Decision Tree Classifier

In [43]:
# Decision-tree hold-out evaluation: 80/20 split, fit, predict, score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=40
)

# random_state makes the tree reproducible: scikit-learn permutes the features
# at every split, so ties between equally good splits are otherwise broken
# differently from run to run.
dt_model = DecisionTreeClassifier(random_state=40)
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(dt_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(dt_model, 'dt_river_model')

[1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0
 1 1 0]
0.925
14.509201049804688 ms


['dt_river_model']

#### 5. Multinomial Naive Bayes

In [44]:
# Multinomial naive Bayes hold-out evaluation: 80/20 split, fit, predict,
# score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): this cell splits with random_state=2 while the other hold-out
# cells use 40, 42 or 13, so the test sets differ and the scores are not
# directly comparable.
# NOTE(review): MultinomialNB is documented for count-like features; the
# inputs here are continuous values - confirm this model is intended.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=2
)

mnb_model = MultinomialNB(alpha=12)
mnb_model.fit(X_train, y_train)

y_pred = mnb_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(mnb_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(mnb_model, 'mnb_river_model')

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
0.925
13.502359390258789 ms


['mnb_river_model']

#### 6. Gaussian Naive Bayes

In [45]:
# Gaussian naive Bayes hold-out evaluation: 80/20 split, fit, predict, score,
# persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): this cell splits with random_state=13 while the other hold-out
# cells use 40, 42 or 2, so the test sets differ and the scores are not
# directly comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=13
)

gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

y_pred = gnb_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(gnb_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(gnb_model, 'gnb_river_model')

[0 1 1 0 1 1 1 1 1 0 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0
 1 0 1]
0.9
12.999773025512695 ms


['gnb_river_model']

#### 7. SVM

In [46]:
# SVM (polynomial kernel) hold-out evaluation: 80/20 split, fit, predict,
# score, persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

svm_model = SVC(C=2, kernel='poly')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(svm_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(svm_model, 'svm_river_model')

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1]
0.925
14.002323150634766 ms


['svm_river_model']

#### 8. Ridge Classifier

In [47]:
# Ridge classifier hold-out evaluation: 80/20 split, fit, predict, score,
# persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

rg_model = RidgeClassifier(alpha=1)
rg_model.fit(X_train, y_train)

y_pred = rg_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(rg_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(rg_model, 'rg_river_model')

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
0.95
14.041900634765625 ms


['rg_river_model']

#### 9. Logistic Regression

In [48]:
# Logistic-regression hold-out evaluation: 80/20 split, fit, predict, score,
# persist.
#
# Elapsed time is measured with time.perf_counter(), a monotonic
# high-resolution clock. time.time() is wall-clock time: it can jump when the
# system clock is adjusted and may be too coarse for runs of a few milliseconds.
# NOTE(review): the hold-out cells in this notebook use different split seeds
# (40, 42, 2, 13), so their test sets differ and the scores are not directly
# comparable.
start_time = time.perf_counter()
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

logr_model = LogisticRegression(C=1)
logr_model.fit(X_train, y_train)

y_pred = logr_model.predict(X_test)
print(y_pred)

# Mean accuracy on the held-out 20 %.
print(logr_model.score(X_test, y_test))

# The timed span covers split + fit + predict + score, but not the dump below.
finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

# Last expression of the cell: joblib returns the list of files it wrote.
joblib.dump(logr_model, 'logr_river_model')

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1]
0.95
19.486188888549805 ms


['logr_river_model']

### Stratified K-Fold Cross-Validation

In [21]:
# LightGBM: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
# NOTE(review): n_estimators=50 here, whereas the LightGBM hold-out cell uses
# LGBMClassifier() with defaults - the two scores describe different models.
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
lgb_score = cross_val_score(LGBMClassifier(n_estimators=50), features, labels)
print(lgb_score)
print("Score:", np.average(lgb_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.875 0.875 0.9   0.85  0.85 ]
Score: 0.8699999999999999
101.51839256286621 ms


In [22]:
# XGBoost: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
xg_score = cross_val_score(xgb.XGBClassifier(n_estimators=30, gamma=2), features, labels)
print(xg_score)
print("Score:", np.average(xg_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.95  0.85  0.9   0.85  0.975]
Score: 0.9049999999999999
195.64080238342285 ms




In [23]:
# Logistic regression: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
lr_score = cross_val_score(LogisticRegression(C=1), features, labels)
print(lr_score)
print("Score:", np.average(lr_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.85 0.85 0.85 0.8  0.8 ]
Score: 0.8299999999999998
89.00642395019531 ms


In [25]:
# Decision tree: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
#
# random_state makes the fold scores reproducible: scikit-learn permutes the
# features at every split, so an unseeded tree can break ties between equally
# good splits differently on each run.
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
dt_score = cross_val_score(DecisionTreeClassifier(random_state=50), features, labels)
print(dt_score)
print("Score:", np.average(dt_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.875 0.725 0.825 0.85  0.925]
Score: 0.8400000000000001
38.51461410522461 ms


In [26]:
# Random forest: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
# NOTE(review): n_estimators=30 here, whereas the random-forest hold-out cell
# uses n_estimators=42 - the two scores describe different models.
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
rf_score = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=50), features, labels
)
print(rf_score)
print("Score:", np.average(rf_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.925 0.85  0.925 0.85  0.925]
Score: 0.8950000000000001
290.9090518951416 ms


In [27]:
# SVM (polynomial kernel): cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
svm_score = cross_val_score(SVC(C=2, kernel='poly'), features, labels)
print(svm_score)
print("Score:", np.average(svm_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.825 0.825 0.825 0.825 0.825]
Score: 0.825
43.039798736572266 ms


In [29]:
# Multinomial naive Bayes: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
# NOTE(review): MultinomialNB() uses the default alpha here, whereas the
# hold-out cell uses alpha=12 - the two scores describe different models.
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
mnb_score = cross_val_score(MultinomialNB(), features, labels)
print(mnb_score)
print("Score:", np.average(mnb_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.775 0.85  0.825 0.75  0.8  ]
Score: 0.8
32.53674507141113 ms


In [30]:
# Gaussian naive Bayes: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
gnb_score = cross_val_score(GaussianNB(), features, labels)
print(gnb_score)
print("Score:", np.average(gnb_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.6   0.725 0.775 0.75  0.775]
Score: 0.725
32.51814842224121 ms


In [33]:
# Ridge classifier: cross-validated accuracy on the full dataset.
# cross_val_score is called without `cv`, so scikit-learn's default applies
# (5 stratified, unshuffled folds for a classifier).
# NOTE(review): alpha=2 here, whereas the Ridge hold-out cell uses alpha=1 -
# the two scores describe different models.
#
# time.perf_counter() is used for the elapsed time: it is monotonic and
# high-resolution, unlike the wall-clock time.time().
start_time = time.perf_counter()
rg_score = cross_val_score(RidgeClassifier(alpha=2), features, labels)
print(rg_score)
print("Score:", np.average(rg_score))

finish_time = time.perf_counter()
time_taken = finish_time - start_time
print(time_taken*1000,"ms")

[0.825 0.825 0.825 0.825 0.825]
Score: 0.825
44.51918601989746 ms
