# Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importing the data
df = pd.read_csv("oil_spill.csv")

In [3]:
df.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_41,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,target
0,1,2558,1506.09,456.63,90,6395000,40.88,7.89,29780.0,0.19,...,2850.0,1000.0,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500,51.11,1.21,61900.0,0.02,...,5750.0,11500.0,9593.48,1648.8,0.6,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500,40.42,7.34,3340.0,0.18,...,1400.0,250.0,150.0,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500,42.4,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0


In [4]:
df.shape

(937, 50)

In [5]:
# Handling the null values
nv = df.isnull().sum()
nv = nv[nv>0]
nv

Series([], dtype: int64)

In [6]:
# Checking the duplicates
df.duplicated().sum()

0

In [7]:
df.columns

Index(['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9', 'f_10',
       'f_11', 'f_12', 'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18', 'f_19',
       'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_27', 'f_28',
       'f_29', 'f_30', 'f_31', 'f_32', 'f_33', 'f_34', 'f_35', 'f_36', 'f_37',
       'f_38', 'f_39', 'f_40', 'f_41', 'f_42', 'f_43', 'f_44', 'f_45', 'f_46',
       'f_47', 'f_48', 'f_49', 'target'],
      dtype='object')

In [8]:
df.dtypes.value_counts()

float64    39
int64      11
dtype: int64

In [9]:
df['target'].value_counts()

0    896
1     41
Name: target, dtype: int64

In [10]:
x = df.drop('target', axis=1)
y = df['target']

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(702, 49)
(235, 49)
(702,)
(235,)


In [14]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [15]:
def eval_model(ytest, y_pred):
    print("Accuracy Score", accuracy_score(ytest, y_pred))
    print("Classification Report", classification_report(ytest, y_pred))
    print("Confusion Matrix", confusion_matrix(ytest, y_pred))

def mscore(model):
    print("Training Score: ",model.score(X_train, y_train))
    print("Testing Score: ", model.score(X_test, y_test))

# Model Building

## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression()

In [18]:
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
mscore(lr)

Training Score:  0.9444444444444444
Testing Score:  0.9446808510638298


In [20]:
y_pred_lr = lr.predict(X_test)

In [21]:
eval_model(y_test, y_pred_lr)

Accuracy Score 0.9446808510638298
Classification Report               precision    recall  f1-score   support

           0       0.97      0.98      0.97       226
           1       0.17      0.11      0.13         9

    accuracy                           0.94       235
   macro avg       0.57      0.54      0.55       235
weighted avg       0.93      0.94      0.94       235

Confusion Matrix [[221   5]
 [  8   1]]


## Decision tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
dtc = DecisionTreeClassifier()

In [24]:
dtc.fit(X_train, y_train)

In [25]:
mscore(dtc)

Training Score:  1.0
Testing Score:  0.9404255319148936


In [26]:
y_pred_dtc = dtc.predict(X_test)

In [27]:
eval_model(y_test, y_pred_dtc)

Accuracy Score 0.9404255319148936
Classification Report               precision    recall  f1-score   support

           0       0.98      0.96      0.97       226
           1       0.31      0.44      0.36         9

    accuracy                           0.94       235
   macro avg       0.64      0.70      0.67       235
weighted avg       0.95      0.94      0.95       235

Confusion Matrix [[217   9]
 [  5   4]]


## Random Forest Classification

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [30]:
mscore(rfc)

Training Score:  1.0
Testing Score:  0.9659574468085106


In [31]:
y_pred_rfc = rfc.predict(X_test)

In [32]:
eval_model(y_test, y_pred_rfc)

Accuracy Score 0.9659574468085106
Classification Report               precision    recall  f1-score   support

           0       0.97      0.99      0.98       226
           1       0.60      0.33      0.43         9

    accuracy                           0.97       235
   macro avg       0.79      0.66      0.71       235
weighted avg       0.96      0.97      0.96       235

Confusion Matrix [[224   2]
 [  6   3]]


## Bagging Classification

In [33]:
from sklearn.ensemble import BaggingClassifier

In [34]:
bagc= BaggingClassifier()

In [35]:
bagc.fit(X_train, y_train)

In [36]:
mscore(bagc)

Training Score:  0.9943019943019943
Testing Score:  0.9617021276595744


In [37]:
y_pred_bagc = bagc.predict(X_test)

In [38]:
eval_model(y_test, y_pred_bagc)

Accuracy Score 0.9617021276595744
Classification Report               precision    recall  f1-score   support

           0       0.98      0.98      0.98       226
           1       0.50      0.56      0.53         9

    accuracy                           0.96       235
   macro avg       0.74      0.77      0.75       235
weighted avg       0.96      0.96      0.96       235

Confusion Matrix [[221   5]
 [  4   5]]


## Gradient Boosting Classifier

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

In [40]:
gbc = GradientBoostingClassifier()

In [41]:
gbc.fit(X_train, y_train)

In [42]:
mscore(gbc)

Training Score:  1.0
Testing Score:  0.9574468085106383


In [43]:
y_pred_gbc = gbc.predict(X_test)

In [44]:
eval_model(y_test, y_pred_gbc)

Accuracy Score 0.9574468085106383
Classification Report               precision    recall  f1-score   support

           0       0.98      0.98      0.98       226
           1       0.44      0.44      0.44         9

    accuracy                           0.96       235
   macro avg       0.71      0.71      0.71       235
weighted avg       0.96      0.96      0.96       235

Confusion Matrix [[221   5]
 [  5   4]]


## AdaBoost Classifier

In [45]:
from sklearn.ensemble import AdaBoostClassifier

In [46]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)

In [47]:
mscore(abc)

Training Score:  1.0
Testing Score:  0.9617021276595744


In [48]:
y_pred_abc = abc.predict(X_test)

In [49]:
eval_model(y_test, y_pred_abc)

Accuracy Score 0.9617021276595744
Classification Report               precision    recall  f1-score   support

           0       0.98      0.98      0.98       226
           1       0.50      0.44      0.47         9

    accuracy                           0.96       235
   macro avg       0.74      0.71      0.73       235
weighted avg       0.96      0.96      0.96       235

Confusion Matrix [[222   4]
 [  5   4]]


In [50]:
models = {"Logistic Regression":[0.9446808510638298],
         "DecisionTree Classifier":[0.9276595744680851],
         "RandomForest Classifier":[0.9659574468085106],
         "Bagging Classifier":[0.948936170212766],
         "GradientBoosting Classifier":[0.9574468085106383],
         "AdaBoosting Classifier":[0.9617021276595744]}

In [51]:
best_model = pd.DataFrame(models).T.reset_index()

In [52]:
best_model.columns = ["Model", "Accuracy Score"]

In [53]:
best_model

Unnamed: 0,Model,Accuracy Score
0,Logistic Regression,0.944681
1,DecisionTree Classifier,0.92766
2,RandomForest Classifier,0.965957
3,Bagging Classifier,0.948936
4,GradientBoosting Classifier,0.957447
5,AdaBoosting Classifier,0.961702


The best model is Random Forest Classifier has the highest accuracy score: 0.965957

In [54]:
import joblib

In [55]:
best_model = RandomForestClassifier()
best_model.fit(X_train, y_train)

In [56]:
filename = "best_model_rfc.pkl"
joblib.dump(best_model, filename)

['best_model_rfc.pkl']

In [57]:
filename = "best_model_random_forest.pkl"
loaded_model = joblib.load(filename)