In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
main_dataset = pd.read_csv("main.csv")

In [3]:
main_dataset.shape

(324, 3)

In [4]:
main_dataset.head()

Unnamed: 0,Rainfall,River,Label
0,6.2,3.914839,0
1,54.9,4.627857,0
2,16.4,3.305161,0
3,56.8,4.457667,0
4,167.4,9.347742,1


In [5]:
main_dataset.tail()

Unnamed: 0,Rainfall,River,Label
319,158.2,2.917667,0
320,258.5,2.738333,0
321,8.2,2.1357,0
322,0.0,2.347419,0
323,102.0,6.1,1


In [6]:
main_dataset.fillna('')

Unnamed: 0,Rainfall,River,Label
0,6.2,3.914839,0
1,54.9,4.627857,0
2,16.4,3.305161,0
3,56.8,4.457667,0
4,167.4,9.347742,1
...,...,...,...
319,158.2,2.917667,0
320,258.5,2.738333,0
321,8.2,2.135700,0
322,0.0,2.347419,0


In [7]:
main_dataset.describe()

Unnamed: 0,Rainfall,River,Label
count,324.0,324.0,324.0
mean,109.379938,13.458754,0.373457
std,125.759874,22.483928,0.48447
min,0.0,1.065419,0.0
25%,7.15,2.738333,0.0
50%,58.05,4.274333,0.0
75%,167.25,15.902905,1.0
max,485.4,166.8161,1.0


In [8]:
main_dataset['Label'].value_counts()

0    203
1    121
Name: Label, dtype: int64

In [9]:
X = main_dataset.drop(columns = 'Label', axis = 1)
Y = main_dataset['Label']

In [10]:
print(X)

     Rainfall     River
0         6.2  3.914839
1        54.9  4.627857
2        16.4  3.305161
3        56.8  4.457667
4       167.4  9.347742
..        ...       ...
319     158.2  2.917667
320     258.5  2.738333
321       8.2  2.135700
322       0.0  2.347419
323     102.0  6.100000

[324 rows x 2 columns]


In [11]:
print(Y)

0      0
1      0
2      0
3      0
4      1
      ..
319    0
320    0
321    0
322    0
323    1
Name: Label, Length: 324, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y, random_state = 42)

In [13]:
print(X_train.shape, Y_train.shape,X_test.shape,Y_test.shape)

(259, 2) (259,) (65, 2) (65,)


In [14]:
print(X_train)

     Rainfall      River
118       6.2   3.528387
224     257.0   3.267097
61       25.3   3.163871
39        6.0   1.065419
301       3.6   4.370000
..        ...        ...
34        0.0   3.358387
172     189.0  51.883870
122      11.6   2.531290
128     199.1   4.339333
130       0.8   2.567419

[259 rows x 2 columns]


In [15]:
print(X_test)

     Rainfall      River
1        54.9   4.627857
93       11.9   2.990667
148     119.5  20.223870
252       4.0   1.523000
142       0.0   1.972323
..        ...        ...
120      15.6   2.263226
36        4.2   1.795714
158      86.3   3.939355
169      19.1   2.164667
319     158.2   2.917667

[65 rows x 2 columns]


In [16]:
#print(Y_train)

In [17]:
class LogisticRegression():
    
    def __init__(self, learning_rate, no_of_iterations):
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations
        
    def fit(self, X, Y):
        self.m, self.n = X.shape
        
        self.w = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        
        for i in range(self.no_of_iterations):
            self.update_weights()
            
    def update_weights(self):
        
        Y_hat = 1/(1 + np.exp(-((self.X).dot(self.w) + self.b)))
        
        dw = (self.X.T).dot(Y_hat-self.Y) / self.m

        db = np.sum(Y_hat - self.Y) / self.m
        
        self.w = self.w - self.learning_rate*dw
        self.b = self.b - self.learning_rate*db
        
        
    def predict(self, X):
        
        Y_pred = 1 / (1 + np.exp( - (X.dot(self.w) + self.b ) )) 
        Y_pred = np.where( Y_pred > 6, 1, 0)
        return Y_pred


In [18]:
model = LogisticRegression(learning_rate = 0.01, no_of_iterations = 10000)

In [19]:
#training the model
model.fit(X_train, Y_train)

In [20]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [21]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.6307692307692307


In [22]:
X_train_prediction = model.predict(X_train) 
train_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [23]:
print('Accuracy score of the train data : ', train_data_accuracy)

Accuracy score of the train data :  0.6254826254826255


In [24]:
from joblib import dump


In [25]:
dump(model, './model.joblib')

['./model.joblib']

In [26]:
rain = np.array([[428.5, 35.73226]])

In [27]:
print(rain)

[[428.5      35.73226]]


In [28]:
result = model.predict(rain)
print(result)

[0]


In [29]:
from sklearn import metrics

In [30]:
MSE_train = metrics.mean_absolute_error(Y_train, X_train_prediction)
MSE_test = metrics.mean_absolute_error(Y_test, X_test_prediction)

In [31]:
print(MSE_train)

0.3745173745173745


In [32]:
print(MSE_test)

0.36923076923076925
