In [1]:
# Location of the input CSV; the loading cell passes this to pd.read_csv.
path = "data.csv"
import pandas as pd
import numpy as np

In [2]:
# Read the raw measurements into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer=path, encoding="utf-8")
data.head(n=5)

Unnamed: 0,Date,Total Solids,SS,BOD5,NH3,Org-N,P-TOT,SO4,TKN,PRCP_NOOA
0,01/01/2001,984.0,170.0,173,15.93,14.0,4.58,72.0,29.97,0.0
1,02/01/2001,924.0,122.0,174,15.52,14.4,4.52,72.0,29.96,0.0
2,03/01/2001,1135.0,300.0,303,14.19,21.9,6.06,,36.11,0.0
3,04/01/2001,1013.0,192.0,228,15.43,19.4,5.33,,34.83,0.0
4,05/01/2001,1088.0,226.0,210,14.77,19.3,5.41,,34.06,0.0


In [3]:
# The date column is not used as a model input; remove it from the frame in place.
data.drop(columns=['Date'], inplace=True)

In [4]:
# Preview the frame after the date column was removed.
data.head(n=5)

Unnamed: 0,Total Solids,SS,BOD5,NH3,Org-N,P-TOT,SO4,TKN,PRCP_NOOA
0,984.0,170.0,173,15.93,14.0,4.58,72.0,29.97,0.0
1,924.0,122.0,174,15.52,14.4,4.52,72.0,29.96,0.0
2,1135.0,300.0,303,14.19,21.9,6.06,,36.11,0.0
3,1013.0,192.0,228,15.43,19.4,5.33,,34.83,0.0
4,1088.0,226.0,210,14.77,19.3,5.41,,34.06,0.0


In [5]:
# Count the missing entries in each column.
data.isna().sum()

Total Solids      7
SS               10
BOD5            186
NH3               9
Org-N             9
P-TOT             6
SO4               2
TKN               7
PRCP_NOOA         0
dtype: int64

In [6]:
# Number of rows before rows with missing values are dropped.
data.shape[0]

6574

In [7]:
# Discard, in place, every row that has at least one missing value.
data.dropna(axis=0, how="any", inplace=True)

In [8]:
# Number of rows left after dropping rows with missing values.
data.shape[0]

6381

In [9]:
cols = data.columns

# Coerce every column to a numeric dtype in one vectorised pass per column.
# Entries that cannot be parsed as a number become NaN, so the null count and
# dropna() in the following cells pick them up.
#
# This replaces a per-cell loop that wrapped float() in a bare `except:`; the
# bare except also swallowed KeyboardInterrupt/SystemExit, and the scalar
# .loc writes left text columns as object dtype holding Python floats.
for col in cols:
    data[col] = pd.to_numeric(data[col], errors="coerce")

In [10]:
# Count, per column, the entries that could not be converted to a number.
data.isna().sum()

Total Solids       0
SS                 0
BOD5               2
NH3                0
Org-N              0
P-TOT              1
SO4             5467
TKN                0
PRCP_NOOA          0
dtype: int64

In [11]:
# SO4 is missing in most rows after numeric conversion (see the counts above),
# so remove the whole column in place rather than losing those rows.
data.drop(columns=['SO4'], inplace=True)

In [12]:
# Discard, in place, the few remaining rows that still hold a missing value.
data.dropna(axis=0, how="any", inplace=True)

In [13]:
# Number of rows available for modelling.
data.shape[0]

6378

In [14]:
# Extremes and spread of the TKN column. `min_value` and `range_value` are
# read as globals by transform() when it lays out its bins.
min_value, max_value = data['TKN'].min(), data['TKN'].max()
range_value = max_value - min_value
print(f"""min = {min_value}
max = {max_value}
range = {range_value}
""")

min = 0.4
max = 142.3
range = 141.9



In [15]:
def transform(x, k, lo=None, span=None):
    """Return the index of the equal-width bin that contains ``x``.

    The interval ``[lo, lo + span]`` is cut into ``k`` bins of equal width.
    A value lying exactly on an interior bin edge is assigned to the lower
    bin. Values at or below ``lo`` map to bin 0 and values at or above
    ``lo + span`` map to bin ``k - 1``.

    The previous edge-scanning loop fell through to ``return 0`` whenever no
    bin matched, so values above the range (and, when float rounding made the
    last right edge land just short of it, the maximum itself) ended up in
    bin 0. Computing the index arithmetically and clamping avoids that.

    Parameters
    ----------
    x : number
        Value to discretise.
    k : int
        Number of bins.
    lo : number, optional
        Lower end of the binned interval. Defaults to the module-level
        ``min_value``.
    span : number, optional
        Width of the binned interval. Defaults to the module-level
        ``range_value``.

    Returns
    -------
    int
        Zero-based bin index in ``0 .. k - 1``; 0 when the bin width is not
        positive or ``x`` is NaN.
    """
    if lo is None:
        lo = min_value
    if span is None:
        span = range_value

    step = span / k
    if not step > 0:
        # Degenerate layout (zero-width interval or non-positive k): one bin.
        return 0

    offset = (x - lo) / step
    if not offset > 0:
        # At or below the lower end; also catches NaN, which compares False.
        return 0
    if offset >= k:
        # At or beyond the upper end: clamp into the last bin.
        return k - 1

    idx = int(offset)
    if idx == offset:
        # Exactly on an interior edge: belongs to the lower bin.
        idx -= 1
    return idx

def transform_data(data, k):
    """Discretise every value of a Series into one of ``k`` bin indices.

    Each element is passed through ``transform(value, k)``. The input is not
    modified; a new Series with the same index and name is returned.

    ``Series.map`` replaces the earlier element-by-element loop, which wrote
    each result back into a copy of the input and therefore kept the input's
    dtype (float labels for a float column). The mapped result holds the
    integer bin indices returned by ``transform``.

    Parameters
    ----------
    data : pandas.Series
        Values to discretise.
    k : int
        Number of bins.

    Returns
    -------
    pandas.Series
        Bin index for every element of ``data``.
    """
    return data.map(lambda value: transform(value, k))

In [16]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def applyModel(X, y, test_size=0.2, random_state=1, average='micro'):
    """Fit a linear-kernel SVM on a train split and print hold-out metrics.

    The data is split with ``train_test_split``, an ``svm.SVC`` with a linear
    kernel is fitted on the training part, and accuracy, precision, recall and
    F1 on the held-out part are printed.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature table.
    y : array-like or Series
        Class labels aligned with ``X``.
    test_size : float, optional
        Fraction of rows held out for evaluation (default 0.2).
    random_state : int, optional
        Seed for the train/test split (default 1).
    average : str, optional
        Averaging mode handed to precision, recall and F1 (default 'micro').
        With 'micro' averaging on single-label multiclass targets these three
        scores equal the accuracy, so pass e.g. 'macro' or 'weighted' to get
        additional information.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf = svm.SVC(kernel='linear')  # linear kernel
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(f"accuracy score = {accuracy_score(y_test, y_pred)}")
    print(f"Precision = {precision_score(y_test, y_pred, average=average)}")
    print(f"Recall = {recall_score(y_test, y_pred, average=average)}")
    print(f"F1 Score = {f1_score(y_test, y_pred, average=average)}")

In [17]:
# Features are every remaining column except the target; the target is TKN.
# NOTE(review): in the first row displayed by data.head(), TKN is close to
# NH3 + Org-N (15.93 + 14.0 vs 29.97). Confirm that keeping both of those
# columns in X is intended and does not leak the target.
X = data.drop(columns=['TKN'])
y = data['TKN']

# Discretise the target into 20, 40 and 80 equal-width classes.
y_20, y_40, y_80 = (transform_data(y, bins) for bins in (20, 40, 80))

In [18]:
# Evaluate the classifier on the 20-bin target.
applyModel(X=X, y=y_20)

accuracy score = 0.9749216300940439
Precision = 0.9749216300940439
Recall = 0.9749216300940439
F1 Score = 0.9749216300940439


In [19]:
# Evaluate the classifier on the 40-bin target.
applyModel(X=X, y=y_40)

accuracy score = 0.9553291536050157
Precision = 0.9553291536050157
Recall = 0.9553291536050157
F1 Score = 0.9553291536050157


In [20]:
# Evaluate the classifier on the 80-bin target.
applyModel(X=X, y=y_80)

accuracy score = 0.9333855799373041
Precision = 0.9333855799373041
Recall = 0.9333855799373041
F1 Score = 0.9333855799373041
