## Imports

In [1]:
import json
import csv
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.over_sampling import SMOTE, SMOTEN

## Functions

In [3]:
def load_dummy():
    return np.array([np.random.randint(100, size=3) for i in range(100)]), np.array([np.random.randint(10, size=1)[0] for i in range(100)])

In [4]:
def load_json(filename):
    with open(filename) as f:
        d = json.load(f)
    return d["x"], d["y"]

In [5]:
def load_csv(filename):
    with open(filename) as f:
        csvr = csv.reader(f)
        rows = []
        for r in csvr:
            rows.append(r)
    return rows

In [6]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [7]:
def eval_LR(x, filename):
    with open(filename) as f:
        d = json.load(f)
    p_w = d["w"]
    p_i = d["i"]
    ans = sigmoid(p_w@x + p_i)
    if ans > 0.5:
        return 1
    else:
        return 0

## Training

In [8]:
r1 = load_csv("../../data/dataset_annotated_v1.csv")
r2 = load_csv("../../data/dataset_annotated_v2.csv")
r3 = load_csv("../../data/dataset_annotated_v3.csv")
r = r1[1:] + r2[1:] + r3[1:]

In [9]:
SS = 0.77

In [10]:
x = []
y = []

for i in r:
    x.append([int(j) for j in i[3:6]])
    y.append(int(i[6]))

print(Counter(y))

oversample = SMOTEN(random_state=42, sampling_strategy=SS)
x, y = oversample.fit_resample(x, y)

print(Counter(y))

x = np.asarray(x, dtype='float64')
y = np.asarray(y, dtype='float64')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

lf = linear_model.LogisticRegression()

lf.fit(x_train, y_train)

x_test_0 = []
x_test_1 = []
y_test_0 = []
y_test_1 = []

for indx, i in enumerate(x_test):
    if y_test[indx] == 0:
        x_test_0.append(i)
        y_test_0.append(0)
    if y_test[indx] == 1:
        x_test_1.append(i)
        y_test_1.append(1)

y_pred = lf.predict(x_test)
y_pred_0 = lf.predict(x_test_0)
y_pred_1 = lf.predict(x_test_1)

Counter({1: 1001, 0: 169})
Counter({1: 1001, 0: 770})


## SS Tune

In [25]:
SS_TUNE = []

In [26]:
for SS in range(25, 101):
    SS = float(SS)/100
    x = []
    y = []

    for i in r:
        x.append([int(j) for j in i[3:6]])
        y.append(int(i[6]))

    oversample = SMOTEN(random_state=42, sampling_strategy=SS)
    x, y = oversample.fit_resample(x, y)


    x = np.asarray(x, dtype='float64')
    y = np.asarray(y, dtype='float64')

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

    lf = linear_model.LogisticRegression()

    lf.fit(x_train, y_train)

    x_test_0 = []
    x_test_1 = []
    y_test_0 = []
    y_test_1 = []

    for indx, i in enumerate(x_test):
        if y_test[indx] == 0:
            x_test_0.append(i)
            y_test_0.append(0)
        if y_test[indx] == 1:
            x_test_1.append(i)
            y_test_1.append(1)

    y_pred = lf.predict(x_test)
    y_pred_0 = lf.predict(x_test_0)
    y_pred_1 = lf.predict(x_test_1)
    SS_TUNE.append(lf.score(x_test_0, y_test_0) + lf.score(x_test_1, y_test_1))

In [27]:
maxi = (0, 0)

for indx, i in enumerate(SS_TUNE):
    if i > maxi[0]:
        maxi = (i, 25 + indx)

In [28]:
maxi

(1.771608643457383, 77)

In [29]:
SS = maxi[1]/100.0

In [30]:
x = []
y = []

for i in r:
    x.append([int(j) for j in i[3:6]])
    y.append(int(i[6]))

print(Counter(y))

oversample = SMOTEN(random_state=42, sampling_strategy=SS)
x, y = oversample.fit_resample(x, y)

print(Counter(y))

x = np.asarray(x, dtype='float64')
y = np.asarray(y, dtype='float64')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

lf = linear_model.LogisticRegression()

lf.fit(x_train, y_train)

x_test_0 = []
x_test_1 = []
y_test_0 = []
y_test_1 = []

for indx, i in enumerate(x_test):
    if y_test[indx] == 0:
        x_test_0.append(i)
        y_test_0.append(0)
    if y_test[indx] == 1:
        x_test_1.append(i)
        y_test_1.append(1)

y_pred = lf.predict(x_test)
y_pred_0 = lf.predict(x_test_0)
y_pred_1 = lf.predict(x_test_1)

Counter({1: 1001, 0: 169})
Counter({1: 1001, 0: 770})


## Results

In [11]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.95      0.79      0.86       294
         1.0       0.82      0.96      0.88       291

    accuracy                           0.88       585
   macro avg       0.89      0.88      0.87       585
weighted avg       0.89      0.88      0.87       585



In [12]:
print(classification_report(y_pred_0, y_test_0, zero_division=True))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97       233
         1.0       1.00      0.00      0.00        12

    accuracy                           0.95       245
   macro avg       0.98      0.50      0.49       245
weighted avg       0.95      0.95      0.93       245



In [14]:
print(classification_report(y_pred_1, y_test_1, zero_division=True))

              precision    recall  f1-score   support

         0.0       1.00      0.00      0.00        61
         1.0       0.82      1.00      0.90       279

    accuracy                           0.82       340
   macro avg       0.91      0.50      0.45       340
weighted avg       0.85      0.82      0.74       340



In [15]:
def print_misclass():
    for indx, y1 in enumerate(y_pred_1):
        if y1 != y_test_1[indx]:
            print(x_test_1[indx])

In [16]:
lf.score(x_test, y_test)

0.8752136752136752

In [17]:
lf.coef_, lf.intercept_

(array([[-0.57813246,  0.11088761,  1.41673913]]), array([-1.69675788]))

In [18]:
lf.coef_.tolist()[0], lf.intercept_.tolist()[0]

([-0.578132462427741, 0.11088760591222886, 1.4167391261129125],
 -1.696757879709485)

In [19]:
with open("./weights_lr2.json", "w") as f:
    json.dump({
        "w" : lf.coef_.tolist()[0],
        "i": lf.intercept_.tolist()[0]
    }, f)

## Testing

In [20]:
filename = "weights_lr2.json"

In [21]:
y_pred = [eval_LR(x, filename) for x in x_test]
y_pred_0 = [eval_LR(x, filename) for x in x_test_0]
y_pred_1 = [eval_LR(x, filename) for x in x_test_1]

In [22]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.79      0.86       294
           1       0.82      0.96      0.88       291

    accuracy                           0.88       585
   macro avg       0.89      0.88      0.87       585
weighted avg       0.89      0.88      0.87       585



In [23]:
print(classification_report(y_pred_0, y_test_0, zero_division=True))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       233
           1       1.00      0.00      0.00        12

    accuracy                           0.95       245
   macro avg       0.98      0.50      0.49       245
weighted avg       0.95      0.95      0.93       245



In [24]:
print(classification_report(y_pred_1, y_test_1, zero_division=True))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00        61
           1       0.82      1.00      0.90       279

    accuracy                           0.82       340
   macro avg       0.91      0.50      0.45       340
weighted avg       0.85      0.82      0.74       340

