In [3]:
import pandas as pd
import json

# Load the list of column names that should be parsed as 8-bit integers.
with open("option_columns.json", "r") as columns_file:
    option_columns = json.load(columns_file)

# Map every listed column to int8 and read the dataset with those dtypes;
# columns not listed keep whatever dtype pandas infers.
option_dtypes = dict.fromkeys(option_columns, "int8")
size = pd.read_csv("dataset_encoded_size.csv", dtype=option_dtypes)

# Number of rows loaded.
len(size)

110853

In [4]:
# Keep only rows with cid >= 30000, replace missing values with -1, then drop
# rows whose kernel_size is negative (this also removes rows whose
# kernel_size was missing and has just been filled with -1).
size = (
    size
    .query("cid >= 30000")
    .fillna(-1)
    .query("kernel_size >= 0")
)

# Number of rows remaining after filtering.
len(size)

92473

In [5]:
# Preview the first five rows of the filtered dataset.
size.head(5)

Unnamed: 0,104_QUAD_8,21285_WATCHDOG,3C515,53C700_BE_BUS,53C700_LE_ON_BE,60XX_WDT,64BIT,6LOWPAN,6LOWPAN_DEBUGFS,6LOWPAN_GHC_EXT_HDR_DEST,...,ZSWAP,ZX2967_PM_DOMAINS,ZX2967_THERMAL,ZX2967_WATCHDOG,ZX_DMA,ZX_I2S,ZX_SPDIF,ZX_TDM,cid,kernel_size
14758,2,0,0,0,0,1,0,1,0,1,...,0,0,0,0,1,1,1,1,30000,50222120
14759,1,0,0,0,0,2,0,1,0,1,...,0,0,1,0,0,1,1,1,30001,16660024
14760,1,0,0,0,0,2,0,1,0,1,...,0,0,1,0,1,1,1,1,30002,43080856
14761,1,0,0,0,0,1,0,1,0,1,...,0,0,1,0,1,1,1,1,30003,27261672
14762,1,0,0,0,0,1,0,1,0,1,...,0,0,1,0,1,1,1,2,30004,58769440


In [6]:
# Total number of columns in the dataset (second axis of the frame's shape).
size.shape[1]

12638

In [30]:
# Learn a multiple linear regression model that predicts kernel_size from the
# remaining columns of `size`.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Features are everything except the configuration id and the target.
# "Unnamed: 0" is also removed: it is presumably a row index that was written
# out with the CSV (TODO confirm), not an option, and would otherwise be
# fitted as a feature. errors="ignore" keeps the cell working if that column
# is not present.
features = (
    size
    .drop(columns=["cid", "kernel_size"])
    .drop(columns="Unnamed: 0", errors="ignore")
)
target = size["kernel_size"]

# A fixed random_state makes the 90/10 split, and therefore every number
# reported from this model, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=0
)

# Training data is handed to the estimator as plain arrays. x_test / y_test
# deliberately stay pandas objects: the validation cell calls Series methods
# on expressions built from y_test.
x_train, y_train = np.array(x_train.values), np.array(y_train.values)

model = LinearRegression().fit(x_train, y_train)

# NOTE: this R^2 is computed on the training data itself, so it is an
# optimistic figure; the held-out error is what the validation cell measures.
r_sq = model.score(x_train, y_train)
print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)
print('slope:', model.coef_)


coefficient of determination: 0.7164351231012369
intercept: -115284849455.55917
slope: [-5.15771067e+04  6.58999578e+13  3.90374873e+14 ... -5.90119620e+05
 -1.19811776e+05  4.88486824e+05]


In [31]:
# Number of fitted coefficients (length of the first axis of coef_).
print(model.coef_.shape[0])

12636


In [32]:
# Tally the fitted coefficients by sign using array reductions:
#   i = coefficients different from zero
#   p = coefficients strictly greater than zero
#   n = the remaining non-zero coefficients (everything counted in i but not in p)
coef = model.coef_
i = int((coef != 0).sum())
p = int((coef > 0).sum())
n = i - p

print('Number of options whose have inflence on size:', i)
print('Number of options whose have positive inflence on size:', p)
print('Number of options whose have negative inflence on size:', n)

Number of options whose have inflence on size: 11665
Number of options whose have positive inflence on size: 5947
Number of options whose have negative inflence on size: 5718


In [33]:
# Validation: predict on the held-out rows and summarise the relative error.
y_pred = model.predict(x_test)

# Signed difference between prediction and ground truth; subtracting the
# y_test Series from the prediction array yields a Series on y_test's index.
residual = y_pred - y_test

dfErrors = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    "error": residual.abs(),
    # Absolute error expressed as a percentage of the actual value.
    "% error": (residual / y_test).abs() * 100,
})
dfErrors["% error"].describe()

count    9248.000000
mean       53.773600
std        47.889136
min         0.005139
25%        18.483724
50%        40.829736
75%        74.909965
max       401.769611
Name: % error, dtype: float64

In [None]:
# Learn a degree-2 polynomial regression model that predicts kernel_size from
# the remaining columns of `size`.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Features are everything except the configuration id and the target.
# "Unnamed: 0" is also removed: it is presumably a row index that was written
# out with the CSV (TODO confirm), not an option, and would otherwise be
# expanded and fitted as a feature. errors="ignore" keeps the cell working if
# that column is not present.
features = (
    size
    .drop(columns=["cid", "kernel_size"])
    .drop(columns="Unnamed: 0", errors="ignore")
)
target = size["kernel_size"]

# A fixed random_state makes the 90/10 split, and therefore every number
# reported from this model, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=0
)

# Training data is handed on as plain arrays; x_test / y_test stay pandas
# objects for the validation cell.
x_train, y_train = np.array(x_train.values), np.array(y_train.values)

# For 2 options the fitted function is
#   f(x1, x2) = b0 + b1*x1 + b2*x2 + b3*x1^2 + b4*x1*x2 + b5*x2^2
# In general n inputs expand to n*(n+3)/2 columns when the bias is excluded.
# NOTE(review): with roughly 12.6k option columns that is about 80 million
# expanded columns per row - confirm this fits in memory before running, or
# reduce the number of input columns first.
x_train_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x_train)
model = LinearRegression().fit(x_train_, y_train)

# NOTE: this R^2 is computed on the training data itself, so it is an
# optimistic figure.
r_sq = model.score(x_train_, y_train)
print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)
print('slope:', model.coef_)


In [None]:
# Validation: expand the held-out rows to degree-2 polynomial features,
# predict, and summarise the relative error.
poly_expander = PolynomialFeatures(degree=2, include_bias=False)
x_test_ = poly_expander.fit_transform(x_test)
y_pred = model.predict(x_test_)

# Signed difference between prediction and ground truth; subtracting the
# y_test Series from the prediction array yields a Series on y_test's index.
residual = y_pred - y_test

dfErrors = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    "error": residual.abs(),
    # Absolute error expressed as a percentage of the actual value.
    "% error": (residual / y_test).abs() * 100,
})
dfErrors["% error"].describe()