In [2]:
import numpy as np
import pandas as pd

from scipy import stats
from matplotlib import pyplot as plt
from sklearn import linear_model
from mlxtend import feature_selection
from sklearn import preprocessing

In [3]:
# data

fy = lambda x1, x2, x3, e: 3 * x1 - 2 * x2 + x3 + e

n_vars = 3
size = 20

nois = stats.norm.rvs(size=size)

X = np.array([[stats.uniform.rvs(scale=10) for _ in range(n_vars)] for _ in range(size)])
y = fy(*(X[:, i] for i in range(n_vars)), nois)

In [4]:
lr_model = linear_model.LinearRegression()
lr_model.fit(X=X, y=y)

lr_model.coef_

array([ 3.00542962, -2.0805196 ,  1.02351958])

In [5]:
# rss, rse, nu

err = np.array(y - lr_model.predict(X=X))
n = len(err)

rss = err.dot(err)

rse = np.sqrt(rss / (n - 2))

tss = np.var(y) * n

nu = (tss - rss) / tss

print(f"RSS {rss}")
print(f"TSS {tss}")
print(f"RSE {rse}")
print(f"NU {nu}")

RSS 12.882075865451803
TSS 1561.946165171977
RSE 0.8459733337487589
NU 0.9917525481014045


In [6]:
# feature selector

In [26]:
# data 

path = r"/home/aser/Documents/labs/s7/big_data/laba8/data/w_data.csv"
tdata = pd.read_csv(path, sep=";", encoding="utf-16", comment="#", error_bad_lines=False)
tdata = pd.DataFrame({"datetime": tdata["Local time in Volgograd (airport)"], "T": tdata["T"]}).dropna()

print(tdata)

def line_criterius(datatime: str) -> bool:
    data, time = datatime.split(" ")
    day, mon, year = data.split(".")
    hour, minut = time.split(":")
    dt_dict = {
        "year": int(year),
        "mon": int(mon),
        "day": int(day),
        "hour": int(hour),
        "min": int(minut)  
    }
    return dt_dict

new_features = [
    "year",
    "mon",
    "day",
    "hour",
    "min"
]

for new_feature in new_features:
    tdata[new_feature] = tdata["datetime"].apply(lambda line: line_criterius(line)[new_feature])

tdata = tdata.loc[tdata["min"] == 0]
tdata = tdata.loc[tdata["hour"] == 0]
tdata = tdata.drop(["datetime", "min", "hour"], axis=1)
print(tdata)

y_data = tdata["T"]
X_data = tdata.drop("T", axis=1)

y = np.array(y_data)
X = np.array(X_data)

b'Skipping line 56081: expected 14 fields, saw 16\nSkipping line 63628: expected 14 fields, saw 16\n'


               datetime     T
0      01.01.2021 23:30  -2.0
1      01.01.2021 23:00  -3.0
2      01.01.2021 22:30  -3.0
3      01.01.2021 22:00  -3.0
4      01.01.2021 21:30  -3.0
...                 ...   ...
80993  01.01.2016 02:30 -13.0
80994  01.01.2016 02:00 -13.0
80995  01.01.2016 01:30 -12.0
80996  01.01.2016 01:00 -12.0
80997  01.01.2016 00:00 -12.0

[80994 rows x 2 columns]
          T  year  mon  day
47     -4.0  2021    1    1
95     -5.0  2020   12   31
142    -8.0  2020   12   30
190    -5.0  2020   12   29
238    -5.0  2020   12   28
...     ...   ...  ...  ...
80850 -20.0  2016    1    5
80884 -16.0  2016    1    4
80922 -16.0  2016    1    3
80962 -14.0  2016    1    2
80997 -12.0  2016    1    1

[1780 rows x 4 columns]


In [56]:
# poly regression

degree = 6
poly_reg = preprocessing.PolynomialFeatures(degree=degree)
X_poly = poly_reg.fit_transform(X)

lr_model = linear_model.LinearRegression()
feature_selector = feature_selection.SequentialFeatureSelector(lr_model, 
    k_features=32,
    forward=True)

features = feature_selector.fit(X=X_poly, y=y)
features.k_feature_idx_


(0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 16,
 17,
 18,
 22,
 23,
 30,
 31,
 37,
 43,
 50,
 58,
 63,
 66,
 67,
 71,
 72,
 77,
 78,
 83)

In [57]:
lr_model = linear_model.LinearRegression()
mX = X_poly[:, features.k_feature_idx_]
lr_model.fit(X=mX, y=y)
predict_y = lr_model.predict(X=mX)

In [58]:
err = np.array(y - predict_y)
n = len(err)

rss = err.dot(err)

rse = np.sqrt(rss / (n - 2))

tss = np.var(y) * n

nu = (tss - rss) / tss

print(f"N {n}")
print(f"RSS {rss}")
print(f"TSS {tss}")
print(f"RSE {rse}")
print(f"NU {nu}")

N 1780
RSS 36698.18949741297
TSS 203997.547752809
RSE 4.54314335822106
NU 0.8201047517400482
