# Linear Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import re

In [None]:
def train_test_val_split(X, y, test_size=0.2, val_size=0.25, random_state=None):
  """Split X and y into train, test and validation subsets.

  The data is split twice with the same seed: first a test subset is carved
  out of the full data, then a validation subset is carved out of the
  remaining training data. With the defaults (0.2, then 0.25 of the remaining
  0.8) this yields a 60/20/20 train/test/validation partition.

  Args:
    X: feature table accepted by sklearn's train_test_split.
    y: target values aligned with X.
    test_size: fraction of the full data held out as the test subset.
    val_size: fraction of the remaining training data held out as the
      validation subset.
    random_state: seed used for both splits. When None (the default) a fresh
      random seed in [1, 100000] is drawn, so every call produces a
      different partition.

  Returns:
    An 8-tuple (X_train, X_test, y_train, y_test,
                X_train, X_val, y_train, y_val).
    Slots 0 and 4 are the same object, and so are slots 2 and 6: both refer
    to the final training subset, i.e. what is left after the validation
    rows were removed.
  """
  import random  # local import so the function does not rely on another cell

  if random_state is None:
    random_state = random.randint(1, 100000)

  # split to train test validation
  # https://datascience.stackexchange.com/a/41436
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state)
  # The second split rebinds X_train / y_train to the reduced training subset.
  X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=val_size, random_state=random_state)

  return X_train, X_test, y_train, y_test, X_train, X_val, y_train, y_val

In [None]:
# Registry of training results: maps a bin size to the list of result dicts
# produced for it, one per cross-validation iteration.
bins_models = dict()

In [None]:
# Number of independent random train/test/validation splits per bin size.
cross_validations_count = 3

# Greedy forward feature selection with a linear regression, repeated
# `cross_validations_count` times for every bin size. In each round every
# not-yet-selected feature is tried together with the features selected so
# far; the candidate whose prediction has the highest Spearman correlation
# with the held-out target is kept. Selection stops as soon as the best
# candidate no longer improves on the previous round.
for bin_size in bins_sizes:
  # one row per bin: take the first row of every 'bin' group
  df = bins_df[bin_size].groupby('bin', as_index=False).first()

  X = df.loc[:, df.columns.str.startswith('group_')]  # feature columns only
  y = df['protein_level_mean']  # regression target

  bins_models[bin_size] = []  # one result dict per cross-validation iteration

  for cross_validation_iteration in range(cross_validations_count):
    X_train, X_test, y_train, y_test, X_train, X_val, y_train, y_val = train_test_val_split(X, y)

    regressor = LinearRegression(n_jobs=-1)

    selected_features_corrs = []  # best correlation reached after each round
    selected_features_names = []  # feature chosen in each round, same order

    # NOTE(review): candidates are scored on the test split (X_test / y_test);
    # X_val / y_val are only stored in `sets` below and not used in this cell.
    while True:
      # Only features that were not chosen yet are candidates; re-adding a
      # selected feature would just duplicate a column in the fit.
      candidates = [name for name in X.columns if name not in selected_features_names]
      if not candidates:
        break  # every feature is already selected (or there are none)

      candidates_corrs = []

      # get corrs for all candidate features
      for feature in progressbar.progressbar(candidates):
        # always fit with current feature and previous best features
        features_for_fit = [*selected_features_names, feature]
        try:
          fitted = regressor.fit(X_train[features_for_fit], y_train)
        except ValueError:
          # dump the offending input before propagating the original error
          print(features_for_fit)
          print(X_train[features_for_fit])
          raise

        # predicted protein level = intercept + sum(coefficient * feature)
        predicted = fitted.intercept_
        for name, coef in zip(features_for_fit, fitted.coef_):
          predicted = predicted + X_test[name] * coef

        candidates_corrs.append(predicted.corr(y_test, method='spearman'))

      clear_output(wait=True)

      # the candidate with the highest correlation wins this round
      best_feature_corr = max(candidates_corrs)
      best_feature_name = candidates[candidates_corrs.index(best_feature_corr)]

      # Stop when the current correlation is not better than the one before.
      # Written as "not >" so that a NaN correlation also ends the loop
      # instead of being accepted forever.
      if selected_features_corrs and not best_feature_corr > selected_features_corrs[-1]:
        break

      if selected_features_names:
        print(
          f'(CV:{cross_validation_iteration})   [{bin_size}--{len(selected_features_corrs) + 1}] New best feature ({best_feature_name}) with {best_feature_corr} against {selected_features_corrs[-1]}')

      selected_features_corrs.append(best_feature_corr)
      selected_features_names.append(best_feature_name)

    # Refit on exactly the selected features so that the stored model, its
    # coefficients and its intercept all describe the same fit. Without this
    # the regressor would still hold the last candidate fit of the rejected
    # final round.
    if selected_features_names:
      regressor.fit(X_train[selected_features_names], y_train)
      selected_features_coefs = list(regressor.coef_)
      latest_intercept = regressor.intercept_
    else:
      selected_features_coefs = []
      latest_intercept = None

    bins_models[bin_size].append(dict(
      selected_features_names=selected_features_names,  # these three lists are aligned by index
      selected_features_corrs=selected_features_corrs,
      selected_features_coefs=selected_features_coefs,
      intercept=latest_intercept,
      model=regressor,
      sets=(X_train, X_test, y_train, y_test,
            X_train, X_val, y_train, y_val)
    ))

clear_output(wait=True)
# every bin size trained in this run got exactly cross_validations_count models
print(f"Trained {len(bins_models)} models, each {cross_validations_count} times")
for bin_size, models in bins_models.items():
  for model_i, model in enumerate(models):
    corrs = model['selected_features_corrs']
    final_corr = corrs[-1] if corrs else None  # None when nothing was selected
    print(
      f"[MODEL={model_i}]@> {bin_size} bins division has {len(model['selected_features_names'])} significant features and a success correlation of {final_corr}\n")

In [None]:
# save trained models

# The next round number is one more than the highest 'Round<N>' found among
# the file names in ../models. Names without a round number are skipped, and
# an empty directory starts the numbering at 1.
# NOTE(review): the round number is derived from ../models, but the pickle
# below is written to the current working directory - confirm that this is
# the intended location, otherwise the counter will not advance.
round_matches = (re.search(r'Round(\d+)', fn) for fn in os.listdir("../models"))
file_idx = max((int(m.group(1)) for m in round_matches if m), default=0) + 1

with open(f'ModelOnBinsResults_{cross_validations_count}CrossValidations_Round{file_idx}.pickle', 'wb') as f:
  pickle.dump(bins_models, f)