Add

https://colab.research.google.com/drive/1b6ubEvtESajClbxcc7YZ7vtVhj2mdidz

# Imports + Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc

import plotly.graph_objects as go
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
STD_DATA_PATH = 'https://raw.githubusercontent.com/auplt/Flight-Satisfaction/data/data_std.pkl'
Y_FEATURE = 'satisfaction'
TEST_SIZE = 0.3

GRAPH_WIDTH = 700
GRAPH_HEIGHT = 500

In [3]:
# Загрузка данных
df = pd.read_pickle(STD_DATA_PATH)

X, y = df.drop([Y_FEATURE], axis=1), df[Y_FEATURE]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y)

y_train = y_train.to_frame()
y_test = y_test.to_frame()

X_train = X_train.reset_index().drop(columns='index')
X_test = X_test.reset_index().drop(columns='index')
y_train = y_train.reset_index().drop(columns='index')
y_test = y_test.reset_index().drop(columns='index')

In [4]:
DELTA = 2
ALL_FEATURES = list(X_train.columns)
COUNT_FEATURES = len(ALL_FEATURES)

# Model

In [5]:
C = 0.001
TOL = 100
CLASS_WEIGHT = "balanced"

In [6]:
CLF = LogisticRegression(C=C, tol=TOL, class_weight=CLASS_WEIGHT)
CLF

In [7]:
class Model():
  def __init__(self, columns):
    self.clf = CLF
    self.subset_X_train = X_train[columns]
    self.subset_X_test =  X_test[columns]

  def fit(self):
    self.clf.fit(self.subset_X_train, y_train)


  def y_pred_train(self):
    return self.clf.predict(self.subset_X_train)

  def y_pred_test(self):
    return self.clf.predict(self.subset_X_test)


  def train_score(self):
    return f1_score(y_train, self.y_pred_train())

  def test_score(self):
    return f1_score(y_test, self.y_pred_test())


  def y_pred_train_prob(self):
    return self.clf.predict_proba(self.subset_X_train)[:, 1]

  def y_pred_test_prob(self):
    return self.clf.predict_proba(self.subset_X_test)[:, 1]


  def score(self):
    return [accuracy_score(y_test, self.y_pred_test()), precision_score(y_test, self.y_pred_test()), recall_score(y_test, self.y_pred_test()), f1_score(y_test, self.y_pred_test()), roc_auc_score(y_test, self.y_pred_test())]

# Add-Del

In [8]:
class Add_Dell():
  def __init__(self, delta, all_features, count_features, metric='f1', test_mode=False):
    self.test_mode = test_mode
    self.all_features = all_features.copy()
    self.count_features = count_features
    self.delta = delta
    self.metric = metric

    self.iteration = 0
    self.ts_iteration = 0

    self.stats = pd.DataFrame(columns=['type', 'iteration', 'complexity', 'best', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])
    self.stats.loc[0] = ['none', 0, 0, True, 0, 0, 0, 0, 0]
    self.last = 0

    self.best = {
      'index': 0,
      'iteration': 0,
      'features': []
    }
    self.ts_best = {
      'index': 0,
      'ts_iteration': 0,
      'features': []
    }

  def print_results(self):
    print("="*5, " RESULTS ", "="*5)
    print(f"Complexity: {len(self.best['features'])}")
    print(f"Test score: {self.stats[self.metric][self.best['index']]:.3f}")
    print("Features")
    feat_num = 0
    for feat in self.best['features']:
      feat_num += 1
      print(f"{feat_num}. {feat}")


  def add_stop(self):
    if self.iteration - self.best['iteration'] >= self.delta:
      return False
    if len(self.all_features) == 0:
      return False
    return True

  def add_new(self, cur_features):
    best = -1
    complexity = len(cur_features) + 1
    for feat in self.all_features:
      cur_features.append(feat)

      mdl = Model(cur_features)
      mdl.fit()
      sc = mdl.score()

      self.last += 1
      self.stats.loc[self.last] = ['add', self.iteration, complexity, False, sc[0], sc[1], sc[2], sc[3], sc[4]]

      if best == -1 or self.stats[self.metric][self.last] > self.stats[self.metric][best]:
        best = self.last
        ff = feat

      cur_features.pop()

    self.stats['best'][best] = True
    return best, ff

  def add_run(self):
    best, nf = self.add_new(self.best['features'])
    # best = self.stats.loc[self.stats['iteration'] == self.iteration].sort_values(by=self.metric, ascending=False)[0].index

    self.best['features'].append(nf)
    self.all_features.remove(nf)

    if self.stats[self.metric][best] > self.stats[self.metric][self.best['index']]:
      self.best['index'] = best
      self.best['iteration'] = self.iteration

    if self.test_mode:
      print(f"--iteration {self.iteration}--")
      print(f'add feature {nf}')
      print(f'current best model index {self.best["index"]}')
      print(f'current best model metric {self.stats[self.metric][self.best["index"]]:.3f}')


  def del_stop(self):
    if self.iteration - self.best['iteration'] >= self.delta:
      return False
    if len(self.best['features']) == 0:
      return False
    return True

  def del_new(self, cur_features):
    best = -1
    complexity = len(cur_features) - 1
    for feat in cur_features:
      cur_features.remove(feat)

      mdl = Model(cur_features)
      mdl.fit()
      sc = mdl.score()

      self.last += 1
      self.stats.loc[self.last] = ['del', self.iteration, complexity, False, sc[0], sc[1], sc[2], sc[3], sc[4]]

      if best == -1 or self.stats[self.metric][self.last] > self.stats[self.metric][best]:
        best = self.last
        ff = feat

      cur_features.append(feat)

    self.stats['best'][best] = True
    return best, ff

  def del_run(self):
    best, nf = self.del_new(self.best['features'])   # add all new models
    # best = self.stats.loc[self.stats['iteration'] == self.iteration].sort_values(by=self.metric, ascending=False)[0].index

    self.best['features'].remove(nf)
    self.all_features.append(nf)

    if self.stats[self.metric][best] > self.stats[self.metric][self.best['index']]:
      self.best['index'] = best
      self.best['iteration'] = self.iteration

    if self.test_mode:
      print(f"--iteration {self.iteration}--")
      print(f'del feature {nf}')
      print(f'current best model index {self.best["index"]}')
      print(f'current best model metric {self.stats[self.metric][self.best["index"]]:.3f}')


  def is_stop(self):
    if self.ts_iteration - self.ts_best['ts_iteration'] >= self.delta:
      return False
    return True

  def run(self):
    while (self.is_stop()):
      self.ts_iteration += 1

      if self.test_mode:
        print(f"\n-----Algorithm iteration {self.ts_iteration}-----")
        print("---Add---")
      flag = len(self.all_features) > 0
      while (self.add_stop() or flag):
        flag = False
        self.iteration += 1
        self.add_run()

      if self.test_mode:
        print("---Del---")
      flag = len(self.best['features']) > 0
      while (self.del_stop() or flag):
        flag = False
        self.iteration += 1
        self.del_run()

      if self.best['index'] != self.ts_best['index']:
        self.ts_best['index'] = self.best['index']
        self.ts_best['features'] = self.best['features']
        self.ts_best['ts_iteration'] = self.ts_iteration

      if self.test_mode:
        print("---ts_iteration result---")
        print(f'ts_best model index {self.ts_best["index"]}')
        print(f'ts_best model metric {self.stats[self.metric][self.ts_best["index"]]:.3f}')
        print(f'ts_best features -', end=" ")
        for a in self.ts_best['features']:
          print(a, end=" ")

    rs.stats = rs.stats.loc[rs.stats['type']!='none']


# Graphs

In [9]:
def my_points():
  for i in range(1, rs.iteration):
    res = rs.stats.loc[rs.stats['iteration']==i].loc[rs.stats['best']]
    line_compl.append(res['complexity'].values[0])
    line_metr.append(res[rs.metric].values[0])

In [10]:
def line():
  dots = go.Scatter(x=rs.stats['complexity'], y=rs.stats[rs.metric], mode='markers',
                  marker=dict(color='grey', size=10, opacity=0.8), name='Metrics')

  max_line = go.Scatter(x=line_compl, y=line_metr, mode='lines',
                        line=dict(color="crimson", width=2), name='Maximal line')

  layout = go.Layout(
      title='Dependence of the metric value on the number of features',
      xaxis=dict(title='Featues', range=[ -0.5, max(line_compl) + 0.5 ], dtick=1),
      yaxis=dict(title='Metrics'),
      showlegend=True,
      width=GRAPH_WIDTH,
      height=GRAPH_HEIGHT
  )

  fig = go.Figure(data=[dots, max_line], layout=layout)

  fig.show()


In [12]:
def roc():
  mdl = Model(rs.best['features'])
  mdl.fit()
  y_pred_test_proba = mdl.y_pred_test_prob()
  y_pred_train_proba = mdl.y_pred_train_prob()

  fpr, tpr, thresholds = roc_curve(y_test, y_pred_test_proba)
  roc_auc = auc(fpr, tpr)

  roc_trace_test = go.Scatter(x=fpr, y=tpr, mode='lines',
                              name='ROC curve test (area = %0.2f)' % roc_auc)

  fpr, tpr, thresholds = roc_curve(y_train, y_pred_train_proba)
  roc_auc = auc(fpr, tpr)
  roc_trace_train = go.Scatter(x=fpr, y=tpr, mode='lines',
                              name='ROC curve train (area = %0.2f)' % roc_auc)

  random_trace = go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                            name='Random', line=dict(dash='dash'))

  layout = go.Layout(
      title='Random searching with adaptation',
      xaxis=dict(title='False Positive Rate'),
      yaxis=dict(title='True Positive Rate'),
      showlegend=True,
      width=GRAPH_WIDTH,
      height=GRAPH_HEIGHT
  )

  fig = go.Figure(data=[roc_trace_test, roc_trace_train, random_trace],
                  layout=layout)

  fig.show()

# Test

In [13]:
rs = Add_Dell(DELTA, ALL_FEATURES, COUNT_FEATURES, 'f1', True)
# rs = Add_Dell(DELTA, ALL_FEATURES, COUNT_FEATURES)
rs.run()


-----Algorithm iteration 1-----
---Add---
--iteration 1--
add feature Online boarding
current best model index 7
current best model metric 0.782
--iteration 2--
add feature Type of Travel
current best model index 48
current best model metric 0.817
--iteration 3--
add feature Inflight wifi service
current best model index 74
current best model metric 0.841
--iteration 4--
add feature Class_Eco Plus
current best model index 110
current best model metric 0.842
--iteration 5--
add feature Departure Delay in Minutes_0-30
current best model index 110
current best model metric 0.842
--iteration 6--
add feature Age_65-100
current best model index 110
current best model metric 0.842
---Del---
--iteration 7--
del feature Departure Delay in Minutes_0-30
current best model index 110
current best model metric 0.842
---ts_iteration result---
ts_best model index 110
ts_best model metric 0.842
ts_best features - Type of Travel Class_Eco Plus Age_65-100 Inflight wifi service Online boarding 
-----Algo

In [14]:
rs.print_results()

=====  RESULTS  =====
Complexity: 5
Test score: 0.842
Features
1. Inflight wifi service
2. Type of Travel
3. Age_65-100
4. Class_Eco Plus
5. Online boarding


In [15]:
# Расчеты для построения линии максимума
line_compl = []
line_metr = []
my_points()
print(line_metr)
print(line_compl)

[0.7820171771680475, 0.8170135895225191, 0.8405785325424555, 0.8416083346898909, 0.8416083346898909, 0.8415906127770535, 0.8415906127770535, 0.8415906127770535, 0.8416083346898909, 0.8415906127770535]
[1, 2, 3, 4, 5, 6, 5, 6, 5, 6]


In [16]:
# График зависимости метрики от числа признаков
line()

In [17]:
# График ROC-кривой
roc()