Add

https://colab.research.google.com/drive/1b6ubEvtESajClbxcc7YZ7vtVhj2mdidz

# Imports + Data

In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc

import plotly.graph_objects as go
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

In [4]:
STD_DATA_PATH = 'https://raw.githubusercontent.com/auplt/Flight-Satisfaction/data/data_std.pkl'
Y_FEATURE = 'satisfaction'
TEST_SIZE = 0.3

In [5]:
# Загрузка данных
df = pd.read_pickle(STD_DATA_PATH)

X, y = df.drop([Y_FEATURE], axis=1), df[Y_FEATURE]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y)

y_train = y_train.to_frame()
y_test = y_test.to_frame()

X_train = X_train.reset_index().drop(columns='index')
X_test = X_test.reset_index().drop(columns='index')
y_train = y_train.reset_index().drop(columns='index')
y_test = y_test.reset_index().drop(columns='index')

In [6]:
DELTA = 2
ALL_FEATURES = list(X_train.columns)
COUNT_FEATURES = len(ALL_FEATURES)

# Model

In [7]:
C = 0.001
TOL = 100
CLASS_WEIGHT = "balanced"

In [8]:
CLF = LogisticRegression(C=C, tol=TOL, class_weight=CLASS_WEIGHT)
CLF

In [21]:
class Model():
  def __init__(self, columns):
    self.clf = CLF
    self.subset_X_train = X_train[columns]
    self.subset_X_test =  X_test[columns]

  def fit(self):
    self.clf.fit(self.subset_X_train, y_train)

  def y_pred_train(self):
    return self.clf.predict(self.subset_X_train)

  def y_pred_test(self):
    return self.clf.predict(self.subset_X_test)

  def train_score(self):
    return f1_score(y_train, self.y_pred_train())

  def test_score(self):
    return f1_score(y_test, self.y_pred_test())

  def y_pred_train_prob(self):
    return self.clf.predict_proba(self.subset_X_train)[:, 1]

  def y_pred_test_prob(self):
    return self.clf.predict_proba(self.subset_X_test)[:, 1]

# Add-Del

In [10]:
class Add_Dell():
  def __init__(self, test_mode=False, debug=False, delta=DELTA,
               all_features=ALL_FEATURES, count_features=COUNT_FEATURES):
     self.test_mode = test_mode
     self.debug = debug
     self.all_features = all_features.copy()
     self.count_features = count_features
     self.iteration = 0
     self.delta = delta
     self.ts_iteration = 0

     self.complexity = 0
     self.test_score = -1

     self.best = {
        'complexity': 0,
        'test_score': -1,
        'train_score': 0,
        'features': []
     }
     self.metrics_graph = list()
     self.complexity_graph = list()


  def is_stop(self):
    if self.iteration >= self.delta:
      if self.test_mode or self.debug:
        print("Program finished as score stabilized")
      return True

    if self.complexity >= self.count_features:
      if self.test_mode or self.debug:
        print("Program finished as maximum complexity reached")
      return True


  def print_results(self):
    print("="*5, " RESULTS ", "="*5)
    print(f"Complexity: {self.best['complexity']}")
    print(f"Test score: {self.best['test_score']:.3f}")
    print(f"Train score: {self.best['train_score']:.3f}")
    print("Features")
    feat_num = 0
    for feat in self.best['features']:
      feat_num += 1
      print(f"{feat_num}. {feat}")


  def add_stop(self):
    # if abs(add_test_score - self.best['test_score']) <= self.delta:
    #   if self.test_mode or self.debug:
    #     print("Program finished as score stabilized")
    #   return True

    if self.ts_iteration >= self.delta:
      if self.test_mode or self.debug:
        print("Program finished as score stabilized")
      return True

    if self.best['complexity'] >= self.count_features:
      if self.test_mode or self.debug:
        print("Program finished as maximum complexity reached")
      return True


  def add_best_feature(self, cur_features):
    best = {'test_score': 0,
            'train_score': 0,
            'feat': None}

    for feat in self.all_features:
      cur_features.append(feat)

      mdl = Model(cur_features)
      mdl.fit()

      # Adding to lists to build a graph
      self.metrics_graph.append(mdl.test_score())
      self.complexity_graph.append(self.complexity)

      # Searching for the best
      if mdl.test_score() > best['test_score']:
        best['test_score'] = mdl.test_score()
        best['train_score'] = mdl.train_score()
        best['feat'] = feat

      cur_features.pop()

    return best


  def add(self):
    add_test_score = self.best['test_score']
    cur_features = self.best['features']
    res = self.add_best_feature(cur_features)

    if self.test_mode or self.debug:
      print(f"The res result\n{res}")

    if res['test_score'] >= self.best['test_score']:
      self.best['test_score'] = res['test_score']
      self.best['train_score'] = res['train_score']
      self.best['features'].append(res['feat'])
      self.best['complexity'] += 1
      self.all_features.remove(res['feat'])

    if add_test_score == self.best['test_score']:
      self.ts_iteration += 1
    else:
      self.ts_iteration = 0

    if self.test_mode or self.debug:
      print(f"The best result for complexity \
        {self.best['complexity']}\n\t{self.best}")


  def dell_stop(self):
    # if abs(dell_test_score - self.best['test_score']) <= self.delta:
    #   if self.test_mode or self.debug:
    #     print("Program finished as score stabilized")
    #   return True

    if self.ts_iteration >= self.delta:
      if self.test_mode or self.debug:
        print("Program finished as score stabilized")
      return True

    if self.best['complexity'] <= 0:
      if self.test_mode or self.debug:
        print("Program finished as maximum complexity reached")
      return True


  def dell_worst_feature(self, cur_features):
    worst = {'test_score': 1,
            'train_score': 0,
            'feat': None}
    cur_feat = cur_features
    for feat in cur_features:
      cur_feat.remove(feat)

      mdl = Model(cur_feat)
      mdl.fit()

      # Adding to lists to build a graph
      self.metrics_graph.append(mdl.test_score())
      self.complexity_graph.append(self.complexity)

      # Searching for the worst
      if mdl.test_score() < worst['test_score']:
        worst['test_score'] = mdl.test_score()
        worst['train_score'] = mdl.train_score()
        worst['feat'] = feat

      cur_feat.append(feat)

    return worst

  def dell(self):
    dell_test_score = self.best['test_score']
    cur_features = self.best['features']
    res = self.dell_worst_feature(cur_features)

    if self.test_mode or self.debug:
      print(f"The res result\n{res}")

    if res['test_score'] > self.best['test_score']:
      self.best['test_score'] = res['test_score']
      self.best['train_score'] = res['train_score']
      self.best['features'].remove(res['feat'])
      self.best['complexity'] -= 1
      self.all_features.append(res['feat'])

    if dell_test_score == self.best['test_score']:
      self.ts_iteration += 1
    else:
      self.ts_iteration = 0

    if self.test_mode or self.debug:
      print(f"The best result for complexity \
        {self.best['complexity']}\n\t{self.best}")


  def run(self):
    while (not self.is_stop()):
      self.complexity = self.best['complexity']
      self.test_score = self.best['test_score']

      if self.test_mode or self.debug:
        print(f"-----Algorithm iteration {self.iteration}-----\n---Add---")
      self.ts_iteration = 0
      while (not self.add_stop()):
        self.add()

      if self.test_mode or self.debug:
        print("---Dell---")
      self.ts_iteration = 0
      while (not self.dell_stop()):
        self.dell()

      if self.test_score == self.best['test_score']:
        self.iteration += 1
      else:
        self.iteration = 0

    self.print_results()

# Graphs

In [11]:
GRAPH_WIDTH = 700
GRAPH_HEIGHT = 500

In [12]:
def max_points():
  for i in line_compl:
    metr_max = 0
    for j in range(len(rs.metrics_graph)):
      if i == rs.complexity_graph[j]:
        if rs.metrics_graph[j] > metr_max:
          metr_max = rs.metrics_graph[j]
    line_metr.append(metr_max)

In [13]:
def line():
  dots = go.Scatter(x=rs.complexity_graph, y=rs.metrics_graph, mode='markers',
                  marker=dict(color='grey', size=10, opacity=0.8),
                  name='Metrics')

  max_line = go.Scatter(x=line_compl, y=line_metr, mode='lines',
                        line=dict(color="crimson", width=2), name='Maximal line')

  layout = go.Layout(
      title='Dependence of the metric value on the number of features',
      xaxis=dict(title='Featues', range=[ -0.5, rs.complexity + 0.5 ], dtick=1),
      yaxis=dict(title='Metrics'),
      showlegend=True,
      width=GRAPH_WIDTH,
      height=GRAPH_HEIGHT
  )

  fig = go.Figure(data=[dots, max_line], layout=layout)

  fig.show()


In [23]:
def roc():
  mdl = Model(rs.best['features'])
  mdl.fit()
  y_pred_test_proba = mdl.y_pred_test_prob()
  y_pred_train_proba = mdl.y_pred_train_prob()

  fpr, tpr, thresholds = roc_curve(y_test, y_pred_test_proba)
  roc_auc = auc(fpr, tpr)

  roc_trace_test = go.Scatter(x=fpr, y=tpr, mode='lines',
                              name='ROC curve test (area = %0.2f)' % roc_auc)

  fpr, tpr, thresholds = roc_curve(y_train, y_pred_train_proba)
  roc_auc = auc(fpr, tpr)
  roc_trace_train = go.Scatter(x=fpr, y=tpr, mode='lines',
                              name='ROC curve train (area = %0.2f)' % roc_auc)

  random_trace = go.Scatter(x=[0, 1], y=[0, 1], mode='lines',
                            name='Random', line=dict(dash='dash'))

  layout = go.Layout(
      title='Random searching with adaptation',
      xaxis=dict(title='False Positive Rate'),
      yaxis=dict(title='True Positive Rate'),
      showlegend=True,
      width=GRAPH_WIDTH,
      height=GRAPH_HEIGHT
  )

  fig = go.Figure(data=[roc_trace_test, roc_trace_train, random_trace],
                  layout=layout)

  fig.show()

# Test

In [20]:
# rs = Add_Dell(test_mode=True)
rs = Add_Dell()
best_rs = rs.run()


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

=====  RESULTS  =====
Complexity: 5
Test score: 0.845
Train score: 0.841
Features
1. Type of Travel
2. Class_Eco Plus
3. Online boarding
4. Departure Delay in Minutes_0-30
5. Inflight wifi service


In [24]:
# Расчеты для построения линии максимума
line_metr = list()
line_compl = sorted(list(set(rs.complexity_graph)))
max_points()
print(line_metr)
print(line_compl)

# График зависимости метрики от числа признаков
line()

[0.8448499594484996, 0.8448499594484996]
[0, 5]



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [None]:
# График ROC-кривой
roc()