### Борьба с несбалансированностью классов с помощью метода NearMiss
https://habr.com/ru/articles/562322/

+ Метод NearMiss — это метод недостаточной выборки. 
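+ Он балансирует распределение классов, исключая наблюдения из бо́льших классов не случайным образом, а на основе расстояний до наблюдений миноритарного класса. 
+ В варианте по умолчанию (NearMiss-1) из мажоритарного класса сохраняются наблюдения с наименьшим средним расстоянием до ближайших наблюдений миноритарного класса, а остальные наблюдения мажоритарного класса удаляются.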

In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn.under_sampling import NearMiss

import warnings
# Silence every warning so the notebook output stays compact.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings raised by the cells below — consider a narrower filter.
warnings.filterwarnings("ignore")

In [25]:
# Load the dataset from the working directory and preview the first five
# rows, transposed so that each column is shown on its own line.
file = 'online_shoppers_intention.csv'
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
Administrative,0,0,0,0,0
Administrative_Duration,0.0,0.0,0.0,0.0,0.0
Informational,0,0,0,0,0
Informational_Duration,0.0,0.0,0.0,0.0,0.0
ProductRelated,1,2,1,2,10
ProductRelated_Duration,0.0,64.0,0.0,2.666667,627.5
BounceRates,0.2,0.0,0.2,0.05,0.02
ExitRates,0.2,0.1,0.2,0.14,0.05
PageValues,0.0,0.0,0.0,0.0,0.0
SpecialDay,0.0,0.0,0.0,0.0,0.0


In [26]:
# Number of rows and columns in the loaded frame.
df.shape

(12330, 18)

In [27]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

`Revenue` - столбец для прогнозирования

In [28]:
# Class balance of the target: number of rows per Revenue value.
df['Revenue'].value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

In [29]:
# разделим наблюдения на обучающую и тестовую выборки
# Separate the frame into the feature matrix (X) and the target column (Y).
X = df.drop(columns='Revenue')
Y = df['Revenue']

# Keep the feature names as they are before any encoding, and display them.
feature_names = X.columns
feature_names

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend'],
      dtype='object')

In [33]:
# One-hot encode the non-numeric feature columns, then inspect the result.
# NOTE(review): integer-coded columns such as OperatingSystems, Browser and
# Region remain plain int64 after this step (see the info output) — confirm
# that treating them as numeric is intended.
X = pd.get_dummies(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Administrative                 12330 non-null  int64  
 1   Administrative_Duration        12330 non-null  float64
 2   Informational                  12330 non-null  int64  
 3   Informational_Duration         12330 non-null  float64
 4   ProductRelated                 12330 non-null  int64  
 5   ProductRelated_Duration        12330 non-null  float64
 6   BounceRates                    12330 non-null  float64
 7   ExitRates                      12330 non-null  float64
 8   PageValues                     12330 non-null  float64
 9   SpecialDay                     12330 non-null  float64
 10  OperatingSystems               12330 non-null  int64  
 11  Browser                        12330 non-null  int64  
 12  Region                         12330 non-null 

In [34]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=97,
)

In [35]:
# смотрим на размерность сформированных наборов данных
# Report the shape of each of the four splits.
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'Размерность набора данных {label}:{part.shape}')

Размерность набора данных X_train:(8631, 28)
Размерность набора данных y_train:(8631,)
Размерность набора данных X_test:(3699, 28)
Размерность набора данных y_test:(3699,)


Обучим логистическую регрессию на исходной (несбалансированной) обучающей выборке и оценим её на тестовой

In [47]:
# Baseline: fit a logistic regression on the original, imbalanced training
# split and report per-class metrics on the held-out test split.
# NOTE(review): the model uses default settings on unscaled features and all
# warnings are silenced earlier in the notebook — verify that the fit converges.
lregress1 = LogisticRegression()
# Pass the Series as-is: scikit-learn accepts 1-D array-likes, and
# Series.ravel() is deprecated in recent pandas releases.
lregress1.fit(X_train, y_train)
prediction = lregress1.predict(X_test)
print(classification_report(y_test, prediction))


              precision    recall  f1-score   support

       False       0.89      0.97      0.93      3125
        True       0.73      0.37      0.49       574

    accuracy                           0.88      3699
   macro avg       0.81      0.67      0.71      3699
weighted avg       0.87      0.88      0.86      3699



In [51]:
# Class counts in the training split before any resampling.
print(f'Количество True:{(y_train == True).sum()}')
print(f'Количество False:{(y_train == False).sum()}')

Количество True:1334
Количество False:7297


In [53]:
# Under-sample the training split with NearMiss (default settings); the test
# split is not resampled.
nm = NearMiss()
# to_numpy() replaces the deprecated Series.ravel() and still hands the
# sampler a plain 1-D NumPy array, as the original call did.
X_train_miss, y_train_miss = nm.fit_resample(X_train, y_train.to_numpy())
# Class counts after resampling.
print(f'Количество True после применения:{(y_train_miss == True).sum()}')
print(f'Количество False:{(y_train_miss == False).sum()}')

Количество True после применения:1334
Количество False:1334


In [55]:
# Fit a second logistic regression on the NearMiss-resampled training data
# and report per-class metrics on the same test split.
lregress2 = LogisticRegression().fit(X_train_miss, np.ravel(y_train_miss))
prediction = lregress2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=prediction))

              precision    recall  f1-score   support

       False       0.98      0.47      0.64      3125
        True       0.25      0.96      0.40       574

    accuracy                           0.55      3699
   macro avg       0.62      0.72      0.52      3699
weighted avg       0.87      0.55      0.60      3699

