In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
RANDOM_STATE = 22
TEST_SIZE = 0.2

## Работа с данными

In [3]:
data = pd.read_csv('data.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4744 entries, 0 to 4743
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   brand                  4744 non-null   object 
 1   model                  4744 non-null   object 
 2   description            4744 non-null   object 
 3   price                  4744 non-null   float64
 4   exposition_days_count  4744 non-null   int64  
 5   images_count           4744 non-null   int64  
 6   label                  4744 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 259.6+ KB


In [4]:
data['label'].value_counts(normalize=True)

good            0.348440
bad             0.316610
excellent       0.187184
no_data         0.101602
satisfactory    0.028246
new             0.017917
Name: label, dtype: float64

In [5]:
X, y = data.drop(columns='label'), data['label']

### Разбиение датасета. Модуль model_selection.train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

In [7]:
print(f'Размер данных для обучения: признаки - {X_train.shape}, таргеты - {y_train.shape}')
print(f'Размер данных для теста: признаки - {X_test.shape}, таргеты - {y_test.shape}')

Размер данных для обучения: признаки - (3795, 6), таргеты - (3795,)
Размер данных для теста: признаки - (949, 6), таргеты - (949,)


In [8]:
y_train.value_counts(normalize=True)

good            0.348353
bad             0.316733
excellent       0.187088
no_data         0.101713
satisfactory    0.028195
new             0.017918
Name: label, dtype: float64

### Работа с числовыми признаками. Модуль preprocessing

In [9]:
numeric_features = ['price', 'exposition_days_count', 'images_count']

scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(X_train[numeric_features])
X_test_numeric = scaler.transform(X_test[numeric_features])

X_train_numeric

array([[ 0.12401256,  1.70079258,  1.639863  ],
       [ 1.16315527,  0.88391807,  1.29535651],
       [-0.22236834,  0.65797406,  0.49150803],
       ...,
       [-0.45097974,  1.59651073,  0.37667253],
       [-0.53411116, -0.24580199, -0.65684694],
       [ 1.85591708,  0.43203005, -0.42717594]])

In [10]:
scaler.mean_ # среднее значение для каждого признака

array([8209.88168643,  102.14255599,   15.7198946 ])

In [11]:
scaler.var_  # стандратное отклоение для каждого признака

array([2.08368528e+08, 3.31043475e+03, 7.58311589e+01])

### Работа с категориальными признаками. Модуль preprocessing

In [12]:
cat_features = ['brand', 'model']

enc = OneHotEncoder(handle_unknown='ignore')
X_train_cat = enc.fit_transform(X_train[cat_features])
X_test_cat = enc.transform(X_test[cat_features])

X_test_cat

<949x2624 sparse matrix of type '<class 'numpy.float64'>'
	with 1427 stored elements in Compressed Sparse Row format>

In [13]:
X_test_cat.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
enc.categories_

[array(['AHB', 'AIWA', 'AKAI', 'AMCV', 'AOC', 'AVQ', 'Acer', 'Akira',
        'Aquatelevision', 'Asano', 'Avest', 'BBK', 'BORK', 'BQ', 'BRAVIS',
        'Bang & Olufsen', 'Beko', 'Blaupunkt', 'Bose', 'CASIO', 'CENTEK',
        'Cameron', 'Candy', 'Changhong', 'Conrac', 'DENN', 'DEXP', 'DIGMA',
        'DNS', 'Daewoo Electronics', 'Doffler', 'ECON', 'Elekta',
        'Elenberg', 'Erisson', 'Evgo', 'Fujitsu', 'Funai', 'Fusion',
        'General', 'GoldStar', 'Grundig', 'HAMBER', 'HARPER', 'HARTENS',
        'HEC', 'HOLLEBERG', 'HUAWEI', 'Haier', 'Helix', 'Hi', 'Hisense',
        'Hitachi', 'Horizont', 'Hyundai', 'Irbis', 'Izumi', 'JVC', 'KIVI',
        'LEBEN', 'LG', 'Leff', 'Lentel', 'Loewe', 'MYSTERU', 'Mystery',
        'NEKO', 'Novex', 'OK.', 'OKARI', 'Olto', 'Orion', 'Panasonic',
        'Philips', 'Pioneer', 'Polar', 'Polarline', 'Premiera',
        'Prestigio', 'Prology', 'Rolsen', 'Rotex', 'Ruimatech', 'Runco',
        'STARWIND', 'SUPRA', 'SUZUKI', 'Samsung', 'Samtron', 'Sanyo',

### Работа с текстовыми признаками. Модуль feature_extraction

In [15]:
text_feature = 'description'

vectorizer = TfidfVectorizer()
X_train_text = vectorizer.fit_transform(X_train[text_feature])
X_test_text = vectorizer.transform(X_test[text_feature])

In [16]:
vectorizer.vocabulary_

{'телевизор': 6539,
 'bbk': 902,
 '32': 398,
 'диагональ': 2611,
 'пульта': 5594,
 'коробка': 3484,
 'документы': 2692,
 'на': 3957,
 'фото': 6979,
 'нет': 4180,
 'они': 4420,
 'коробке': 3485,
 'продам': 5482,
 'samsung': 1424,
 'хорошем': 7032,
 'состоянии': 6297,
 'отличном': 4570,
 '43': 535,
 'хороший': 7034,
 'рабочий': 5645,
 'продаю': 5487,
 'связи': 6013,
 'покупкой': 5023,
 'нового': 4220,
 'телевизора': 6542,
 'смарт': 6187,
 'состояние': 6296,
 'очень': 4628,
 'хорошее': 7029,
 'пульт': 5593,
 'следы': 6159,
 'от': 4511,
 'детских': 2586,
 'зубов': 3046,
 'шнур': 7257,
 'питания': 4751,
 '102': 45,
 'см': 6185,
 'отличии': 4564,
 'более': 1909,
 'поздних': 4981,
 'моделей': 3886,
 'здесь': 3023,
 'установлена': 6886,
 'металлическая': 3820,
 'задняя': 2905,
 'панель': 4652,
 'что': 7209,
 'делает': 2556,
 'его': 2798,
 'достаточно': 2744,
 'увесистым': 6760,
 'для': 2660,
 'установки': 6883,
 'подставку': 4953,
 'без': 1837,
 'сколов': 6141,
 'дефектов': 2593,
 'изображения

In [17]:
X_test_text

<949x7385 sparse matrix of type '<class 'numpy.float64'>'
	with 9568 stored elements in Compressed Sparse Row format>

### Объединение преобразований над признками. Модуль compose

In [18]:
preprocessor = ColumnTransformer(
     transformers=[
         ('scaler', StandardScaler(), numeric_features),
         ('text', TfidfVectorizer(), text_feature), 
         ('category', OneHotEncoder(handle_unknown='ignore'), cat_features),
     ],
)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [19]:
X_train_prepared

<3795x10012 sparse matrix of type '<class 'numpy.float64'>'
	with 65876 stored elements in Compressed Sparse Row format>

## Модели для задачи классификации

### Строим baseline. Модуль dummy

In [20]:
dummy_clf = DummyClassifier(strategy="most_frequent")  # будем предсказывать самый частотный таргет
dummy_clf.fit(X_train_prepared, y_train)

In [21]:
y_pred = dummy_clf.predict(X_test_prepared)
y_probas = dummy_clf.predict_proba(X_test_prepared)

print(y_pred[:10])
print(y_probas[:10])

['good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good' 'good']
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [22]:
dummy_clf.score(X_test_prepared, y_test)

0.3487881981032666

### Классификаторы из модуля linear_model

In [23]:
logreg_clf = LogisticRegression(
       random_state=RANDOM_STATE, multi_class='multinomial', class_weight='balanced'
    ).fit(X_train_prepared, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
logreg_clf.coef_

array([[-1.50917555, -0.00423185,  0.01900292, ..., -0.00876521,
        -0.07997374, -0.30986047],
       [ 0.573249  , -0.02288234, -0.06158463, ..., -0.0101862 ,
        -0.02821366,  0.11636511],
       [ 0.07817381,  0.01122688, -0.06074481, ...,  0.04001644,
         0.27376362, -0.23544565],
       [ 0.66393446,  0.13122188,  0.09245261, ..., -0.00183685,
        -0.02224847, -0.11926628],
       [ 0.25341185, -0.00430818, -0.0854775 , ..., -0.00656167,
        -0.09881612, -0.24427704],
       [-0.05959358, -0.11102638,  0.09635142, ..., -0.01266651,
        -0.04451162,  0.79248432]])

In [25]:
logreg_clf.score(X_test_prepared, y_test)

0.8861959957850368

In [26]:
sgd_clf = SGDClassifier(random_state=RANDOM_STATE).fit(X_train_prepared, y_train)
sgd_clf.score(X_test_prepared, y_test)

0.8956796628029505

In [31]:
def get_feature_names(preprocessor: ColumnTransformer):
    """
    Отдает названия признаков после метода fit / fit_transform в ColumnTransformer
    """
    features = []
    for _, transformer, transformer_features, _ in preprocessor._iter(fitted=True):

        if isinstance(transformer, str):
            continue

        if hasattr(transformer, 'get_feature_names_out'):
            transformer_features = transformer.get_feature_names_out()

        for feature in transformer_features:
            features.append(feature)

    return features

In [30]:
sorted(enumerate(sgd_clf.coef_[1]), key=lambda x: x[1], reverse=True)

[(4573, 10.093407266889852),
 (3076, 5.6393421659505805),
 (4571, 5.547349563417023),
 (6300, 4.017565799562711),
 (4575, 3.7293227025892093),
 (3075, 3.363770987726394),
 (4569, 3.2063290380824814),
 (6299, 2.569606668817622),
 (3293, 2.0518147214941553),
 (5310, 2.047404108516998),
 (3069, 2.0284317933094167),
 (4742, 1.585561395320911),
 (2663, 1.4869830148825045),
 (9565, 1.4304477301395413),
 (7834, 1.4304477301395406),
 (6016, 1.392883547644703),
 (1107, 1.3549886448142574),
 (6504, 1.336569243424383),
 (4662, 1.3140229299113824),
 (9458, 1.3004070273995842),
 (8214, 1.300407027399584),
 (9336, 1.300407027399582),
 (8066, 1.3004070273995816),
 (8189, 1.30040702739958),
 (1926, 1.2151474908423883),
 (3734, 1.2016727113845265),
 (591, 1.1879528297802244),
 (7068, 1.1793726527224968),
 (9679, 1.1703663246596265),
 (8340, 1.1703663246596263),
 (9750, 1.1703663246596256),
 (9657, 1.1703663246596254),
 (7943, 1.1703663246596252),
 (9182, 1.1703663246596252),
 (9258, 1.1703663246596245)

In [41]:
get_feature_names(preprocessor)[5310]

'прекрасном'

## Метрики. Модуль metric

In [42]:
y_pred = sgd_clf.predict(X_test_prepared)
# >>> print(f1_score(y_test, y_pred))

In [43]:
print(f1_score(y_test, y_pred, average='weighted'))

0.8862867630683107


In [44]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.96      0.96      0.96       300
   excellent       0.90      0.94      0.92       178
        good       0.88      0.94      0.91       331
         new       0.91      0.59      0.71        17
     no_data       0.76      0.71      0.74        96
satisfactory       0.50      0.11      0.18        27

    accuracy                           0.90       949
   macro avg       0.82      0.71      0.74       949
weighted avg       0.89      0.90      0.89       949



## Поиск оптимальных гиперпараметров. Модуль model_selection

In [45]:
parameters = {
    'loss': [
        'hinge',
        'modified_huber',
        'squared_hinge',
        'perceptron',
        'huber',
    ],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'max_iter': [100, 500, 1000],
    'learning_rate': ['constant', 'optimal', 'adaptive'],
    'eta0': [0.1, 0.5]
}

In [46]:
grid_search = GridSearchCV(sgd_clf, parameters)
grid_search.fit(X_train_prepared, y_train)



In [47]:
grid_search.best_params_

{'eta0': 0.5,
 'learning_rate': 'adaptive',
 'loss': 'hinge',
 'max_iter': 100,
 'penalty': 'l1'}

In [48]:
y_pred = grid_search.best_estimator_.predict(X_test_prepared)
f1_score(y_pred, y_test, average='weighted')

0.9163474187708154

In [49]:
parameters = {
    'eta0': [0.1, 0.2, 0.3, 0.4, 0.5],
    'learning_rate': ['constant', 'optimal', 'adaptive'],
    'loss': ['hinge', 'modified_huber', 'squared_hinge','perceptron', 'huber'],
    'max_iter': [50, 100, 200, 500, 1000],
    'penalty': ['l2', 'l1', 'elasticnet']
}

grid_search = GridSearchCV(sgd_clf, parameters)

In [50]:
grid_search.fit(X_train_prepared, y_train)



In [52]:
grid_search.best_params_

{'eta0': 0.3,
 'learning_rate': 'adaptive',
 'loss': 'perceptron',
 'max_iter': 50,
 'penalty': 'l1'}

## Объединение преобразований над признаками с обучением модели. Модуль pipeline 

In [49]:
pipeline = Pipeline(
     steps=[
         (
             'preprocessor', ColumnTransformer(
                 transformers=[
                     ('text', TfidfVectorizer(), text_feature), 
                     ('category', OneHotEncoder(handle_unknown='ignore'), cat_features),
                 ],
             )
         ),
         ('classifier', SGDClassifier(
             eta0=0.5,
             learning_rate='adaptive',
             loss='hinge',
             max_iter=100,
             penalty='l1',
             random_state=RANDOM_STATE,
         )
         ),
     ],
 )
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [50]:
f1_score(y_pred, y_test, average='weighted')

0.8969529850491467

### Приложение к задаче

In [51]:
>>> def get_feature_names(preprocessor: ColumnTransformer):
    """
    Отдает названия признаков после метода fit / fit_transform в ColumnTransformer
    """
    features = []
    for _, transformer, transformer_features, _ in preprocessor._iter(fitted=True):
        
        if isinstance(transformer, str):
            continue
            
        if hasattr(transformer, 'get_feature_names_out'):
            transformer_features = transformer.get_feature_names_out()    
            
        if hasattr(transformer, 'get_feature_names'):
            transformer_features = transformer.get_feature_names()            
            
        for feature in transformer_features:
            features.append(feature)
            
    return features

In [15]:
# get_feature_names(preprocessor)