In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Load the per-row k-means cluster assignments and preview a random handful of rows.
df = pd.read_csv(filepath_or_buffer='KMeansclusters.csv')
df.sample(n=5)

Unnamed: 0,Лот ID,Материал,Класс,supplier_id,latitude,longitude,kmeans_clust
110303,100000587011,770000528154,2107,103322,55.4311,37.5461,229
122141,100000554148,770000540920,210502,58691,56.8575,60.6125,267
288467,100000577392,770001307893,21050101,54948,55.7522,37.6156,12
146370,100000593210,770000641264,21050370,63104,55.7172,38.2078,87
194286,100000591493,770000826212,2110,78951,57.4569,49.5319,98


In [38]:
# Map every lot ID to the k-means cluster that occurs most often among its rows.
# One groupby pass replaces a full-frame boolean scan per unique lot, which was
# quadratic in practice. Ties are resolved exactly as before: the first label in
# value_counts() order wins.
lot_to_cluster = (
    df.groupby('Лот ID')['kmeans_clust']
      .agg(lambda clusters: clusters.value_counts().idxmax())
      .to_dict()
)

In [40]:
# Label every row with the majority cluster of the lot it belongs to.
# __getitem__ keeps the strict lookup: an unknown lot ID raises KeyError.
df = df.assign(
    target=lambda frame: frame['Лот ID'].map(lot_to_cluster.__getitem__)
)
df.head()

Unnamed: 0,Лот ID,Материал,Класс,supplier_id,latitude,longitude,kmeans_clust,target
0,100000517882,1118280,2170,75422,55.0411,82.9344,80,143
1,100000517882,1118280,2170,94293,60.1256,29.8764,52,143
2,100000517882,1118280,2170,94293,60.1256,29.8764,52,143
3,100000517882,1118280,2170,94293,60.1256,29.8764,52,143
4,100000517882,1118280,2170,94501,55.4647,37.6981,172,143


In [41]:
# Keep the label aside, then strip the label and the identifier/cluster columns
# so that only the model inputs remain in the frame.
target = df['target']
df = df.drop(
    labels=['Лот ID', 'kmeans_clust', 'target', 'supplier_id'],
    axis='columns',
)
df

Unnamed: 0,Материал,Класс,latitude,longitude
0,1118280,2170,55.0411,82.9344
1,1118280,2170,60.1256,29.8764
2,1118280,2170,60.1256,29.8764
3,1118280,2170,60.1256,29.8764
4,1118280,2170,55.4647,37.6981
...,...,...,...,...
339943,770001801168,21050301,61.2412,77.4858
339944,770001801168,21050301,61.2412,77.4858
339945,980004700197,2170,63.1667,75.6167
339946,980004700197,2170,63.1667,75.6167


In [42]:
# Rename the training columns to the names used by HumanLotting.solve's input data
# and fix the column order: class, material, latitude, longitude.
df = df.rename(columns={
    'Класс': 'class_id',
    'Материал': 'material_id',
    'latitude': 'supplier_address_latitude',
    'longitude': 'supplier_address_longitude',
})[['class_id', 'material_id', 'supplier_address_latitude', 'supplier_address_longitude']]

In [43]:
# Identifier-like columns that get one-hot encoded by the pipeline.
cat_feat = [
    'material_id',
    'class_id',
]
# Coordinate columns that get standardised by the pipeline.
numeric_features = [
    'supplier_address_latitude',
    'supplier_address_longitude',
]

In [44]:
# Preprocessing: one-hot encode the categorical IDs (categories unseen at fit time
# are encoded as all zeros instead of raising) and standardise the coordinates.
# Any column not listed in either group is passed through unchanged.
ct = ColumnTransformer(
    [
        ('encode_cats', OneHotEncoder(handle_unknown='ignore'), cat_feat),
        ('scale', StandardScaler(), numeric_features),
    ],
    remainder='passthrough',
)
# random_state pins the forest's sampling so that re-running the notebook
# reproduces the same fitted model; n_jobs=-2 uses all CPU cores but one.
model = make_pipeline(
    ct,
    RandomForestClassifier(n_jobs=-2, random_state=42),
)

In [45]:
# Fit the preprocessing + random-forest pipeline on the full frame
# (no hold-out split is made in this notebook).
model.fit(X=df, y=target)

In [52]:
import pickle

In [53]:
# Persist the fitted pipeline so it can be reloaded later without retraining.
with open('HumanClassifier.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
import pickle

class HumanLotting:
    """Assign each request to a lot with a pickled, pre-trained classifier.

    The classifier predicts a lot for every row of the input; the lot of a
    request is the prediction that occurs most often among the request's rows.
    """

    # Model inputs, in the column order of the frame the pipeline in this
    # notebook was fitted on.
    _FEATURE_COLUMNS = [
        'class_id',
        'material_id',
        'supplier_address_latitude',
        'supplier_address_longitude',
    ]

    def __init__(self, path = './Humanclassifier.pkl'):
        """Load the pickled classifier stored at ``path``.

        Args:
            path: Location of the pickle file holding the fitted model.

        Raises:
            OSError: If ``path`` cannot be opened.
        """
        # NOTE(review): the training cell writes 'HumanClassifier.pkl' (capital C);
        # on a case-sensitive file system the default path above will not find it.
        # SECURITY: pickle.load can execute arbitrary code - only load trusted files.
        with open(path, 'rb') as file:
            self.__model = pickle.load(file)

    def solve(self, data):
        """Predict one lot per request.

        Args:
            data: DataFrame with a ``request_id`` column and the four feature
                columns listed in ``_FEATURE_COLUMNS``. Any other columns are
                ignored.

        Returns:
            DataFrame with columns ``request_id`` and ``human_lot_id``, one row
            per distinct request, ordered by ``request_id``.

        Raises:
            KeyError: If a required column is missing from ``data``.
        """
        required = ['request_id'] + self._FEATURE_COLUMNS
        missing = [col for col in required if col not in data.columns]
        if missing:
            raise KeyError(f'missing required columns: {missing}')

        # Nothing to predict: return an empty result with the expected columns
        # instead of handing the model a frame with zero rows.
        if len(data) == 0:
            return pd.DataFrame({'request_id': [], 'human_lot_id': []})

        # Select the features explicitly so that extra columns and column order
        # in the caller's frame cannot affect what the model receives.
        preds = self.__model.predict(data[self._FEATURE_COLUMNS])
        votes = data[['request_id']].assign(lot_id=preds)

        # Majority vote per request; ties go to the first label in
        # value_counts() order.
        lot_by_request = (
            votes.groupby('request_id')['lot_id']
                 .agg(lambda lots: lots.value_counts().idxmax())
        )
        return lot_by_request.rename('human_lot_id').reset_index()