In [None]:
!wget https://lodmedia.hb.bizmrg.com/case_files/1113064/train_dataset_train_data_Mediawise.zip
!unzip /content/train_dataset_train_data_Mediawise.zip

In [None]:
!pip install -q dill

In [None]:
import pandas as pd
import numpy as np
from joblib import dump, load
import dill
import ast
from sklearn.metrics.pairwise import cosine_similarity, rbf_kernel, manhattan_distances, euclidean_distances
import lightgbm as lgb

In [None]:
# Load the training set; later cells read its 'targetAudience', 'points' and
# 'value' columns.
train = pd.read_json('/content/train_data_Mediawise/train_data.json')

In [None]:
# Flatten the nested targetAudience dict into plain columns.
for cnt in ['gender', 'ageFrom', 'ageTo', 'income']:
    train[cnt] = train['targetAudience'].apply(lambda x: x[cnt])
train['ageFrom,ageTo'] = train.apply(lambda x: [x['ageFrom'], x['ageTo']], axis=1)

# Gender label -> list of gender tokens. Written out explicitly: zipping
# train['gender'].unique() with a fixed list made the mapping depend on the
# order in which the labels happen to appear in the data.
gdic = {'all': ['female', 'male'], 'female': ['female'], 'male': ['male']}
for gender in train['gender'].unique():
    gdic.setdefault(gender, [gender])  # any unexpected label maps to itself

# All distinct placement points. A single flattening pass replaces
# Series.agg(sum), which concatenates the per-row lists one by one (quadratic).
all_points = [point for row_points in train['points'] for point in row_points]
locs = pd.Series(pd.Series(all_points, dtype=object).astype(str).unique()).apply(ast.literal_eval)

# Keep only points whose mean distance (in lat/lon units) to every other point
# is below 2, i.e. drop far-away outliers.
coords = np.array([[float(p['lat']), float(p['lon'])] for p in locs])
locs = locs[euclidean_distances(coords).mean(axis=1) < 2]

In [None]:
class DensityRegressor():
    '''Ensemble density-estimation model. To get a fitted regressor:
    dr=DensityRegressor(train=train)

    Four pipelines (LinearSVR, RandomForest, LightGBM, ExtraTrees), each with
    its own encoding of the target audience and of the placement points, are
    combined by a VotingRegressor fitted on ``train['value']``.

    Attributes set by ``__init__``:
        gdic  - gender label -> list of gender tokens;
        locs  - Series of distinct placement-point dicts seen in training
                (outliers removed);
        train - copy of the training frame with the flat audience columns;
        vr    - the fitted VotingRegressor.
    '''

    def __init__(self, train):
        '''Fit the ensemble.

        train - DataFrame with columns 'targetAudience' (dict with keys
        gender, ageFrom, ageTo, income), 'points' (list of dicts with at
        least 'lat' and 'lon') and 'value' (regression target).
        The frame passed in is not modified; a copy is stored in self.train.
        '''
        # Imports are kept local, as in the original implementation.
        import pandas as pd
        import numpy as np
        from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
        from sklearn.preprocessing import (OneHotEncoder, FunctionTransformer)
        from sklearn.ensemble import (ExtraTreesRegressor, VotingRegressor, RandomForestRegressor)
        from sklearn.svm import LinearSVR
        from sklearn.pipeline import Pipeline
        from sklearn.compose import ColumnTransformer
        from sklearn.metrics.pairwise import euclidean_distances
        import ast
        import lightgbm as lgb

        train = self._add_audience_columns(train)

        # Explicit mapping: zipping train['gender'].unique() with a fixed list
        # made the result depend on the order of labels in the data.
        self.gdic = {'all': ['female', 'male'], 'female': ['female'], 'male': ['male']}
        for gender in train['gender'].unique():
            self.gdic.setdefault(gender, [gender])  # unexpected label -> itself

        # Distinct points, de-duplicated through their string form. One
        # flattening pass instead of Series.agg(sum) (quadratic list concat).
        all_points = [point for row_points in train['points'] for point in row_points]
        locs = pd.Series(pd.Series(all_points, dtype=object).astype(str).unique()).apply(ast.literal_eval)
        # Drop outliers: keep points whose mean distance to all points is < 2.
        coords = np.array([[float(p['lat']), float(p['lon'])] for p in locs])
        self.locs = locs[euclidean_distances(coords).mean(axis=1) < 2]
        self.train = train

        def gender_tokens(gender):
            return self.gdic[gender]

        def age_tokens(age_range):
            # one token per year of the inclusive [ageFrom, ageTo] range
            return [str(age) for age in range(age_range[0], age_range[1] + 1)]

        def full_point_tokens(points):
            # token = the whole point dict rendered as text
            return [str(point) for point in points]

        def latlon_tokens(points):
            # An ordered tuple, not a set: str() of a set of strings depends
            # on the per-process hash seed, so a model restored in another
            # process could produce tokens missing from the fitted vocabulary.
            return [str((point['lat'], point['lon'])) for point in points]

        def token_counter(tokenizer):
            return CountVectorizer(tokenizer=tokenizer, token_pattern=None, lowercase=False)

        def income_chars():
            return CountVectorizer(analyzer='char', lowercase=False)

        pipe0 = Pipeline([
            ('input', ColumnTransformer([
                ('oe', OneHotEncoder(handle_unknown='ignore'), ['income']),
                ('gender', token_counter(gender_tokens), 'gender'),
                ('ageFrom,ageTo', token_counter(age_tokens), 'ageFrom,ageTo'),
                ('coordinates', token_counter(full_point_tokens), 'points'),
            ])),
            ('tfidf', TfidfTransformer()),
            ('cls', LinearSVR(C=10., loss='squared_epsilon_insensitive')),
        ])
        pipe1 = Pipeline([
            ('input', ColumnTransformer([
                ('ageFrom,ageTo', token_counter(age_tokens), 'ageFrom,ageTo'),
                ('gender', token_counter(gender_tokens), 'gender'),
                ('income', income_chars(), 'income'),
                ('coordinates', token_counter(latlon_tokens), 'points'),
            ])),
            ('cls', RandomForestRegressor(n_estimators=100)),
        ])
        pipe2 = Pipeline([
            ('input', ColumnTransformer([
                ('passthrough', FunctionTransformer(lambda x: x.astype(float)), ['ageFrom', 'ageTo']),
                ('gender', token_counter(gender_tokens), 'gender'),
                ('income', income_chars(), 'income'),
                ('coordinates', token_counter(latlon_tokens), 'points'),
            ])),
            ('cls', lgb.sklearn.LGBMRegressor(n_estimators=250)),
        ])
        pipe3 = Pipeline([
            ('input', ColumnTransformer([
                ('ageFrom,ageTo', token_counter(age_tokens), 'ageFrom,ageTo'),
                ('gender', token_counter(gender_tokens), 'gender'),
                ('income', income_chars(), 'income'),
                ('coordinates', token_counter(latlon_tokens), 'points'),
            ])),
            ('cls', ExtraTreesRegressor(n_estimators=100)),
        ])
        self.vr = VotingRegressor([('0', pipe0), ('1', pipe1), ('2', pipe2), ('3', pipe3)]).fit(train, train['value'])

    def _add_audience_columns(self, frame):
        '''Return a copy of frame with the targetAudience fields as flat columns.'''
        frame = frame.copy()
        for field in ('gender', 'ageFrom', 'ageTo', 'income'):
            frame[field] = [audience[field] for audience in frame['targetAudience']]
        frame['ageFrom,ageTo'] = pd.Series(
            [[age_from, age_to] for age_from, age_to in zip(frame['ageFrom'], frame['ageTo'])],
            index=frame.index, dtype=object)
        return frame

    def _make_den(self, aud, point_lists):
        '''Build a model-input frame: one row per list of points, same audience.'''
        den = pd.DataFrame({'points': point_lists})
        den['targetAudience'] = [aud] * len(den)
        return self._add_audience_columns(den)

    def get_den(self, aud):
        '''Build a dataframe with all known points (one point per row)
        for the audience dict aud.'''
        return self._make_den(aud, [[point] for point in self.locs])

    def predict_points(self, aud, points=None, sorted=True, top_k=None, score=False):
        '''Predict placement points for a given target audience. Parameters:
        aud - target audience as a dict
        {'name': 'All 18+', 'gender': 'female', 'ageFrom': 18, 'ageTo': 100, 'income': 'abc'};
        points - subset of points as a series of dicts
        0      {'lat': '55.573691', 'lon': '37.631423', 'azim...
        1      {'lat': '55.584765', 'lon': '37.712454', 'azim...;
        (defaults to all points known from training);
        sorted - sort by weight, descending;
        top_k - keep the best k points;
        score - also return the score predicted for the selected points together
        (as a tuple with the dataframe of points); only honoured with top_k.

        Returns a dataframe with columns lat, lon, weights containing only the
        points with a positive predicted weight, or (dataframe, score).
        '''
        source = self.locs if points is None else points
        den = self._make_den(aud, [[point] for point in source])
        weights = self.vr.predict(den)

        positive = weights > 0
        # Points that survive the filter, in the same 0..n-1 order as `out`.
        kept = [cell[0] for cell, keep in zip(den['points'], positive) if keep]
        out = pd.DataFrame({'lat': [float(point['lat']) for point in kept],
                            'lon': [float(point['lon']) for point in kept],
                            'weights': weights[positive]})
        if points is None:
            out['weights'] = 100 * out['weights'] / out['weights'].sum()
        else:
            # NOTE(review): unlike the branch above this one is not multiplied
            # by 100 - confirm that the different scale is intended.
            out['weights'] = out['weights'] / max(100, out['weights'].sum())
        if sorted:
            out = out.sort_values(by='weights', ascending=False)
        if top_k is not None:
            out = out.iloc[:top_k].copy()
            # out.index refers to positions in `kept`, not to rows of `den`;
            # indexing `den` with it picked the wrong points whenever some
            # rows had been dropped by the weights > 0 filter.
            chosen = [kept[position] for position in out.index]
            # Score all selected points together as one multi-point row.
            den2 = self._make_den(aud, [chosen])
            score_ = min(100., self.vr.predict(den2)[0])
            out['weights'] = score_ * out['weights'] / out['weights'].sum()
        if (score) and (top_k is not None):
            out = (out, score_)
        return out

    def predict(self, test):
        '''Predict on a test dataset (same layout as the training frame).
        The frame passed in is not modified.'''
        return self.vr.predict(self._add_audience_columns(test))

# Примеры

In [None]:
# Fit the voting ensemble on the full training frame (the LightGBM log below
# comes from this fit).
dr = DensityRegressor(train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3564
[LightGBM] [Info] Number of data points in the train set: 1547, number of used features: 1634
[LightGBM] [Info] Start training from score 22.755346


In [None]:
# Serialise with dill (the model's pipelines hold lambdas/closures), then
# persist the resulting bytes with joblib. A named variable is used because
# assigning to `_` shadows IPython's "last output" variable.
dr_bytes = dill.dumps(dr)
dump(dr_bytes, 'dr.joblib')

['dr.joblib']

In [None]:
# Restore the regressor saved above. NOTE: unpickling executes code stored in
# the file - only load 'dr.joblib' files you produced yourself.
dr2 = dill.loads(load('dr.joblib'))

In [None]:
# unsorted points for a subset: take the first 1000 candidate locations
points = locs[:1000]
points

0      {'lat': '55.573691', 'lon': '37.631423', 'azim...
1      {'lat': '55.584765', 'lon': '37.712454', 'azim...
2      {'lat': '55.808425457052', 'lon': '37.38880796...
3      {'lat': '55.674378', 'lon': '37.422364', 'azim...
4      {'lat': '55.608396', 'lon': '37.766383', 'azim...
                             ...                        
995    {'lat': '55.688370564869', 'lon': '37.48246445...
996    {'lat': '55.671153313516', 'lon': '37.45158255...
997    {'lat': '55.773404972642', 'lon': '37.48560905...
998    {'lat': '55.663751788509', 'lon': '37.61293969...
999    {'lat': '55.734352002225', 'lon': '37.70921677...
Length: 1000, dtype: object

In [None]:
# Weights for the subset above, kept in input order (only rows with a positive
# predicted weight are returned).
dr2.predict_points({'name': 'All 18+', 'gender': 'male', 'ageFrom': 18, 'ageTo': 70, 'income': 'abc'},
                   points=points,
                   sorted=False)

Unnamed: 0,lat,lon,weights
0,55.573691,37.631423,0.000911
1,55.584765,37.712454,0.002158
2,55.808425,37.388808,0.001079
3,55.674378,37.422364,0.004385
4,55.608396,37.766383,0.002916
...,...,...,...
841,55.785316,37.635409,0.000843
842,55.688371,37.482464,0.001648
843,55.671153,37.451583,0.000682
844,55.773405,37.485609,0.001264


In [None]:
# unsorted points (all locations known to the model, since `points` is omitted)
dr.predict_points({'name': 'All 18+', 'gender': 'female', 'ageFrom': 18, 'ageTo': 100, 'income': 'abc'},
                  sorted=False)

Unnamed: 0,lat,lon,weights
0,55.674378,37.422364,0.176785
1,55.608396,37.766383,0.018359
2,55.710000,37.387500,2.112895
3,55.827620,37.832285,0.281103
4,55.796140,37.377824,0.751069
...,...,...,...
296,55.623300,37.477410,0.067766
297,55.631418,37.439089,0.487570
298,55.816474,37.349728,0.168820
299,55.811540,37.027981,0.711226


In [None]:
# sorted points (descending weight; `sorted=True` is the default)
dr2.predict_points({'name': 'All 18+', 'gender': 'all', 'ageFrom': 18, 'ageTo': 100, 'income': 'abc'})

Unnamed: 0,lat,lon,weights
1735,55.638619,37.826126,1.727899
2520,55.638619,37.826126,1.718145
1562,55.712448,37.382685,0.822824
2093,55.637389,37.693260,0.695785
6,55.710000,37.387500,0.615598
...,...,...,...
125,55.747308,37.691356,0.000056
1008,55.832077,37.653998,0.000037
1048,55.649784,37.661985,0.000037
936,55.746143,37.600418,0.000010


In [None]:
# top_k points: keep only the 50 highest-weighted locations
dr2.predict_points({'name': 'All 18+', 'gender': 'all', 'ageFrom': 18, 'ageTo': 100, 'income': 'abc'},
                   top_k=50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den2['points']=[den['points'].apply(lambda x: x[0]).loc[out.index].tolist()]


Unnamed: 0,lat,lon,weights
1735,55.638619,37.826126,1.465261
2520,55.638619,37.826126,1.45699
1562,55.712448,37.382685,0.697756
2093,55.637389,37.69326,0.590027
6,55.71,37.3875,0.522028
109,55.711028,37.621656,0.501931
477,55.711028,37.621656,0.495968
1206,55.71,37.3875,0.494877
824,55.866624,37.703882,0.482547
209,55.866624,37.703882,0.476563


In [None]:
# top_k points and the expected reach: with score=True the call returns a
# (points dataframe, score) tuple
dr2.predict_points({'name': 'All 18+', 'gender': 'all', 'ageFrom': 18, 'ageTo': 100, 'income': 'abc'},
                   top_k=50, score=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den2['points']=[den['points'].apply(lambda x: x[0]).loc[out.index].tolist()]


(            lat        lon   weights
 1735  55.638619  37.826126  1.465261
 2520  55.638619  37.826126  1.456990
 1562  55.712448  37.382685  0.697756
 2093  55.637389  37.693260  0.590027
 6     55.710000  37.387500  0.522028
 109   55.711028  37.621656  0.501931
 477   55.711028  37.621656  0.495968
 1206  55.710000  37.387500  0.494877
 824   55.866624  37.703882  0.482547
 209   55.866624  37.703882  0.476563
 2393  55.752910  37.575536  0.363986
 1297  55.849420  37.566226  0.338057
 280   55.772184  37.420297  0.337726
 2073  55.636359  37.620593  0.327770
 2311  55.772184  37.420297  0.321577
 1511  55.775163  37.583671  0.321051
 1404  55.571499  37.666427  0.318290
 912   55.740009  37.653849  0.308601
 1419  55.884140  37.443990  0.302727
 1076  55.733633  37.542920  0.292545
 1403  55.571499  37.666427  0.278726
 2202  55.571499  37.666427  0.278146
 1941  55.676713  37.625299  0.277790
 3045  55.741589  37.429402  0.270247
 1678  55.784577  37.876483  0.266884
 1155  55.75

In [None]:
# prediction on a test-style frame (here the training frame itself is passed)
dr2.predict(train)

array([23.83286964,  0.73901049,  4.06723124, ..., 38.82096832,
       53.63247591, 53.63247591])

# Метрики

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, ShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit, \
    GridSearchCV, KFold, StratifiedKFold

# Make sure the flat audience columns exist on `train` before it is passed
# straight to the fitted VotingRegressor in the cross-validation cells.
for cnt in ['gender', 'ageFrom', 'ageTo', 'income']:
    train[cnt] = train['targetAudience'].apply(lambda x: x[cnt])
train['ageFrom,ageTo'] = train.apply(lambda x: [x['ageFrom'], x['ageTo']], axis=1)
# Explicit gender mapping: zipping train['gender'].unique() with a fixed list
# made the result depend on the order of labels in the data.
gdic = {'all': ['female', 'male'], 'female': ['female'], 'male': ['male']}
for gender in train['gender'].unique():
    gdic.setdefault(gender, [gender])  # any unexpected label maps to itself

In [None]:
# Cross-validated R^2 of the voting ensemble. The shuffle is seeded so the
# fold assignment is the same on every run of the notebook.
est = cross_val_score(dr.vr,
                      train,
                      train['value'], scoring='r2',
                      cv=KFold(shuffle=True, random_state=0), verbose=3)
print(est.mean())
print(est.std())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2542
[LightGBM] [Info] Number of data points in the train set: 1237, number of used features: 1149
[LightGBM] [Info] Start training from score 22.870744
[CV] END ................................ score: (test=0.872) total time= 1.9min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2825
[LightGBM] [Info] Number of data points in the train set: 1237, number of used features: 1285
[LightGBM] [Info] Start training from score 22.750647
[CV] END ................................ score: (test=0.874) total time= 1.7min
[LightGBM] [Info] Auto

In [None]:
# Cross-validated RMSE of the voting ensemble (scikit-learn reports it negated,
# so values closer to zero are better). The shuffle is seeded so the fold
# assignment is the same on every run of the notebook.
est = cross_val_score(dr.vr,
                      train,
                      train['value'], scoring='neg_root_mean_squared_error',
                      cv=KFold(shuffle=True, random_state=0), verbose=3)
print(est.mean())
print(est.std())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2709
[LightGBM] [Info] Number of data points in the train set: 1237, number of used features: 1241
[LightGBM] [Info] Start training from score 22.726492
[CV] END ............................... score: (test=-8.225) total time= 1.7min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2584
[LightGBM] [Info] Number of data points in the train set: 1237, number of used features: 1172
[LightGBM] [Info] Start training from score 22.626370
[CV] END ............................... score: (test=-7.555) total time= 1.6min
[LightGBM] [Info] Auto