In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import src.feature_preprocess as feature_preprocess
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pyLDAvis
import pyLDAvis.sklearn



# Clusterization

In [2]:
# Read the full raw training table from disk and report its (rows, columns) shape.
train = pd.read_csv(filepath_or_buffer="data/train.csv")
print(train.shape)

(44854516, 10)


У нас очень много строк — LDA будет работать слишком долго, поэтому оставим только лучшие строки (по сути, найденные кластеры должны быть глобальными, поэтому отбрасывание части строк не должно быть проблемой).

Только у Android-приложений bundle id имеют говорящие названия, поэтому оставим только их.

In [3]:
# Normalise the casing of the OS label, store it back, then keep only Android rows.
os_upper = train['os'].str.upper()
train['os'] = os_upper
train = train.loc[os_upper == 'ANDROID']

Удалим строчки с пропусками и дубли.

In [None]:
# Remove rows with any missing value first, then exact duplicate rows.
train = train.dropna().drop_duplicates()
train.shape

(20918176, 10)

Всё ещё слишком много — возьмём случайное подмножество строк.

In [None]:
# Down-sample to one million rows so that LDA can be fitted in reasonable time.
# The seed is fixed (same value as the LDA random_state used later in this notebook)
# so that re-running the notebook draws the same subset and yields the same topics;
# without it every run produces a different cached sample.
train = train.sample(n=1000000, random_state=12345)

In [None]:
# Cache the reduced sample on disk.
# index=False keeps the (shuffled, meaningless) row labels out of the file; otherwise
# pd.read_csv brings them back as a spurious "Unnamed: 0" column.
train.to_csv('data/train_na.csv', index=False)

In [2]:
# Reload the cached sample so the expensive preparation cells need not be re-run.
train = pd.read_csv(filepath_or_buffer='data/train_na.csv')

Теперь добавим интерпретируемые теги.

In [3]:
# Start from a fresh 0..n-1 index (presumably so the tag frames built below align
# with `train` in the column-wise concat -- confirm against feature_preprocess).
train = train.reset_index(drop=True)

# --- time tags: shorten the column labels and discard the weekday tag ---
time_tag_labels = {
    'loc_weekday_tag': 'weekday',
    'loc_is_weekend_tag': 'weekend',
    'loc_is_academic_year_tag': 'учебный год',
    'loc_time_of_day_tag': 'время',
}
time_features = feature_preprocess.make_features_from_time(train)
time_tags = (
    feature_preprocess.get_tags_from_time_features(time_features)
    .rename(columns=time_tag_labels)
    .drop(columns='weekday')
)
print('time_tags finished')

# --- city tags, built with the help of the external cities reference file ---
city_features = feature_preprocess.make_features_from_cities(train, "data/data_cities.csv")
city_tags = feature_preprocess.get_tags_from_cities_features(city_features)
print('city_tags finished')

# --- phone tags ---
phone_tags = feature_preprocess.phone_tags(train)
print('phone_tags finished')

# Attach every tag frame as extra columns, give the category columns their short
# labels, and drop rows where any column is missing.
train = (
    pd.concat([train, time_tags, city_tags, phone_tags], axis=1)
    .rename(columns={'gamecategory': 'Кат', 'subgamecategory': 'ПодКат'})
    .dropna()
)

time_tags finished
city_tags finished


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out['os'] = out['os'].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out['osv_num'] = data['osv'].astype(str).apply(get_version_float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out['new_phone'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

phone_tags finished


Теперь соберём bundle и теги в description.

In [4]:
# Columns that make up each description: bundle id, the two category columns,
# then every time / city / phone tag column.
description_columns = [
    'bundle', 'Кат', 'ПодКат',
    *time_tags.columns,
    *city_tags.columns,
    *phone_tags.columns,
]
# Build the descriptions and keep each distinct one only once.
description = feature_preprocess.create_description(train, description_columns).drop_duplicates()
description

Кат
(807608,)
ПодКат
(807608,)
weekend
(807608,)
учебный год
(807608,)
время
(807608,)
type_city
(807608,)
size_city
(807608,)
timezone
(807608,)
os_
(807608,)
new_phone
(807608,)


1         com.supercell.hayday.tag: Games tag_t: Кат.tag...
2         es.socialpoint.MonsterLegends.tag: Games tag_t...
3         me.zepeto.main.tag: Applications tag_t: Кат.ta...
4         com.merge.cube.winner.tag: Games tag_t: Кат.ta...
5         com.inspiredsquare.blocks.tag: Games tag_t: Ка...
                                ...                        
999957    com.bigcool.puzzle.candygenies.tag: Games tag_...
999966    com.feofun.elementalmaster.tag: Games tag_t: К...
999979    sort.water.puzzle.pour.color.tubes.sorting.gam...
999984    com.dunderbit.humbug.tag: Games tag_t: Кат.tag...
999989    com.infraware.office.link.tag: Applications ta...
Length: 290133, dtype: object

In [5]:
# Bag-of-words counts: each description is split into tokens on '.', lower-cased
# and accent-stripped.
count_vect = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    tokenizer=lambda text: text.split('.'),
)
matrix_count_vec = count_vect.fit_transform(description.astype(str))

# Priors passed to LDA: alpha -> doc_topic_prior, beta -> topic_word_prior.
alpha = 1
beta = 0.05

# Seven topics with a fixed seed; learning_method is left at the library default.
lda = LDA(
    n_components=7,
    doc_topic_prior=alpha,
    topic_word_prior=beta,
    random_state=12345,
    n_jobs=-1,
    verbose=True,
)
lda.fit(matrix_count_vec)

# Render the interactive topic visualisation as the cell output.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda, matrix_count_vec, count_vect)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


Посмотрим на покрытие