In [42]:
import pandas as pd
import numpy as np

In [43]:
DATA_PATH = '../data/'
POSTPROCESSING = 'postprocessing/'

In [44]:
content = pd.read_csv(f'{DATA_PATH}{POSTPROCESSING}content.csv',
                      index_col='content_uid')

In [45]:
content.loc[10201]

name                 Ярмарка тщеславия
type                serial_with_season
serial_id                          NaN
genres                    Сериал,Драма
duration_seconds                 19704
Name: 10201, dtype: object

In [46]:
content.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3829 entries, 9882 to 10190
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3829 non-null   object 
 1   type              3829 non-null   object 
 2   serial_id         0 non-null      float64
 3   genres            3810 non-null   object 
 4   duration_seconds  3829 non-null   float64
dtypes: float64(2), object(3)
memory usage: 339.5+ KB


In [47]:
content['genres'] = content['genres'].fillna('отсутствует')

In [48]:
len(content['genres'].unique())

519

In [49]:
import string
import pymorphy2
from nltk.stem import SnowballStemmer

morph = pymorphy2.MorphAnalyzer()
snowball = SnowballStemmer(language="russian")

In [50]:
import nltk
nltk.download('punkt')
from nltk.stem.snowball import RussianStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [51]:
stemmer = RussianStemmer(False)

In [52]:
content['genres'] = content['genres'].apply(
    lambda x: [morph.parse(word.lower())[0].normal_form
               for word in nltk.word_tokenize(x) if (word not in string.punctuation) and
                    (word not in nltk.corpus.stopwords.words('russian'))]
)


In [53]:
all_genres = dict()
unique = []
for item in content.genres:
    if 'и' in item:
        print(item)
    for word in item:
        if word not in unique:
            unique.append(word)
        if word in all_genres.keys():
            all_genres[word] += 1
        else:
            all_genres[word] = 1

In [54]:
len(unique)

71

In [55]:
all_genres

{'игровой': 4,
 'шоу': 166,
 'музыка': 16,
 'комедия': 856,
 'мелодрама': 495,
 'драма': 1398,
 'комедийный': 113,
 'фэнтезийный': 5,
 'боевик': 470,
 'приключение': 247,
 'криминальный': 122,
 'детектив': 230,
 'триллер': 485,
 'военный': 153,
 'криминал': 185,
 'ужас': 166,
 'фантастика': 167,
 'биографический': 34,
 'фэнтези': 171,
 'мюзикл': 38,
 'хоррор': 7,
 'документальный': 71,
 'биография': 67,
 'отсутствовать': 19,
 'юмор': 37,
 'стендап': 24,
 'фантастический': 70,
 'мультипликационный': 107,
 'фильм': 158,
 'спорт': 39,
 'мистика': 28,
 'исторический': 22,
 'кулинария': 7,
 'реалити-шоу': 43,
 'мультфильм': 97,
 'история': 30,
 'испания': 16,
 'приключенческий': 23,
 'ребёнок': 19,
 'семейный': 107,
 'трагикомедия': 24,
 'ток-шоу': 7,
 'вестерн': 19,
 'мультсериал': 61,
 'романтический': 13,
 'для': 3,
 'взрослый': 3,
 'анимация': 11,
 'спортивный': 16,
 'мистический': 5,
 'путешествие': 4,
 'сказка': 18,
 'развлекательный': 13,
 'программа': 16,
 'научный': 1,
 'викторин':

In [56]:
content.genres

content_uid
9882      [игровой, шоу, музыка]
19870                  [комедия]
2312        [мелодрама, комедия]
12655           [драма, комедия]
25771        [комедийный, драма]
                  ...           
20786              [реалити-шоу]
4037     [биографический, драма]
10201            [сериал, драма]
27405             [исторический]
10190                [мелодрама]
Name: genres, Length: 3829, dtype: object

In [57]:
path = '../navec_hudlit_v1_12B_500K_300d_100q.tar'
from navec import Navec

navec = Navec.load(path)

In [58]:
a = navec['для']
b = navec['комедия']

In [59]:
from scipy.spatial.distance import cosine

cosine(a, b)

0.9862090367823839

In [60]:
for item in unique:
    for item2 in unique:
        try:
            if item != item2:
                if 0.65 > cosine(navec[item], navec[item2]) > 0:
                    #print(f'{item} - {item2}')
                    pass
        except KeyError:
            #print(f'error - {item} - {item2}')
            pass



In [61]:
film_data = pd.DataFrame(columns=unique, index=content.index)
film_data.fillna(0, inplace=True)

for _, film in content.iterrows():
    if film.genres != 'отсутствовать':
        for genre in film.genres:
            film_data.loc[film.name, genre] = 1


In [None]:
film_data.to_csv(f'{DATA_PATH}{POSTPROCESSING}film_genres.csv')

In [62]:
history = pd.read_csv(f'{DATA_PATH}{POSTPROCESSING}watch_history.csv', index_col=0)
users_types = pd.DataFrame(columns=[content.type.unique()], index=history.user_uid.unique()).fillna(0)

In [63]:
history

Unnamed: 0,user_uid,content_uid,second
0,1,11882.0,627
1,1,26174.0,5693
2,1,26719.0,400
3,4,2829.0,12
4,4,3836.0,64
...,...,...,...
981247,1302881,29054.0,476
981248,1302912,25224.0,16
981249,1302913,27762.0,118
981250,1302914,24797.0,26


In [64]:
history['type'] = content.loc[history.content_uid.values]['type'].values

In [65]:
type_history = history[['type', 'second','user_uid']].groupby(by=['user_uid', 'type']).sum().reset_index()
type_history

Unnamed: 0,user_uid,type,second
0,1,movie,6720
1,4,movie,23638
2,4,serial_with_season,30169
3,25,movie,23969
4,25,serial_with_season,665
...,...,...,...
265196,1302880,movie,26
265197,1302881,movie,87822
265198,1302912,movie,16
265199,1302913,movie,118


In [66]:
users_types

Unnamed: 0,serial_with_season,movie,serial_without_season
1,0,0,0
4,0,0,0
25,0,0,0
43,0,0,0
45,0,0,0
...,...,...,...
1302880,0,0,0
1302881,0,0,0
1302912,0,0,0
1302913,0,0,0


In [67]:
for _, row in type_history.iterrows():

    users_types.loc[row.user_uid][row.type] = row.second

In [None]:
users_types.to_csv(f'{DATA_PATH}{POSTPROCESSING}user_content_types.csv')

In [68]:
film_data

Unnamed: 0_level_0,игровой,шоу,музыка,комедия,мелодрама,драма,комедийный,фэнтезийный,боевик,приключение,...,молодёжь,детский,ситком,индонезия,психологический,увлечение,досуг,концерт,скетчкома,интервью
content_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9882,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19870,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2312,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12655,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25771,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4037,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10201,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
users_genres = pd.merge(history, film_data, on='content_uid').drop(columns=['content_uid', 'type'])
users_genres

Unnamed: 0,user_uid,second,игровой,шоу,музыка,комедия,мелодрама,драма,комедийный,фэнтезийный,...,молодёжь,детский,ситком,индонезия,психологический,увлечение,досуг,концерт,скетчкома,интервью
0,1,627,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,481,8129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,577,65,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1780,429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2271,152,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981247,1270616,19,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981248,1276791,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981249,1295114,310,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981250,1299863,511,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [70]:
users_genres.iloc[:, 2:] = users_genres.iloc[:, 2:].apply(lambda x: x * users_genres.second)

In [71]:
users_genres = users_genres.drop(columns='second').groupby(by='user_uid').sum()

In [75]:

users_genres.to_csv(f'{DATA_PATH}{POSTPROCESSING}users_genres.csv')

In [72]:
users_genres

Unnamed: 0_level_0,игровой,шоу,музыка,комедия,мелодрама,драма,комедийный,фэнтезийный,боевик,приключение,...,молодёжь,детский,ситком,индонезия,психологический,увлечение,досуг,концерт,скетчкома,интервью
user_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,400,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,27683,12981,12324,5154,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,900,6920,10666,1559,0,14166,0,...,0,0,0,0,0,0,0,0,0,0
43,0,0,0,599,61,61,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,1470,1938,4257,293,0,6029,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302880,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1302881,0,0,0,25366,13850,39685,7374,0,4852,804,...,0,0,0,0,0,0,0,0,0,0
1302912,0,0,0,0,0,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1302913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
users_types

Unnamed: 0,serial_with_season,movie,serial_without_season
1,0,6720,0
4,30169,23638,0
25,665,23969,0
43,0,660,0
45,0,14370,0
...,...,...,...
1302880,0,26,0
1302881,0,87822,0
1302912,0,16,0
1302913,0,118,0


In [74]:
film_data

Unnamed: 0_level_0,игровой,шоу,музыка,комедия,мелодрама,драма,комедийный,фэнтезийный,боевик,приключение,...,молодёжь,детский,ситком,индонезия,психологический,увлечение,досуг,концерт,скетчкома,интервью
content_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9882,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19870,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2312,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12655,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25771,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4037,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10201,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
