In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

In [2]:
DATA_PATH = '../data/'
POSTPROCESSING = 'postprocessing/'

Работа с жанрами контента

In [3]:
content = pd.read_csv(f'{DATA_PATH}{POSTPROCESSING}content.csv',
                      index_col='content_uid')

In [4]:
content.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3829 entries, 9882 to 10190
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3829 non-null   object 
 1   type              3829 non-null   object 
 2   serial_id         0 non-null      float64
 3   genres            3810 non-null   object 
 4   duration_seconds  3829 non-null   float64
dtypes: float64(2), object(3)
memory usage: 179.5+ KB


Заполение пропусков

In [5]:
content['genres'] = content['genres'].fillna('отсутствует')

In [6]:
print(f"Изначальное количесво уникальных жанров{len(content['genres'].unique())}")

Изначальное количесво уникальных жанров519


Работа с жанрами
* Избавление от пунктуации
* Избавление от стоп-слов
* Морфолизация

In [7]:
import string
import pymorphy2
from nltk.stem import SnowballStemmer

morph = pymorphy2.MorphAnalyzer()
snowball = SnowballStemmer(language="russian")

In [8]:
import nltk
nltk.download('punkt')
from nltk.stem.snowball import RussianStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
stemmer = RussianStemmer(False)

In [10]:
content['genres'] = content['genres'].apply(
    lambda x: [morph.parse(word.lower())[0].normal_form
               for word in nltk.word_tokenize(x) if (word not in string.punctuation) and
                    (word not in nltk.corpus.stopwords.words('russian'))]
)


In [11]:
unique = []
for item in content.genres:
    for word in item:
        if word not in unique:
            unique.append(word)

In [12]:
print(f'Финальное количесво жанров{len(unique)}')

Финальное количесво жанров71


In [13]:
content.genres

content_uid
9882      [игровой, шоу, музыка]
19870                  [комедия]
2312        [мелодрама, комедия]
12655           [драма, комедия]
25771        [комедийный, драма]
                  ...           
20786              [реалити-шоу]
4037     [биографический, драма]
10201            [сериал, драма]
27405             [исторический]
10190                [мелодрама]
Name: genres, Length: 3829, dtype: object

Создание новой таблицы - принадлежности контента к каждому из жанров

In [14]:
film_data = pd.DataFrame(columns=unique, index=content.index)
film_data.fillna(0, inplace=True)

for _, film in content.iterrows():
    if film.genres != 'отсутствовать':
        for genre in film.genres:
            film_data.loc[film.name, genre] = 1


In [15]:
film_data.to_csv(f'{DATA_PATH}{POSTPROCESSING}film_genres.csv')

Созднание новой таблицы - сколько посмотрел каждый пользователей по типам контента

In [16]:
history = pd.read_csv(f'{DATA_PATH}{POSTPROCESSING}watch_history.csv', index_col=0)
users_types = pd.DataFrame(columns=[content.type.unique()], index=history.user_uid.unique()).fillna(0)


In [17]:
history['type'] = content.loc[history.content_uid.values]['type'].values

In [18]:
type_history = history[['type', 'second','user_uid']].groupby(by=['user_uid', 'type']).sum().reset_index()
type_history

Unnamed: 0,user_uid,type,second
0,1,movie,6720
1,4,movie,23638
2,4,serial_with_season,30169
3,25,movie,23969
4,25,serial_with_season,665
...,...,...,...
265196,1302880,movie,26
265197,1302881,movie,87822
265198,1302912,movie,16
265199,1302913,movie,118


In [19]:
users_types

Unnamed: 0,serial_with_season,movie,serial_without_season
1,0,0,0
4,0,0,0
25,0,0,0
43,0,0,0
45,0,0,0
...,...,...,...
1302880,0,0,0
1302881,0,0,0
1302912,0,0,0
1302913,0,0,0


In [20]:
for _, row in type_history.iterrows():

    users_types.loc[row.user_uid][row.type] = row.second

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_types.loc[row.user_uid][row.type] = row.second
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc[key] = value


In [21]:
users_types.to_csv(f'{DATA_PATH}{POSTPROCESSING}user_content_types.csv')

Создание новой таблицы - сколько посмотрел пользователь по жанрам

In [22]:
users_genres = pd.merge(history, film_data, on='content_uid').drop(columns=['content_uid', 'type'])
users_genres

Unnamed: 0,user_uid,second,игровой,шоу,музыка,комедия,мелодрама,драма,комедийный,фэнтезийный,...,молодёжь,детский,ситком,индонезия,психологический,увлечение,досуг,концерт,скетчкома,интервью
0,1,627,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,481,8129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,577,65,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1780,429,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2271,152,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981247,1270616,19,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981248,1276791,3,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981249,1295114,310,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
981250,1299863,511,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
users_genres.iloc[:, 2:] = users_genres.iloc[:, 2:].apply(lambda x: x * users_genres.second)

In [24]:
users_genres = users_genres.drop(columns='second').groupby(by='user_uid').sum()

In [25]:
users_genres.to_csv(f'{DATA_PATH}{POSTPROCESSING}users_genres.csv')
