In [3]:
import pandas as pd
import numpy as np


### We have three tables with data about posts, users and their actions. 

In [4]:
# User profile table (semicolon-delimited export).
users_data = pd.read_csv(filepath_or_buffer='users', sep=';')

In [5]:
# Post table with the raw text and topic of every post (semicolon-delimited export).
posts_text = pd.read_csv(filepath_or_buffer='posts', sep=';')

In [6]:
# Feed action log (semicolon-delimited export).
feed_data = pd.read_csv(filepath_or_buffer='feeds_data_5million', sep=';')

Join the `feed_data` action log with the `users_data` profiles on `user_id`.

In [7]:
# Attach the user's profile columns to every feed row (inner join on user_id).
action_users = pd.merge(feed_data, users_data, on="user_id", how="inner")

In [8]:
action_users.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source
0,2021-12-29 23:51:06,147682,6676,view,0,0,24,Russia,Trubchevsk,1,Android,organic
1,2021-12-29 23:48:26,147682,5180,view,0,0,24,Russia,Trubchevsk,1,Android,organic
2,2021-12-29 23:45:27,147682,5967,view,0,0,24,Russia,Trubchevsk,1,Android,organic
3,2021-12-29 23:43:55,147682,1983,view,0,0,24,Russia,Trubchevsk,1,Android,organic
4,2021-12-29 23:43:08,147682,6699,view,0,0,24,Russia,Trubchevsk,1,Android,organic


### features by users

#### feature 1 


For every user we aggregate the feed rows as follows:
* every row counts as one view
* every row with `target == 1` counts as one like
* the feature is the ratio of likes to views

In [45]:
# Per-user totals: `views` is the number of rows with a timestamp, `likes` is
# the sum of `target`.  The like/view ratio is kept; the raw like count is
# dropped because it is not used later.
ratio_of_likes_posts = (
    action_users
    .groupby('user_id', as_index=False)
    .agg(views=('timestamp', 'count'), likes=('target', 'sum'))
    .assign(**{'ratio of likes posts': lambda totals: totals['likes'] / totals['views']})
    .drop(columns=['likes'])
)

In [46]:
ratio_of_likes_posts.head()

Unnamed: 0,user_id,views,ratio of likes posts
0,200,51,0.117647
1,201,26,0.153846
2,202,70,0.114286
3,204,28,0.178571
4,205,57,0.070175


#### feature 2
ratio of likes to topic

For every (user, topic) pair we aggregate the feed rows as follows:
* every row counts as one view of the topic
* every row with `target == 1` counts as one like of the topic
* the feature is the ratio of likes to views within the topic

In [47]:
# Tag each feed row with the topic of its post, then compute per (user, topic)
# view counts, like counts and the like/view ratio.
ratio_of_likes_topic = (
    pd.merge(action_users, posts_text[['post_id', 'topic']], on='post_id', how='inner')
    .groupby(['user_id', 'topic'], as_index=False)
    .agg(views=('timestamp', 'count'), likes=('target', 'sum'))
    .assign(**{'ratio of likes topic': lambda totals: totals['likes'] / totals['views']})
)

In [48]:
ratio_of_likes_topic

Unnamed: 0,user_id,topic,views,likes,ratio of likes topic
0,200,business,3,0,0.000000
1,200,covid,16,3,0.187500
2,200,movie,14,1,0.071429
3,200,politics,6,1,0.166667
4,200,sport,8,0,0.000000
...,...,...,...,...,...
698645,168551,entertainment,5,2,0.400000
698646,168551,movie,8,1,0.125000
698647,168551,politics,2,0,0.000000
698648,168551,sport,6,0,0.000000


Let's see the share of likes for each topic for each user

In [50]:
# Reshape to one row per user and one column per topic.  A (user, topic) pair
# with no rows in the input becomes NaN.
user_topic_likes_share = (
    ratio_of_likes_topic
    .pivot(index='user_id', columns='topic', values='ratio of likes topic')
    .rename_axis(columns=None)  # drop the leftover "topic" label on the column axis
    .reset_index()
)

In [51]:
user_topic_likes_share

Unnamed: 0,user_id,business,covid,entertainment,movie,politics,sport,tech
0,200,0.000000,0.187500,,0.071429,0.166667,0.000000,0.25
1,201,,0.000000,0.00,0.333333,0.000000,0.000000,0.00
2,202,0.000000,0.080000,0.00,0.166667,0.000000,0.166667,0.00
3,204,0.000000,0.166667,,0.133333,1.000000,0.250000,
4,205,0.333333,0.050000,0.00,0.200000,0.000000,0.000000,0.00
...,...,...,...,...,...,...,...,...
108048,168544,0.000000,0.100000,0.00,0.142857,0.000000,0.500000,0.00
108049,168545,0.000000,0.000000,0.25,0.083333,,0.333333,0.00
108050,168547,0.000000,0.043478,,0.090909,0.200000,0.000000,0.00
108051,168549,0.000000,0.000000,0.00,0.100000,0.000000,0.000000,0.00


In [52]:
# Keep only users that have per-topic ratios (inner join on user_id).
user_data_new = pd.merge(users_data, user_topic_likes_share, on='user_id', how='inner')

In [53]:
# Replace every NaN in the frame with 0.0 (topics the user has no rows for).
# NOTE(review): the all-users table built later in this notebook fills the same
# gaps with topic-level ratios instead of 0 -- confirm the difference is intended.
user_data_new = user_data_new.fillna(value=0.0)

In [54]:
# Add the per-user view count and overall like/view ratio.
user_data_new = pd.merge(user_data_new, ratio_of_likes_posts, how='inner', on='user_id')

Drop the `exp_group` feature because we do not need it at this stage.

In [55]:
# Training-side user features: everything except the experiment group.
user_data_feature = user_data_new.drop('exp_group', axis='columns')

In [56]:
user_data_feature

Unnamed: 0,user_id,gender,age,country,city,os,source,business,covid,entertainment,movie,politics,sport,tech,views,ratio of likes posts
0,168551,0,38,Russia,Moscow,iOS,organic,0.000000,0.250000,0.40,0.125000,0.000000,0.000000,0.00,30,0.166667
1,168549,0,18,Russia,Tula,Android,organic,0.000000,0.000000,0.00,0.100000,0.000000,0.000000,0.00,34,0.029412
2,168547,1,21,Russia,Magnitogorsk,Android,organic,0.000000,0.043478,0.00,0.090909,0.200000,0.000000,0.00,65,0.061538
3,168545,1,25,Russia,Berezniki,iOS,organic,0.000000,0.000000,0.25,0.083333,0.000000,0.333333,0.00,36,0.083333
4,168544,1,18,Ukraine,Odesa,iOS,organic,0.000000,0.100000,0.00,0.142857,0.000000,0.500000,0.00,27,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108048,205,0,32,Russia,Dugulubgey,Android,ads,0.333333,0.050000,0.00,0.200000,0.000000,0.000000,0.00,57,0.070175
108049,204,0,36,Russia,Anzhero-Sudzhensk,Android,ads,0.000000,0.166667,0.00,0.133333,1.000000,0.250000,0.00,28,0.178571
108050,202,1,17,Russia,Smolensk,Android,ads,0.000000,0.080000,0.00,0.166667,0.000000,0.166667,0.00,70,0.114286
108051,201,0,37,Russia,Abakan,Android,ads,0.000000,0.000000,0.00,0.333333,0.000000,0.000000,0.00,26,0.153846



Now we build the same features for all users, including those who have no actions in the loaded feed sample (their missing values are imputed below).


In [57]:
action_users.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source
0,2021-12-29 23:51:06,147682,6676,view,0,0,24,Russia,Trubchevsk,1,Android,organic
1,2021-12-29 23:48:26,147682,5180,view,0,0,24,Russia,Trubchevsk,1,Android,organic
2,2021-12-29 23:45:27,147682,5967,view,0,0,24,Russia,Trubchevsk,1,Android,organic
3,2021-12-29 23:43:55,147682,1983,view,0,0,24,Russia,Trubchevsk,1,Android,organic
4,2021-12-29 23:43:08,147682,6699,view,0,0,24,Russia,Trubchevsk,1,Android,organic


In [58]:
# Per-user totals: `views` = number of rows with a timestamp, `likes` = sum of `target`.
user_all_actions_agg = (
    action_users
    .groupby('user_id', as_index=False)
    .agg(views=('timestamp', 'count'), likes=('target', 'sum'))
)

In [59]:
# Per (user, topic) view counts, like counts and like share.
user_topic_all_actions = (
    pd.merge(action_users, posts_text[['post_id', 'topic']], on='post_id', how='inner')
    .groupby(['user_id', 'topic'], as_index=False)
    .agg(views=('timestamp', 'count'), likes=('target', 'sum'))
    .assign(like_share=lambda totals: totals['likes'] / totals['views'])
)
user_topic_all_actions.head()

Unnamed: 0,user_id,topic,views,likes,like_share
0,200,business,3,0,0.0
1,200,covid,16,3,0.1875
2,200,movie,14,1,0.071429
3,200,politics,6,1,0.166667
4,200,sport,8,0,0.0


In [60]:
# One row per user, one column per topic; (user, topic) pairs with no rows become NaN.
user_topic_likes_share_all = (
    user_topic_all_actions
    .pivot(index='user_id', columns='topic', values='like_share')
    .rename_axis(columns=None)  # drop the leftover "topic" label on the column axis
    .reset_index()
)

In [61]:
# Left join keeps every user; users without feed rows get NaN in all topic columns.
user_data_new_all = pd.merge(users_data, user_topic_likes_share_all, on='user_id', how='left')

In [62]:
user_data_new_all

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source,business,covid,entertainment,movie,politics,sport,tech
0,168552,1,16,Russia,Ivanteyevka,4,Android,organic,,,,,,,
1,168551,0,38,Russia,Moscow,3,iOS,organic,,0.250000,0.4,0.125000,0.000000,0.000000,0.00
2,168550,1,41,Russia,Yekaterinburg,4,Android,organic,,,,,,,
3,168549,0,18,Russia,Tula,2,Android,organic,0.0,0.000000,0.0,0.100000,0.000000,0.000000,0.00
4,168548,0,36,Russia,Kaliningrad,4,Android,organic,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads,0.0,0.166667,,0.133333,1.000000,0.250000,
163201,203,0,18,Russia,Moscow,1,iOS,ads,,,,,,,
163202,202,1,17,Russia,Smolensk,4,Android,ads,0.0,0.080000,0.0,0.166667,0.000000,0.166667,0.00
163203,201,0,37,Russia,Abakan,0,Android,ads,,0.000000,0.0,0.333333,0.000000,0.000000,0.00


In [63]:
# Add per-user view/like totals; still a left join, so users without feed rows get NaN.
user_data_new_all = pd.merge(user_data_new_all, user_all_actions_agg, how='left', on='user_id')

In [64]:
# All-users feature frame without the experiment group.
user_data_feature_all = user_data_new_all.drop('exp_group', axis='columns')

In [65]:
# Dataset-wide totals and the per-user averages derived from them; the averages
# are used below as fallbacks for users that have no feed rows.
all_views = action_users['timestamp'].count()
all_likes = action_users['target'].sum()
avg_views, avg_likes = (
    total / len(action_users['user_id'].unique())
    for total in (all_views, all_likes)
)

In [66]:
# Round both averages to whole numbers (the values stay floats).
avg_views, avg_likes = round(avg_views, 0), round(avg_likes, 0)

In [67]:
user_data_feature_all

Unnamed: 0,user_id,gender,age,country,city,os,source,business,covid,entertainment,movie,politics,sport,tech,views,likes
0,168552,1,16,Russia,Ivanteyevka,Android,organic,,,,,,,,,
1,168551,0,38,Russia,Moscow,iOS,organic,,0.250000,0.4,0.125000,0.000000,0.000000,0.00,30.0,5.0
2,168550,1,41,Russia,Yekaterinburg,Android,organic,,,,,,,,,
3,168549,0,18,Russia,Tula,Android,organic,0.0,0.000000,0.0,0.100000,0.000000,0.000000,0.00,34.0,1.0
4,168548,0,36,Russia,Kaliningrad,Android,organic,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,204,0,36,Russia,Anzhero-Sudzhensk,Android,ads,0.0,0.166667,,0.133333,1.000000,0.250000,,28.0,5.0
163201,203,0,18,Russia,Moscow,iOS,ads,,,,,,,,,
163202,202,1,17,Russia,Smolensk,Android,ads,0.0,0.080000,0.0,0.166667,0.000000,0.166667,0.00,70.0,8.0
163203,201,0,37,Russia,Abakan,Android,ads,,0.000000,0.0,0.333333,0.000000,0.000000,0.00,26.0,4.0


In [68]:
# Users without feed rows get the rounded per-user averages as their view and like counts.
user_data_feature_all = user_data_feature_all.fillna(
    value={'views': avg_views, 'likes': avg_likes}
)

In [69]:
user_data_feature_all.head()

Unnamed: 0,user_id,gender,age,country,city,os,source,business,covid,entertainment,movie,politics,sport,tech,views,likes
0,168552,1,16,Russia,Ivanteyevka,Android,organic,,,,,,,,46.0,6.0
1,168551,0,38,Russia,Moscow,iOS,organic,,0.25,0.4,0.125,0.0,0.0,0.0,30.0,5.0
2,168550,1,41,Russia,Yekaterinburg,Android,organic,,,,,,,,46.0,6.0
3,168549,0,18,Russia,Tula,Android,organic,0.0,0.0,0.0,0.1,0.0,0.0,0.0,34.0,1.0
4,168548,0,36,Russia,Kaliningrad,Android,organic,,,,,,,,46.0,6.0


In [70]:
# Dataset-wide view and like totals per topic; used to derive the fallback
# like ratio for users that have no rows in a topic.
topic_all_miss_val = (
    pd.merge(action_users, posts_text[['post_id', 'topic']], on='post_id', how='inner')
    .groupby(['topic'], as_index=False)
    .agg(views=('timestamp', 'count'), likes=('target', 'sum'))
)

In [71]:
topic_all_miss_val

Unnamed: 0,topic,views,likes
0,business,314482,43643
1,covid,1209057,166904
2,entertainment,252592,35626
3,movie,1849193,258040
4,politics,504728,66611
5,sport,666493,95123
6,tech,203455,23650


In [72]:
# Dataset-wide like/view ratio of each topic.
topic_all_miss_val = topic_all_miss_val.assign(
    **{'ratio of likes posts': topic_all_miss_val['likes'].div(topic_all_miss_val['views'])}
)

In [73]:
topic_all_miss_val

Unnamed: 0,topic,views,likes,ratio of likes posts
0,business,314482,43643,0.138777
1,covid,1209057,166904,0.138045
2,entertainment,252592,35626,0.141042
3,movie,1849193,258040,0.139542
4,politics,504728,66611,0.131974
5,sport,666493,95123,0.142722
6,tech,203455,23650,0.116242


In [78]:
topic_all_miss_val.query('topic == "business"')['ratio of likes posts'].item()

0.13877741810342087

In [79]:
def _global_like_ratio(topic_name):
    """Return the dataset-wide like/view ratio of ``topic_name`` as a Python float.

    Raises ``ValueError`` (from ``Series.item``) unless exactly one row of
    ``topic_all_miss_val`` matches the topic.
    """
    is_topic = topic_all_miss_val['topic'] == topic_name
    return topic_all_miss_val.loc[is_topic, 'ratio of likes posts'].item()


# Fallback values for users that have no rows in the given topic.
business_miss_val = _global_like_ratio('business')
covid_miss_val = _global_like_ratio('covid')
entertainment_miss_val = _global_like_ratio('entertainment')
movie_miss_val = _global_like_ratio('movie')
politics_miss_val = _global_like_ratio('politics')
sport_miss_val = _global_like_ratio('sport')
tech_miss_val = _global_like_ratio('tech')

In [80]:
# Fill each topic column's gaps with that topic's dataset-wide like ratio.
# Only the listed columns are touched.
user_data_feature_all = user_data_feature_all.fillna(
    value={
        'business': business_miss_val,
        'covid': covid_miss_val,
        'entertainment': entertainment_miss_val,
        'movie': movie_miss_val,
        'politics': politics_miss_val,
        'sport': sport_miss_val,
        'tech': tech_miss_val,
    }
)

In [81]:
user_data_feature_all

Unnamed: 0,user_id,gender,age,country,city,os,source,business,covid,entertainment,movie,politics,sport,tech,views,likes
0,168552,1,16,Russia,Ivanteyevka,Android,organic,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,6.0
1,168551,0,38,Russia,Moscow,iOS,organic,0.138777,0.250000,0.400000,0.125000,0.000000,0.000000,0.000000,30.0,5.0
2,168550,1,41,Russia,Yekaterinburg,Android,organic,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,6.0
3,168549,0,18,Russia,Tula,Android,organic,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.000000,34.0,1.0
4,168548,0,36,Russia,Kaliningrad,Android,organic,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163200,204,0,36,Russia,Anzhero-Sudzhensk,Android,ads,0.000000,0.166667,0.141042,0.133333,1.000000,0.250000,0.116242,28.0,5.0
163201,203,0,18,Russia,Moscow,iOS,ads,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,6.0
163202,202,1,17,Russia,Smolensk,Android,ads,0.000000,0.080000,0.000000,0.166667,0.000000,0.166667,0.000000,70.0,8.0
163203,201,0,37,Russia,Abakan,Android,ads,0.138777,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,26.0,4.0


In [82]:
# Overall like/view ratio per user; the raw like count is dropped afterwards.
# NOTE(review): for users whose views/likes were imputed, this is the quotient
# of the two rounded averages (6 / 46 in the recorded output), not the
# dataset-wide like ratio -- confirm this is the intended fallback.
user_data_feature_all = (
    user_data_feature_all
    .assign(**{'ratio of likes posts': lambda frame: frame['likes'] / frame['views']})
    .drop(columns=['likes'])
)

In [83]:
# Final all-users feature table: the listed categorical profile columns are removed.
user_feature_df = user_data_feature_all.drop(
    ['gender', 'country', 'os', 'source'],
    axis='columns',
)

In [84]:
user_feature_df

Unnamed: 0,user_id,age,city,business,covid,entertainment,movie,politics,sport,tech,views,ratio of likes posts
0,168552,16,Ivanteyevka,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
1,168551,38,Moscow,0.138777,0.250000,0.400000,0.125000,0.000000,0.000000,0.000000,30.0,0.166667
2,168550,41,Yekaterinburg,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
3,168549,18,Tula,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.000000,34.0,0.029412
4,168548,36,Kaliningrad,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
...,...,...,...,...,...,...,...,...,...,...,...,...
163200,204,36,Anzhero-Sudzhensk,0.000000,0.166667,0.141042,0.133333,1.000000,0.250000,0.116242,28.0,0.178571
163201,203,18,Moscow,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
163202,202,17,Smolensk,0.000000,0.080000,0.000000,0.166667,0.000000,0.166667,0.000000,70.0,0.114286
163203,201,37,Abakan,0.138777,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,26.0,0.153846


In [85]:
user_feature_df.user_id.unique().shape

(163205,)

In [86]:
user_feature_df.isna().sum()

user_id                 0
age                     0
city                    0
business                0
covid                   0
entertainment           0
movie                   0
politics                0
sport                   0
tech                    0
views                   0
ratio of likes posts    0
dtype: int64

Save `user_feature_df` to PostgreSQL

In [87]:
import os

from sqlalchemy import create_engine

# The connection string is read from the environment so that the database
# password is never committed with the notebook.  Expected form:
#   postgresql://<user>:<password>@<host>:<port>/<database>
# A missing variable raises KeyError before any connection is attempted.
engine = create_engine(os.environ["STARTML_DATABASE_URL"])

# Upload the all-users feature table, replacing any previous version of it.
user_feature_df.to_sql('v_patrakeev_all_users', con=engine, index=False, if_exists='replace')

205

### features by posts

Post popularity was precomputed with an SQL query and saved to the `post_popularity` file.

In [170]:
# Per-post like/view statistics (semicolon-delimited export).
post_popularity = pd.read_csv(filepath_or_buffer="post_popularity", sep=";")

In [171]:
post_popularity.sort_values(by='likes_share', ascending=False)

Unnamed: 0,post_id,likes,posts_views,likes_share
2106,2371,2284,13906,16
2743,3047,2194,13683,16
6311,6777,2759,17179,16
6614,7093,2758,16840,16
4332,4722,2738,16853,16
...,...,...,...,...
243,289,827,12338,6
913,1022,831,12154,6
909,1018,838,12074,6
906,1014,829,12055,6


In [172]:
# Character count of every post.  The vectorised string accessor replaces the
# per-row Python lambda; a missing text is treated as empty (length 0) instead
# of raising TypeError from len(NaN).
posts_text['text_length'] = posts_text['text'].fillna('').str.len()

In [173]:
# Post-level features: topic and text length plus popularity statistics.
# Left join keeps posts with no popularity row; the raw text is dropped.
post_feature = pd.merge(posts_text, post_popularity, how='left', on='post_id').drop(
    'text', axis='columns'
)

In [174]:
# Replace every NaN in the frame with 0.0 (e.g. posts with no popularity row).
post_feature = post_feature.fillna(value=0.0)

In [175]:
post_feature

Unnamed: 0,post_id,topic,text_length,likes,posts_views,likes_share
0,7319,movie,790,720.0,6585.0,10.0
1,7318,movie,728,680.0,6785.0,10.0
2,7317,movie,636,731.0,6803.0,10.0
3,7316,movie,800,677.0,6572.0,10.0
4,7315,movie,803,2619.0,16940.0,15.0
...,...,...,...,...,...,...
7018,5,business,889,1153.0,8583.0,13.0
7019,4,business,1026,1171.0,8158.0,14.0
7020,3,business,3408,1122.0,8412.0,13.0
7021,2,business,2701,637.0,7495.0,8.0


In [176]:
# Sanity check: the feature table should contain as many distinct posts as the source table.
'{} = {}'.format(
    len(posts_text['post_id'].unique()),
    len(post_feature['post_id'].unique()),
)

'7023 = 7023'

Save `post_feature` to PostgreSQL

In [177]:
import os

from sqlalchemy import create_engine

# The connection string is read from the environment so that the database
# password is never committed with the notebook.  Expected form:
#   postgresql://<user>:<password>@<host>:<port>/<database>
# A missing variable raises KeyError before any connection is attempted.
engine = create_engine(os.environ["STARTML_DATABASE_URL"])

# Write the post feature table, replacing any previous version of it.
post_feature.to_sql('v_patrakeev_all_posts', con=engine, index=False, if_exists='replace')

23


## We have created two dataframes that are stored in PostgreSQL, plus one used for training
* features for all users (uploaded to PostgreSQL) - user_feature_df
* features for all posts (uploaded to PostgreSQL) - post_feature
* features for the subset of users present in the feed sample (data for training) - user_data_feature

In [178]:
user_feature_df

Unnamed: 0,user_id,age,city,business,covid,entertainment,movie,politics,sport,tech,views,ratio_of_likes_posts
0,168552,16,Ivanteyevka,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
1,168551,38,Moscow,0.138777,0.250000,0.400000,0.125000,0.000000,0.000000,0.000000,30.0,0.166667
2,168550,41,Yekaterinburg,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
3,168549,18,Tula,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000,0.000000,34.0,0.029412
4,168548,36,Kaliningrad,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
...,...,...,...,...,...,...,...,...,...,...,...,...
163200,204,36,Anzhero-Sudzhensk,0.000000,0.166667,0.141042,0.133333,1.000000,0.250000,0.116242,28.0,0.178571
163201,203,18,Moscow,0.138777,0.138045,0.141042,0.139542,0.131974,0.142722,0.116242,46.0,0.130435
163202,202,17,Smolensk,0.000000,0.080000,0.000000,0.166667,0.000000,0.166667,0.000000,70.0,0.114286
163203,201,37,Abakan,0.138777,0.000000,0.000000,0.333333,0.000000,0.000000,0.000000,26.0,0.153846


In [179]:
post_feature

Unnamed: 0,post_id,topic,text_length,likes,posts_views,likes_share
0,7319,movie,790,720.0,6585.0,10.0
1,7318,movie,728,680.0,6785.0,10.0
2,7317,movie,636,731.0,6803.0,10.0
3,7316,movie,800,677.0,6572.0,10.0
4,7315,movie,803,2619.0,16940.0,15.0
...,...,...,...,...,...,...
7018,5,business,889,1153.0,8583.0,13.0
7019,4,business,1026,1171.0,8158.0,14.0
7020,3,business,3408,1122.0,8412.0,13.0
7021,2,business,2701,637.0,7495.0,8.0


In [180]:
user_data_feature

Unnamed: 0,user_id,gender,age,country,city,os,source,business,covid,entertainment,movie,politics,sport,tech,views,ratio of likes posts
0,168551,0,38,Russia,Moscow,iOS,organic,0.000000,0.250000,0.40,0.125000,0.000000,0.000000,0.00,30,0.166667
1,168549,0,18,Russia,Tula,Android,organic,0.000000,0.000000,0.00,0.100000,0.000000,0.000000,0.00,34,0.029412
2,168547,1,21,Russia,Magnitogorsk,Android,organic,0.000000,0.043478,0.00,0.090909,0.200000,0.000000,0.00,65,0.061538
3,168545,1,25,Russia,Berezniki,iOS,organic,0.000000,0.000000,0.25,0.083333,0.000000,0.333333,0.00,36,0.083333
4,168544,1,18,Ukraine,Odesa,iOS,organic,0.000000,0.100000,0.00,0.142857,0.000000,0.500000,0.00,27,0.111111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108048,205,0,32,Russia,Dugulubgey,Android,ads,0.333333,0.050000,0.00,0.200000,0.000000,0.000000,0.00,57,0.070175
108049,204,0,36,Russia,Anzhero-Sudzhensk,Android,ads,0.000000,0.166667,0.00,0.133333,1.000000,0.250000,0.00,28,0.178571
108050,202,1,17,Russia,Smolensk,Android,ads,0.000000,0.080000,0.00,0.166667,0.000000,0.166667,0.00,70,0.114286
108051,201,0,37,Russia,Abakan,Android,ads,0.000000,0.000000,0.00,0.333333,0.000000,0.000000,0.00,26,0.153846


Create the training dataset

In [181]:
action_users

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source
0,2021-12-29 23:51:06,147682,6676,view,0,0,24,Russia,Trubchevsk,1,Android,organic
1,2021-12-29 23:48:26,147682,5180,view,0,0,24,Russia,Trubchevsk,1,Android,organic
2,2021-12-29 23:45:27,147682,5967,view,0,0,24,Russia,Trubchevsk,1,Android,organic
3,2021-12-29 23:43:55,147682,1983,view,0,0,24,Russia,Trubchevsk,1,Android,organic
4,2021-12-29 23:43:08,147682,6699,view,0,0,24,Russia,Trubchevsk,1,Android,organic
...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,2021-12-23 14:36:49,8874,1953,view,0,0,16,Russia,Petrozavodsk,2,Android,ads
4999996,2021-12-23 14:41:23,53169,5665,view,0,1,17,Russia,Tyumen,1,Android,ads
4999997,2021-12-23 14:39:42,53169,6544,view,0,1,17,Russia,Tyumen,1,Android,ads
4999998,2021-12-23 14:38:43,53169,1499,view,0,1,17,Russia,Tyumen,1,Android,ads


In [164]:
# One row per (user, post): the maximum target and the maximum timestamp seen for that pair.
actions_target = (
    action_users
    .groupby(['user_id', 'post_id'], as_index=False)
    .agg(target=('target', 'max'), timestamp=('timestamp', 'max'))
)

In [182]:
actions_target.head()

Unnamed: 0,user_id,post_id,target,timestamp
0,200,37,0,2021-12-29 15:11:05
1,200,167,0,2021-12-29 15:01:08
2,200,213,0,2021-12-24 14:02:13
3,200,994,1,2021-12-29 15:18:42
4,200,1122,0,2021-12-29 15:23:54


In [183]:
# Attach post-level features to every (user, post) target row.
train_data = pd.merge(actions_target, post_feature, how='inner', on='post_id')

In [184]:
# Attach user-level features.
# NOTE(review): `user_data_feature` fills missing topic ratios with 0.0 and keeps
# gender/country/os/source, whereas the uploaded `user_feature_df` fills them
# with topic-level ratios and drops those columns -- confirm that the training
# and serving features are meant to differ.
train_data = pd.merge(train_data, user_data_feature, how='inner', on='user_id')

In [185]:
train_data

Unnamed: 0,user_id,post_id,target,timestamp,topic,text_length,likes,posts_views,likes_share,gender,...,source,business,covid,entertainment,movie,politics,sport,tech,views,ratio of likes posts
0,200,37,0,2021-12-29 15:11:05,business,3634,2810.0,22136.0,12.0,1,...,ads,0.0,0.1875,0.0,0.071429,0.166667,0.0,0.25,51,0.117647
1,200,167,0,2021-12-29 15:01:08,business,1940,1154.0,8310.0,13.0,1,...,ads,0.0,0.1875,0.0,0.071429,0.166667,0.0,0.25,51,0.117647
2,200,213,0,2021-12-24 14:02:13,business,2923,2885.0,22261.0,12.0,1,...,ads,0.0,0.1875,0.0,0.071429,0.166667,0.0,0.25,51,0.117647
3,200,994,1,2021-12-29 15:18:42,politics,2572,2829.0,22375.0,12.0,1,...,ads,0.0,0.1875,0.0,0.071429,0.166667,0.0,0.25,51,0.117647
4,200,1122,0,2021-12-29 15:23:54,politics,1995,2816.0,22532.0,12.0,1,...,ads,0.0,0.1875,0.0,0.071429,0.166667,0.0,0.25,51,0.117647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4984828,120800,3554,0,2021-12-23 14:39:04,covid,140,670.0,7071.0,9.0,1,...,organic,0.0,0.0000,0.0,0.000000,0.000000,0.0,0.00,3,0.000000
4984829,120800,2670,0,2021-12-23 14:42:03,covid,134,718.0,6822.0,10.0,1,...,organic,0.0,0.0000,0.0,0.000000,0.000000,0.0,0.00,3,0.000000
4984830,114000,2695,0,2021-12-23 14:37:53,covid,140,733.0,6772.0,10.0,1,...,organic,0.0,0.0000,0.0,0.000000,0.000000,0.0,0.00,3,0.000000
4984831,114000,6355,0,2021-12-23 14:42:03,movie,837,663.0,6752.0,9.0,1,...,organic,0.0,0.0000,0.0,0.000000,0.000000,0.0,0.00,3,0.000000


In [186]:
# Persist the training set as a semicolon-delimited file without the index column.
train_data.to_csv(path_or_buf="train_data", sep=";", index=False)