## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast
from pathlib import Path

## Prepare environment

In [2]:
# Seed NumPy's global random generator so random operations are reproducible
SEED = 42
np.random.seed(seed=SEED)

# Folder holding the KuaiRec CSV files that are read below
KUAIREC_PATH = "data_final_project/KuaiRec 2.0/data/"

# Folder receiving the cleaned outputs; created together with its parents when missing
CLEANED_DATA_PATH = "data/cleaned/"
Path(CLEANED_DATA_PATH).mkdir(exist_ok=True, parents=True)

## Load data

In [3]:
def _read_deduplicated_csv(filename, **read_csv_kwargs):
    """Read one CSV from KUAIREC_PATH, then drop duplicated rows and rows containing NaN.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    raw_frame = pd.read_csv(KUAIREC_PATH + filename, **read_csv_kwargs)
    return raw_frame.drop_duplicates().dropna()


# Interaction matrices
small_matrix = _read_deduplicated_csv("small_matrix.csv")
big_matrix = _read_deduplicated_csv("big_matrix.csv")

# User side information
user_features = _read_deduplicated_csv("user_features.csv")

# Item side information
item_categories = _read_deduplicated_csv("item_categories.csv")
item_daily_features = _read_deduplicated_csv("item_daily_features.csv")

# Social graph
social_network = _read_deduplicated_csv("social_network.csv")

# This file is read with an explicit '\n' line terminator
caption_category = _read_deduplicated_csv("kuairec_caption_category.csv", lineterminator='\n')

### Check Loaded Dataframes

In [4]:
# Display the loaded small_matrix dataframe
small_matrix

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4676370,7162,9177,5315,37205,2020-09-01 20:06:35.984,20200901.0,1.598962e+09,0.142857
4676371,7162,4987,10085,8167,2020-09-02 14:44:51.342,20200902.0,1.599029e+09,1.234848
4676372,7162,7988,50523,49319,2020-09-03 08:45:01.474,20200903.0,1.599094e+09,1.024412
4676373,7162,6533,2190,8000,2020-09-04 22:56:32.021,20200904.0,1.599231e+09,0.273750


In [5]:
# Display the loaded user_features dataframe
user_features

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,high_active,0,0,0,5,"(0,10]",0,0,0,...,184,6,3,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,full_active,0,0,0,386,"(250,500]",4,"[1,10)",2,...,186,6,2,0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,full_active,0,0,0,27,"(10,50]",0,0,0,...,51,2,3,0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,full_active,0,0,0,16,"(10,50]",0,0,0,...,251,3,2,0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,full_active,0,0,0,122,"(100,150]",4,"[1,10)",0,...,99,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,full_active,0,0,1,52,"(50,100]",1,"[1,10)",0,...,259,1,4,0,1.0,0.0,0.0,0.0,0.0,0.0
7172,7172,full_active,0,0,0,45,"(10,50]",2,"[1,10)",2,...,11,2,0,0,1.0,0.0,0.0,0.0,0.0,0.0
7173,7173,full_active,0,0,0,615,500+,3,"[1,10)",2,...,51,2,2,0,1.0,0.0,0.0,0.0,0.0,0.0
7174,7174,full_active,0,0,0,959,500+,0,0,0,...,107,3,2,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Display the loaded item_categories dataframe
item_categories

Unnamed: 0,video_id,feat
0,0,[8]
1,1,"[27, 9]"
2,2,[9]
3,3,[26]
4,4,[5]
...,...,...
10723,10723,[11]
10724,10724,[2]
10725,10725,[15]
10726,10726,[19]


In [7]:
# Display the loaded item_daily_features dataframe
item_daily_features

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,...,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
23,0,20200728,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,3,3,2.0,2.0,0.0,0.0
24,0,20200729,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,5,5,0,0,5,5,7.0,6.0,4.0,4.0
25,0,20200730,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,1,1,6.0,6.0,3.0,3.0
26,0,20200731,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,5,5,3.0,3.0,2.0,2.0
27,0,20200801,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,3,3,7.0,7.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343336,10723,20200905,236,NORMAL,2020-09-05,ShortImport,public,4833.0,720,1280,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
343337,10724,20200905,5271,NORMAL,2020-09-05,LongImport,public,54720.0,720,1280,...,1,1,0,0,0,0,0.0,0.0,0.0,0.0
343338,10725,20200905,1924,NORMAL,2020-09-05,ShortImport,public,15800.0,576,1024,...,5,5,0,0,4,4,0.0,0.0,0.0,0.0
343339,10726,20200905,7604,NORMAL,2020-09-05,ShortImport,public,5132.0,528,960,...,2,2,0,0,1,1,0.0,0.0,0.0,0.0


In [8]:
# Display the loaded social_network dataframe
social_network

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]
...,...,...
467,2331,[4345]
468,6163,[1332]
469,3732,[670]
470,3335,[202]


In [9]:
# Display the loaded caption_category dataframe
caption_category

Unnamed: 0,video_id,manual_cover_text,caption,topic_tag,first_level_category_id,first_level_category_name,second_level_category_id,second_level_category_name,third_level_category_id,third_level_category_name
0,0,UNKNOWN,精神小伙路难走 程哥你狗粮慢点撒,[],8,颜值,673,颜值随拍,-124,UNKNOWN
2,2,UNKNOWN,晚饭后，运动一下！,[],9,喜剧,727,搞笑互动,-124,UNKNOWN
3,3,UNKNOWN,我平淡无奇，惊艳不了时光，温柔不了岁月，我只想漫无目的的走走，努力发笔小财，给自己买花 自己长大.,[],26,摄影,686,主题摄影,2434,景物摄影
4,4,五爱街最美美女 一天1q,#搞笑 #感谢快手我要上热门 #五爱市场 这真是完美搭配啊！,"[五爱市场,感谢快手我要上热门,搞笑]",5,时尚,737,营销售卖,2596,女装
5,5,UNKNOWN,“你们吵的越狠 他们的手就握的越紧” #文轩 #刘耀文 #宋亚轩 #顾子璇...,"[刘耀文,宋亚轩,文轩,顾子璇是樱桃吖,顾子璇超级喜欢文轩]",6,明星娱乐,667,娱乐八卦,2375,饭制
...,...,...,...,...,...,...,...,...,...,...
10722,10722,UNKNOWN,#2020新款 #民族复古风 #原创视频 #作品推广 #感谢快手官方大大送上热门,"[2020新款,作品推广,原创视频,感谢快手官方大大送上热门,民族复古风]",5,时尚,737,营销售卖,-124,UNKNOWN
10723,10723,UNKNOWN,昨天爱你，今天爱你，明天也爱你，丫头，别担心，我以后都会爱你，我的小傻瓜@公主没烦恼 、(O...,[],33,自拍,-124,UNKNOWN,-124,UNKNOWN
10724,10724,UNKNOWN,#感谢推广小助手 #感谢快手绿色平台 #,"[感谢快手绿色平台,感谢推广小助手]",6,明星娱乐,-124,UNKNOWN,-124,UNKNOWN
10726,10726,老人言,老人言，喜欢留个关注加红心 #老人言 @今天拍点啥(O840386039) @快手活动中...,[老人言],38,读书,696,文学赏析,2477,民间俗语


## Replace the negative first-level category id (-124) with 30, the only id missing from the 1–39 range

In [10]:
# Sorted array of the distinct first-level category ids
categories = np.sort(caption_category["first_level_category_id"].unique())
categories

array([-124,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   31,   32,   33,
         34,   35,   36,   37,   38,   39])

In [11]:
# Keep ids that are >= 0 unchanged; every other value is replaced by 30
first_level_ids = caption_category["first_level_category_id"]
caption_category["first_level_category_id"] = first_level_ids.where(first_level_ids >= 0, 30)

## Data cleaning

### Remove ads and private videos

In [12]:
# Keep only the rows whose video_type is "NORMAL" and whose visible_status is "public"
is_normal_video = item_daily_features["video_type"] == "NORMAL"
is_public_video = item_daily_features["visible_status"] == "public"
item_daily_features_cleaned = item_daily_features[is_normal_video & is_public_video]
item_daily_features_cleaned

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,...,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
23,0,20200728,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,3,3,2.0,2.0,0.0,0.0
24,0,20200729,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,5,5,0,0,5,5,7.0,6.0,4.0,4.0
25,0,20200730,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,1,1,6.0,6.0,3.0,3.0
26,0,20200731,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,5,5,3.0,3.0,2.0,2.0
27,0,20200801,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,3,3,7.0,7.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343336,10723,20200905,236,NORMAL,2020-09-05,ShortImport,public,4833.0,720,1280,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
343337,10724,20200905,5271,NORMAL,2020-09-05,LongImport,public,54720.0,720,1280,...,1,1,0,0,0,0,0.0,0.0,0.0,0.0
343338,10725,20200905,1924,NORMAL,2020-09-05,ShortImport,public,15800.0,576,1024,...,5,5,0,0,4,4,0.0,0.0,0.0,0.0
343339,10726,20200905,7604,NORMAL,2020-09-05,ShortImport,public,5132.0,528,960,...,2,2,0,0,1,1,0.0,0.0,0.0,0.0


### Collect the user and video ids that are known across all dataframes, to avoid NaN

In [13]:
# Users described in user_features
known_user_ids = set(user_features["user_id"].unique())

# Videos present in both caption_category and the filtered item_daily_features
captioned_video_ids = set(caption_category["video_id"].unique())
daily_feature_video_ids = set(item_daily_features_cleaned["video_id"].unique())
known_video_ids = captioned_video_ids.intersection(daily_feature_video_ids)

### small / big matrix
- time is kept for calculating features on prior interactions
- watch_ratio 
    - clean unrealistic data (watch_ratio above 5, i.e. more than 5 rewatches)
    - split the interactions into 3 categories:
        - negative (less than 0.65)
        - neutral (between 0.65 and 1.2)
        - like (more than 1.2)
- video duration
    - remove irrelevant data (videos longer than 160 seconds, about 2.7 minutes)
    - split the interactions into 3 categories:
        - short video (less than 7 seconds)
        - medium video (between 7 and 16 seconds)
        - long video (longer than 16 seconds)

All user_id values missing from the "user_features" dataframe are removed, as are all video_id values missing from either the "caption_category" dataframe or the cleaned "item_daily_features" dataframe, since they lack information.

In [14]:
def _three_way_bucket(values: pd.Series, low: float, high: float) -> np.ndarray:
    """Return -1 where ``values <= low``, 0 where ``low < values <= high``, and 1 otherwise."""
    return np.where(values <= low, -1, np.where(values <= high, 0, 1))


def clean_matrix(
    matrix: pd.DataFrame,
    known_user_ids: set,
    known_video_ids: set,
    *,
    max_video_duration: float = 160000,
    max_watch_ratio: float = 5,
    watch_ratio_bounds: tuple = (0.65, 1.2),
    video_duration_bounds: tuple = (7000, 16000),
) -> pd.DataFrame:
    """Clean an interaction matrix and derive categorical ``like`` / ``video_length`` columns.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    matrix : pd.DataFrame
        Must contain the columns ``user_id``, ``video_id``, ``watch_ratio``,
        ``time`` and ``video_duration``. Any other column is ignored.
    known_user_ids : set
        Rows whose ``user_id`` is not in this collection are dropped.
    known_video_ids : set
        Rows whose ``video_id`` is not in this collection are dropped.
    max_video_duration : float, default 160000
        Rows with a larger ``video_duration`` are dropped.
    max_watch_ratio : float, default 5
        Rows with a larger ``watch_ratio`` are dropped.
    watch_ratio_bounds : tuple, default (0.65, 1.2)
        ``(low, high)`` bounds for ``like``: -1 if ``watch_ratio <= low``,
        0 if ``low < watch_ratio <= high``, 1 otherwise.
    video_duration_bounds : tuple, default (7000, 16000)
        ``(low, high)`` bounds for ``video_length``, applied to
        ``video_duration`` with the same rule as ``watch_ratio_bounds``.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id``, ``video_id``, ``watch_ratio``, ``time`` (parsed to
        datetime), ``like`` and ``video_length``, with duplicated rows removed.
        The original index labels of the surviving rows are kept.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``matrix``.
    """
    # Work on a copy restricted to the relevant columns so the caller's frame is untouched
    matrix_cleaned = matrix[["user_id", "video_id", "watch_ratio", "time", "video_duration"]].copy()

    # Fix time format
    matrix_cleaned["time"] = pd.to_datetime(matrix_cleaned["time"])

    # Remove irrelevant / unrealistic data. The explicit copy makes the result an
    # independent frame, so the column assignments below do not trigger
    # pandas' SettingWithCopyWarning.
    is_realistic = (matrix_cleaned["video_duration"] <= max_video_duration) & (
        matrix_cleaned["watch_ratio"] <= max_watch_ratio
    )
    matrix_cleaned = matrix_cleaned[is_realistic].copy()

    # Categorize watch_ratio and video_duration into -1 / 0 / 1
    matrix_cleaned["like"] = _three_way_bucket(matrix_cleaned["watch_ratio"], *watch_ratio_bounds)
    matrix_cleaned["video_length"] = _three_way_bucket(
        matrix_cleaned["video_duration"], *video_duration_bounds
    )

    # video_duration is only needed to derive video_length; watch_ratio is kept
    matrix_cleaned = matrix_cleaned.drop(columns=["video_duration"])

    # Remove users and videos which lack information
    has_known_ids = matrix_cleaned["user_id"].isin(known_user_ids) & matrix_cleaned["video_id"].isin(
        known_video_ids
    )

    return matrix_cleaned[has_known_ids].drop_duplicates()

In [15]:
# Run the cleaning routine on the small matrix and display the result
small_matrix_cleaned = clean_matrix(
    matrix=small_matrix,
    known_user_ids=known_user_ids,
    known_video_ids=known_video_ids,
)
small_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,14,148,0.722103,2020-07-05 05:27:48.378,0,-1
1,14,183,1.907377,2020-07-05 05:28:00.057,1,-1
2,14,3649,2.063311,2020-07-05 05:29:09.479,1,0
3,14,5262,0.566388,2020-07-05 05:30:43.285,-1,0
4,14,8234,0.418364,2020-07-05 05:35:43.459,-1,0
...,...,...,...,...,...,...
4676370,7162,9177,0.142857,2020-09-01 20:06:35.984,-1,1
4676371,7162,4987,1.234848,2020-09-02 14:44:51.342,1,0
4676372,7162,7988,1.024412,2020-09-03 08:45:01.474,0,1
4676373,7162,6533,0.273750,2020-09-04 22:56:32.021,-1,0


In [16]:
# Run the cleaning routine on the big matrix and display the result
big_matrix_cleaned = clean_matrix(
    matrix=big_matrix,
    known_user_ids=known_user_ids,
    known_video_ids=known_video_ids,
)
big_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,0,3649,1.273397,2020-07-05 00:08:23.438,1,0
2,0,5262,0.107613,2020-07-05 00:16:06.687,-1,0
3,0,1963,0.089885,2020-07-05 00:20:26.792,-1,0
4,0,8234,0.078000,2020-07-05 00:43:05.128,-1,0
5,0,8228,1.572295,2020-07-05 01:00:25.500,1,0
...,...,...,...,...,...,...
12530799,7175,6597,1.004462,2020-09-05 06:35:01.104,0,0
12530800,7175,6630,0.313389,2020-09-05 15:00:33.379,-1,0
12530803,7175,10360,0.340597,2020-09-05 19:10:29.041,-1,0
12530804,7175,10360,0.913400,2020-09-05 19:10:36.995,0,0


### Item_categories
- Explode video tags so that each row holds a single (video_id, tag) pair, and add the mean watch_ratio of each video; the one hot encoding of the tags is currently commented out

All video_id values that are missing from the "caption_category" dataframe or from the cleaned "item_daily_features" dataframe are removed since they lack information.

In [17]:
# Mean watch_ratio of each video, computed on the cleaned big matrix
video_watch_ratio_mean = big_matrix_cleaned.groupby("video_id")["watch_ratio"].mean()

item_categories_cleaned = (
    item_categories
    # Parse the string representation of each tag list into a real Python list
    .assign(feat=item_categories["feat"].apply(ast.literal_eval))
    # One row per (video_id, tag) pair
    .explode("feat")
    # Attach the per-video mean; videos absent from the cleaned big matrix get NaN
    .assign(video_watch_ratio_mean=lambda exploded: exploded["video_id"].map(video_watch_ratio_mean))
    # Remove videos which lack information
    .loc[lambda exploded: exploded["video_id"].isin(known_video_ids)]
)

# Check dataframe
item_categories_cleaned

Unnamed: 0,video_id,feat,video_watch_ratio_mean
0,0,8,1.344039
2,2,9,1.163905
4,4,5,0.617457
5,5,6,0.862989
6,6,19,1.483494
...,...,...,...
10722,10722,5,1.167776
10723,10723,11,1.411720
10724,10724,2,1.450592
10726,10726,19,1.810170


### Social network

In [18]:
# Build a new frame whose friend_list strings are parsed into Python lists.
# No filtering on known_user_ids is applied to this frame.
parsed_friend_lists = social_network["friend_list"].apply(ast.literal_eval)
social_network_cleaned = social_network.assign(friend_list=parsed_friend_lists)

# Check dataframe
social_network_cleaned.head()

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]


### User_features
- Keep all encoded features since all decoded features seem useless in our case
- Scale all encoded features

In [19]:
# Work on a copy so the loaded user_features frame stays untouched
user_features_cleaned = user_features.copy()

# Min-max scale onehot_feat0 .. onehot_feat17 to [-1, 1].
# MinMaxScaler scales each column independently, so one call over all the
# columns gives the same values as fitting a separate scaler per column.
onehot_columns = [f'onehot_feat{i}' for i in range(18)]
user_features_cleaned[onehot_columns] = MinMaxScaler(feature_range=(-1, 1)).fit_transform(
    user_features_cleaned[onehot_columns]
)

# Replace the listed columns by integer codes to be easier to work with
columns_to_factorize = (
    'user_active_degree',
    'follow_user_num_range',
    'fans_user_num_range',
    'friend_user_num_range',
    'register_days_range',
)
for column in columns_to_factorize:
    user_features_cleaned[column] = pd.factorize(user_features_cleaned[column])[0]

# Check dataframe
user_features_cleaned

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,0,0,0,0,5,0,0,0,0,...,0.085546,1.000000,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1,0,0,0,386,1,4,1,2,...,0.097345,1.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,1,0,0,0,27,2,0,0,0,...,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,1,0,0,0,16,2,0,0,0,...,0.480826,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,1,0,0,0,122,3,4,1,0,...,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,1,0,0,1,52,5,1,1,0,...,0.528024,-0.666667,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7172,7172,1,0,0,0,45,2,2,1,2,...,-0.935103,-0.333333,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7173,7173,1,0,0,0,615,4,3,1,2,...,-0.699115,-0.333333,0.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7174,7174,1,0,0,0,959,4,0,0,0,...,-0.368732,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Caption_category
- Keep only video_id and first_level_category_id for each video; the one hot encoding of the first level category id is currently commented out

In [20]:
# Independent copy restricted to the video id and its first level category id
kept_columns = ["video_id", "first_level_category_id"]
caption_category_cleaned = caption_category.loc[:, kept_columns].copy()

# Check dataframe
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


### Item daily features
- Replace non-positive video durations with NaN (the rows themselves are kept)

In [21]:
# Blank out non-positive durations: rows failing the `> 0` test get NaN in
# "video_duration" while the rows themselves are kept.
# `assign` builds a new frame instead of writing into the previously filtered
# slice, which avoids pandas' SettingWithCopyWarning and makes the masking
# explicit rather than relying on index alignment.
# NOTE(review): confirm whether these rows should be dropped instead of masked.
item_daily_features_cleaned = item_daily_features_cleaned.assign(
    video_duration=item_daily_features_cleaned["video_duration"].where(
        item_daily_features_cleaned["video_duration"] > 0
    )
)
item_daily_features_cleaned

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,...,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
23,0,20200728,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,3,3,2.0,2.0,0.0,0.0
24,0,20200729,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,5,5,0,0,5,5,7.0,6.0,4.0,4.0
25,0,20200730,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,1,1,6.0,6.0,3.0,3.0
26,0,20200731,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,5,5,3.0,3.0,2.0,2.0
27,0,20200801,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,3,3,7.0,7.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343336,10723,20200905,236,NORMAL,2020-09-05,ShortImport,public,4833.0,720,1280,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
343337,10724,20200905,5271,NORMAL,2020-09-05,LongImport,public,54720.0,720,1280,...,1,1,0,0,0,0,0.0,0.0,0.0,0.0
343338,10725,20200905,1924,NORMAL,2020-09-05,ShortImport,public,15800.0,576,1024,...,5,5,0,0,4,4,0.0,0.0,0.0,0.0
343339,10726,20200905,7604,NORMAL,2020-09-05,ShortImport,public,5132.0,528,960,...,2,2,0,0,1,1,0.0,0.0,0.0,0.0


## Save Data

In [22]:
# Output file name -> cleaned dataframe, written in this order
cleaned_frames = {
    "small_matrix_cleaned.parquet": small_matrix_cleaned,
    "big_matrix_cleaned.parquet": big_matrix_cleaned,
    "user_features_cleaned.parquet": user_features_cleaned,
    "item_categories_cleaned.parquet": item_categories_cleaned,
    "item_daily_features_cleaned.parquet": item_daily_features_cleaned,
    "social_network_cleaned.parquet": social_network_cleaned,
    "caption_category_cleaned.parquet": caption_category_cleaned,
}

# Write every frame as a parquet file under CLEANED_DATA_PATH
for parquet_name, cleaned_frame in cleaned_frames.items():
    cleaned_frame.to_parquet(CLEANED_DATA_PATH + parquet_name)