## Import necessary libraries

In [11]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import ast
import re

# Notebook-wide configuration.
# A single seed constant so every stochastic step can share the same value.
SEED = 42

# Seed NumPy's legacy global random generator.
np.random.seed(SEED)

# Silence every warning for the rest of the session (library deprecation noise included).
warnings.filterwarnings("ignore")

## Load data from the small matrix (the big matrix does not fit in RAM)

In [12]:
# Directory (with trailing slash) that contains the KuaiRec CSV files.
data_path = "data_final_project/KuaiRec 2.0/data/"


def _read_clean_csv(file_name, **read_csv_kwargs):
    """Load ``data_path + file_name`` and return it without duplicates or incomplete rows.

    Extra keyword arguments are forwarded to ``pd.read_csv``. Exact duplicate
    rows are removed first, then every row holding at least one missing value.
    """
    frame = pd.read_csv(data_path + file_name, **read_csv_kwargs)
    deduplicated = frame.drop_duplicates()
    return deduplicated.dropna()


small_matrix = _read_clean_csv("small_matrix.csv")
user_features = _read_clean_csv("user_features.csv")
item_categories = _read_clean_csv("item_categories.csv")
# This file is read with an explicit '\n' line terminator.
caption_category = _read_clean_csv("kuairec_caption_category.csv", lineterminator='\n')

## Map the negative category id (-124) to the unused id 30

In [13]:
# Distinct first-level category ids in ascending order; the bare name on the
# last line displays the array as the cell output.
categories = np.sort(caption_category["first_level_category_id"].unique())
categories

array([-124,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   31,   32,   33,
         34,   35,   36,   37,   38,   39])

In [14]:
# Keep non-negative category ids as they are and replace every other id with 30.
first_level_ids = caption_category["first_level_category_id"]
caption_category["first_level_category_id"] = first_level_ids.where(first_level_ids >= 0, 30)

## Data cleaning

### small_matrix / big_matrix
- time is kept for train / validation / test splitting
- watch_ratio
    - remove unrealistic data (watch_ratio above 5, i.e. more than 5 rewatches)
    - split the interactions into 3 categories:
        - negative (watch_ratio of 0.65 or less)
        - neutral (watch_ratio above 0.65 and up to 1.2)
        - like (watch_ratio above 1.2)
- video duration
    - remove irrelevant data (videos longer than 160 seconds, i.e. about 2.7 minutes)
    - split the interactions into 3 categories:
        - short video (7 seconds or less)
        - medium video (more than 7 and up to 16 seconds)
        - long video (longer than 16 seconds)

All user_id / video_id which are not present in the "user_features" / "caption_category" dataframe are removed since they lack information.

In [15]:
# Work on a copy restricted to the columns this notebook needs.
interactions = small_matrix.loc[:, ["user_id", "video_id", "watch_ratio", "time", "video_duration"]]

# Discard implausible rows: videos longer than 160000 and watch ratios above 5.
is_plausible = (interactions["video_duration"] <= 160000) & (interactions["watch_ratio"] <= 5)
interactions = interactions.loc[is_plausible]

watch_ratio = interactions["watch_ratio"]
duration = interactions["video_duration"]

# Bucket both signals into -1 / 0 / 1. The first matching condition wins, so
# the second condition only applies to values above the first threshold.
#   like:         -1 if watch_ratio <= 0.65,  0 if <= 1.2,    1 otherwise
#   video_length: -1 if duration   <= 7000,   0 if <= 16000,  1 otherwise
# The raw duration is dropped afterwards; watch_ratio is kept next to its bucket.
small_matrix_cleaned = interactions.assign(
    like=np.select([watch_ratio <= 0.65, watch_ratio <= 1.2], [-1, 0], default=1),
    video_length=np.select([duration <= 7000, duration <= 16000], [-1, 0], default=1),
).drop(columns=["video_duration"])

# Keep only interactions whose user has an entry in user_features ...
known_user_ids = user_features["user_id"].unique()
has_known_user = small_matrix_cleaned["user_id"].isin(known_user_ids)
small_matrix_cleaned = small_matrix_cleaned[has_known_user]

# ... and whose video has an entry in caption_category.
known_video_ids = caption_category["video_id"].unique()
has_known_video = small_matrix_cleaned["video_id"].isin(known_video_ids)
small_matrix_cleaned = small_matrix_cleaned[has_known_video]

# Preview the result
small_matrix_cleaned.head()

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,14,148,0.722103,2020-07-05 05:27:48.378,0,-1
1,14,183,1.907377,2020-07-05 05:28:00.057,1,-1
2,14,3649,2.063311,2020-07-05 05:29:09.479,1,0
3,14,5262,0.566388,2020-07-05 05:30:43.285,-1,0
4,14,8234,0.418364,2020-07-05 05:35:43.459,-1,0


### Item_categories
- Explode video tags and create a one hot encoded vector containing the tags of each video

All video_id which are not present in the "caption_category" dataframe are removed since they lack information.

In [16]:
# Build one row per video with one indicator column per tag id (tag_0 .. tag_30).

# "feat" holds the text of a Python list; parse it into a real list.
# Work on a copy so the raw `item_categories` frame is left untouched.
item_categories_cleaned = item_categories.copy()
item_categories_cleaned["feat"] = item_categories_cleaned["feat"].apply(ast.literal_eval)

# One row per (video_id, tag) pair
item_categories_cleaned = item_categories_cleaned.explode("feat")

# Count how often each tag appears for each video. The counts are 0/1 as long
# as a video never lists the same tag twice.
item_categories_cleaned = pd.crosstab(item_categories_cleaned["video_id"], item_categories_cleaned["feat"])

# Name the columns after the tag id they actually hold. Renaming by position
# (as a bare `range(31)` would) fails with a length mismatch when a tag id is
# absent from the data, and can silently shift labels when the set of ids
# differs from 0..30. Validate the ids, then reindex to the full range so the
# schema is always tag_0 .. tag_30 and absent tags become all-zero columns.
expected_tag_ids = list(range(31))
unexpected_tag_ids = [tag for tag in item_categories_cleaned.columns if tag not in expected_tag_ids]
if unexpected_tag_ids:
    raise ValueError(f"Unexpected tag ids in item_categories: {unexpected_tag_ids}")
item_categories_cleaned = item_categories_cleaned.reindex(columns=expected_tag_ids, fill_value=0)
item_categories_cleaned.columns = [f"tag_{i}" for i in expected_tag_ids]

# Turn the video_id index back into a regular column
item_categories_cleaned = item_categories_cleaned.reset_index()

# Keep only videos that have an entry in caption_category
item_categories_cleaned = item_categories_cleaned[item_categories_cleaned["video_id"].isin(known_video_ids)]

# Preview the result
item_categories_cleaned.head()

Unnamed: 0,video_id,tag_0,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,...,tag_21,tag_22,tag_23,tag_24,tag_25,tag_26,tag_27,tag_28,tag_29,tag_30
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### User_features
- Keep all encoded features since all decoded features seem useless in our case
- Scale all encoded features

In [17]:
# The 18 encoded user features, onehot_feat0 .. onehot_feat17
encoded_feature_columns = [f'onehot_feat{i}' for i in range(18)]

# Keep the user id plus the encoded features, on an independent copy.
user_features_cleaned = user_features[["user_id"] + encoded_feature_columns].copy()

# Rescale every encoded feature to [-1, 1]. MinMaxScaler fits each column on
# its own minimum and maximum, so one fit over all columns yields the same
# values as fitting a separate scaler per column.
scaler = MinMaxScaler((-1, 1))
user_features_cleaned[encoded_feature_columns] = scaler.fit_transform(
    user_features_cleaned[encoded_feature_columns]
)

# Preview the result
user_features_cleaned.head()

Unnamed: 0,user_id,onehot_feat0,onehot_feat1,onehot_feat2,onehot_feat3,onehot_feat4,onehot_feat5,onehot_feat6,onehot_feat7,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,-1.0,-0.666667,0.172414,0.19141,-0.636364,-1.0,0.0,-0.73913,0.085546,1.0,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,-1.0,0.0,0.724138,0.906629,-1.0,-1.0,0.0,-0.73913,0.097345,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,-1.0,1.0,-0.448276,-0.2493,-1.0,-1.0,-1.0,-0.913043,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,-1.0,-0.666667,-0.448276,-0.475257,-1.0,-1.0,-1.0,0.478261,0.480826,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,-1.0,-0.666667,-0.448276,-0.409897,-0.818182,-1.0,0.0,1.0,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Caption_category
- One-hot encode first_level_category_id: each video gets a vector with one column per first level category id

In [18]:
# Build one row per video with one indicator column per first-level category.

# Keep only the video id and its first-level category, on an independent copy.
caption_category_cleaned = caption_category.copy()
caption_category_cleaned = caption_category_cleaned[["video_id", "first_level_category_id"]]

# Count, for every video, how many rows carry each category id
caption_category_cleaned = pd.crosstab(caption_category_cleaned["video_id"], caption_category_cleaned["first_level_category_id"])

# After the negative id has been remapped to 30, the category ids are 1..39.
# Renaming by position with a bare `range(39)` fails with a length mismatch
# when an id is absent from the data, and silently mislabels columns when the
# set of ids differs. Validate the ids and reindex to the full 1..39 range so
# every column keeps a fixed meaning and absent categories become all-zero.
expected_category_ids = list(range(1, 40))
unexpected_category_ids = [cid for cid in caption_category_cleaned.columns if cid not in expected_category_ids]
if unexpected_category_ids:
    raise ValueError(f"Unexpected first_level_category_id values: {unexpected_category_ids}")
caption_category_cleaned = caption_category_cleaned.reindex(columns=expected_category_ids, fill_value=0)

# The existing column names category_0 .. category_38 are kept unchanged.
# They are positional: category_k holds category id k + 1.
caption_category_cleaned.columns = [f"category_{i}" for i in range(len(expected_category_ids))]

# Turn the video_id index back into a regular column
caption_category_cleaned = caption_category_cleaned.reset_index()

# Preview the result
caption_category_cleaned.head()

Unnamed: 0,video_id,category_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,...,category_29,category_30,category_31,category_32,category_33,category_34,category_35,category_36,category_37,category_38
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save Data

In [19]:
# Persist the prepared frames as parquet files under the relative "data/" directory.
# Cleaned interaction matrix (user_id, video_id, watch_ratio, time, like, video_length).
small_matrix_cleaned.to_parquet("data/small_matrix_cleaned.parquet")
# NOTE(review): this writes the `user_features` frame as loaded from the CSV, not
# the rescaled `user_features_cleaned` built earlier. The file name carries no
# "_cleaned" suffix, so this may be intentional — confirm.
# NOTE(review): `item_categories_cleaned` and `caption_category_cleaned` are not
# written by the lines visible here — confirm they are saved elsewhere if needed.
user_features.to_parquet("data/user_features.parquet")