In [5]:
# Mount Google Drive (Colab-only API) at /content/drive; the CSV inputs and the
# parquet outputs used by this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install polars



In [3]:
from collections import defaultdict, Counter
from pathlib import Path
from typing import List, Dict

import pandas as pd
import polars as pl
from tqdm import tqdm

In [7]:
# Read each raw session CSV with pandas and hand the result straight to polars,
# which is the frame type every later cell operates on.
train = pl.from_pandas(
    pd.read_csv("/content/drive/MyDrive/kddcup2023/data/raw/sessions_train.csv")
)
test1 = pl.from_pandas(
    pd.read_csv("/content/drive/MyDrive/kddcup2023/data/raw/sessions_test_task1_phase1.csv")
)
test2 = pl.from_pandas(
    pd.read_csv("/content/drive/MyDrive/kddcup2023/data/raw/sessions_test_task1.csv")
)

In [8]:
# Keep only rows whose locale is one of the three listed here; rows from any
# other locale are dropped from all three splits.
LOCALES = ["DE", "UK", "JP"]
keep_locale = pl.col("locale").is_in(LOCALES)
train, test1, test2 = (frame.filter(keep_locale) for frame in (train, test1, test2))

In [9]:
%%time
# Character mapping used by str2list: square brackets and single quotes are
# deleted, line breaks become spaces so they behave like ordinary separators.
_PREV_ITEMS_TABLE = str.maketrans({"[": None, "]": None, "'": None, "\n": " ", "\r": " "})


def str2list(s):
    """Turn the text form of an item list into a list of item-id strings.

    Brackets and single quotes are stripped, newlines / carriage returns are
    treated as spaces, and the remainder is split on runs of whitespace.

    Example: "['A' 'B']" -> ["A", "B"]; "[]" -> [].

    Args:
        s: String holding whitespace-separated, optionally quoted item ids.

    Returns:
        List of the item-id tokens in their original order (empty list when
        no tokens remain).
    """
    return s.translate(_PREV_ITEMS_TABLE).split()

# Replace the raw "prev_items" text in every split with the list produced by
# str2list. The expression is built once and reused for all three frames.
# NOTE(review): Expr.apply was renamed in later polars releases — confirm the
# installed version still provides it.
prev_items_as_list = pl.col("prev_items").apply(str2list).alias("prev_items")
train = train.with_columns(prev_items_as_list)
test1 = test1.with_columns(prev_items_as_list)
test2 = test2.with_columns(prev_items_as_list)



CPU times: user 52.1 s, sys: 5.67 s, total: 57.8 s
Wall time: 59.1 s


In [10]:
# Attach a synthetic "session_id" to every row: a split-specific prefix
# followed by the row's position within that split.
train_ids = [f"train_{row}" for row in range(len(train))]
test1_ids = [f"test_phase1_{row}" for row in range(len(test1))]
test2_ids = [f"test_phase2_{row}" for row in range(len(test2))]

train = train.with_columns(pl.Series(name="session_id", values=train_ids))
test1 = test1.with_columns(pl.Series(name="session_id", values=test1_ids))
test2 = test2.with_columns(pl.Series(name="session_id", values=test2_ids))

In [11]:
# Preview the first rows of the training frame (parsed prev_items + session_id).
train.head()

prev_items,next_item,locale,session_id
list[str],str,str,str
"[""B09W9FND7K"", ""B09JSPLN1M""]","""B09M7GY217""","""DE""","""train_0"""
"[""B076THCGSG"", ""B007MO8IME"", … ""B001B4TKA0""]","""B001B4THSA""","""DE""","""train_1"""
"[""B0B1LGXWDS"", ""B00AZYORS2"", … ""B00AZYORS2""]","""B0767DTG2Q""","""DE""","""train_2"""
"[""B09XMTWDVT"", ""B0B4MZZ8MB"", … ""B0B71CHT1L""]","""B0B4R9NN4B""","""DE""","""train_3"""
"[""B09Y5CSL3T"", ""B09Y5DPTXN"", ""B09FKD61R8""]","""B0BGVBKWGZ""","""DE""","""train_4"""


In [12]:
# Summary of the training frame: row count, null count and min/max per column.
train.describe()

describe,prev_items,next_item,locale,session_id
str,str,str,str,str
"""count""","""3272716""","""3272716""","""3272716""","""3272716"""
"""null_count""","""0""","""0""","""0""","""0"""
"""mean""",,,,
"""std""",,,,
"""min""",,"""000647988X""","""DE""","""train_0"""
"""25%""",,,,
"""50%""",,,,
"""75%""",,,,
"""max""",,"""B0BLKGX588""","""UK""","""train_999999"""


In [13]:
# Preview the first rows of the phase-1 test frame (it has no next_item column).
test1.head()

prev_items,locale,session_id
list[str],str,str
"[""B08V12CT4C"", ""B08V1KXBQD"", … ""B099NQFMG7""]","""DE""","""test_phase1_0"""
"[""B00R9R5ND6"", ""B00R9RZ9ZS"", ""B00R9RZ9ZS""]","""DE""","""test_phase1_1"""
"[""B07YSRXJD3"", ""B07G7Q5N6G"", … ""B07G7Q5N6G""]","""DE""","""test_phase1_2"""
"[""B08KQBYV43"", ""3955350843"", … ""3955350843""]","""DE""","""test_phase1_3"""
"[""B09FPTCWMC"", ""B09FPTQP68"", … ""B09J945WQR""]","""DE""","""test_phase1_4"""


In [14]:
# Summary of the phase-1 test frame: row count, null count and min/max per column.
test1.describe()

describe,prev_items,locale,session_id
str,str,str,str
"""count""","""316971""","""316971""","""316971"""
"""null_count""","""0""","""0""","""0"""
"""mean""",,,
"""std""",,,
"""min""",,"""DE""","""test_phase1_0"""
"""25%""",,,
"""50%""",,,
"""75%""",,,
"""max""",,"""UK""","""test_phase1_99…"


In [15]:
# Preview the first rows of the phase-2 test frame.
test2.head()

prev_items,locale,session_id
list[str],str,str
"[""B087VLP2RT"", ""B09BRQSHYH"", ""B099KW4ZLV""]","""DE""","""test_phase2_0"""
"[""B08XW4W667"", ""B096VMCJYF"", ""B096VMCJYF""]","""DE""","""test_phase2_1"""
"[""B09Z4T2GJ3"", ""B09Z3FBXMB"", … ""B09Z4PYG8Q""]","""DE""","""test_phase2_2"""
"[""B07T6Y2HG7"", ""B07T2NBLX9"", ""B07Y1G5F3Y""]","""DE""","""test_phase2_3"""
"[""B0B2DRKZ6X"", ""B0B2DRKZ6X"", ""B0B2DRKZ6X""]","""DE""","""test_phase2_4"""


In [16]:
# Summary of the phase-2 test frame: row count, null count and min/max per column.
test2.describe()

describe,prev_items,locale,session_id
str,str,str,str
"""count""","""316972""","""316972""","""316972"""
"""null_count""","""0""","""0""","""0"""
"""mean""",,,
"""std""",,,
"""min""",,"""DE""","""test_phase2_0"""
"""25%""",,,
"""50%""",,,
"""75%""",,,
"""max""",,"""UK""","""test_phase2_99…"


In [17]:
# Persist the preprocessed training sessions as parquet on Drive. The target
# folder is created first so the write also succeeds on a Drive that lacks it.
train_parquet = Path("/content/drive/MyDrive/kddcup2023/data/preprocessed/task1/train_task1.parquet")
train_parquet.parent.mkdir(parents=True, exist_ok=True)
train.write_parquet(str(train_parquet))

In [18]:
# Persist the preprocessed phase-1 test sessions as parquet on Drive. The target
# folder is created first so the write also succeeds on a Drive that lacks it.
test1_parquet = Path("/content/drive/MyDrive/kddcup2023/data/preprocessed/task1/test_task1_phase1.parquet")
test1_parquet.parent.mkdir(parents=True, exist_ok=True)
test1.write_parquet(str(test1_parquet))

In [19]:
# Persist the preprocessed phase-2 test sessions as parquet on Drive. The target
# folder is created first so the write also succeeds on a Drive that lacks it.
test2_parquet = Path("/content/drive/MyDrive/kddcup2023/data/preprocessed/task1/test_task1_phase2.parquet")
test2_parquet.parent.mkdir(parents=True, exist_ok=True)
test2.write_parquet(str(test2_parquet))