In [1]:
# import packages
import os
import dataclasses
from dataclasses import dataclass
import pandas as pd
import numpy as np
from datetime import date
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq
import itertools
import read_data as rd


In [2]:
# set environment folders
# Root data directory, given relative to the notebook's working directory.
# The parquet paths used in the cells below are all built from this value.
data_path = "../data"

In [None]:
# tuple sanity check: build a pair, print it, then show its first element

test_tp = 1, 2
print(test_tp)
test_tp[0]

In [None]:
# build a pyarrow struct type for an education record: two string fields
edu_fields = [
    (field_name, pa.string())
    for field_name in ("university_raw", "university_name")
]
edu = pa.struct(edu_fields)

print(edu)

In [None]:

# list the files available in the US_EDUC data folder
os.listdir(f"{data_path}/US_EDUC")

In [4]:
# load the unique user ids and sort them ascending; the sorted order is
# useful for filtering later and puts the smallest / largest id at the ends
ids = pq.read_table(f"{data_path}/unique_user_id_US_EDUC.parquet").sort_by("user_id")
sorted_user_ids = ids.column("user_id")
min_id = sorted_user_ids[0].as_py()
max_id = sorted_user_ids[-1].as_py()

In [None]:
# position of one specific user id inside the sorted id column
ids["user_id"].index(2096705167)

In [5]:
# report the id range, then peek at the id stored at position 100000
for label, value in (("min id is,", min_id), ("max id is,", max_id)):
    print(label, value)
print(ids["user_id"][100000])

min id is, 1000013
max id is, 2096705167
3655629


In [6]:
# open the education parquet file (no row data is read by this cell)
edu_parquet_path = f"{data_path}/US_EDUC/US_EDUC_user_education.parquet"
edus = pq.ParquetFile(edu_parquet_path)
# show the file-level metadata
edus.metadata

<pyarrow._parquet.FileMetaData object at 0x7f34d365bb50>
  created_by: parquet-cpp-arrow version 14.0.2
  num_columns: 14
  num_rows: 62129675
  num_row_groups: 60
  format_version: 2.6
  serialized_size: 105949

In [11]:
# read a single row group into pandas to gauge memory usage,
# keeping only rows whose user_id lies within [min_id, max_id]
group0 = edus.read_row_group(0).to_pandas()
in_id_range = group0["user_id"].between(min_id, max_id)
group0 = group0[in_id_range]

print(group0.shape)
print(group0["user_id"])
# distinct user ids, in order of first appearance
print(group0["user_id"].unique())



(1048576, 14)
0          130452445
1          130452445
2          617479683
3          617479683
4          617479683
             ...    
1048571    353784875
1048572    540751562
1048573    540751562
1048574    540751562
1048575    540751562
Name: user_id, Length: 1048576, dtype: int32
[130452445 617479683 203520822 ... 279513580 353784875 540751562]


In [None]:
# time building one edu instance and filling it from the row labelled 0
edu_test = rd.edu()
first_edu_row = group0.loc[0]
edu_test.update_value(first_edu_row)
print(edu_test)

In [None]:
# small 3x4 frame (values 0..11, columns A-D) for trying out row slicing
grid = np.arange(12).reshape(3, 4)
df = pd.DataFrame(grid, columns=list("ABCD"))

print(df)

# positional slice with plain brackets: the first two rows
df[:2]


In [None]:
# check row group 0 properties using the pyarrow API
# `group0` is a pandas DataFrame (it was converted with .to_pandas() when it was
# read), so it has no .field / .filter(mask=...) / .num_rows. Read the row group
# again as a pyarrow.Table under its own name for the Arrow-only calls below.
group0_tb = edus.read_row_group(0)
# group0_tb.slice(length=10).sort_by([("user_id","ascending"),("enddate","ascending")])
group0_tb.field("user_id")
# keep rows with user_id < 323; rows where the mask is null are dropped
expr = pc.field("user_id") < 323
sub0 = group0_tb.filter(mask=expr, null_selection_behavior="drop")
sub0.num_rows

In [None]:
# `group0` is a pandas DataFrame here and has no .column(); read only the
# user_id column of row group 0 as Arrow so that ChunkedArray.index is available.
group0_user_ids = edus.read_row_group(0, columns=["user_id"]).column("user_id")
print(group0_user_ids.index(111))
print(group0_user_ids.index(130452445))

In [None]:
# look one specific user up in the id table via an expression filter
target_user_id = 130452445
id_expr = pc.field("user_id") == target_user_id
ids.filter(id_expr)

In [None]:
# test df construction time

numrows = ids.num_rows

# One placeholder factory per object column, in the intended column order.
placeholder_factories = {
    "user_prof": rd.user,
    "skill": rd.skill,
    "edu1": rd.edu,
    "edu2": rd.edu,
    "edu3": rd.edu,
    "edu4": rd.edu,
    "pos1": rd.pos,
    "pos2": rd.pos,
    "pos3": rd.pos,
    "pos4": rd.pos,
}

# Every row gets its OWN instance. `[factory()] * numrows` would repeat a
# reference to one single object, so an in-place `update_value` on one cell
# (as done on `df.loc[id_index, "user_prof"]` in a later cell) would show up
# in every row of that column.
dt = {"user_id": ids.column("user_id")}
dt.update(
    {
        column: pd.Series([factory() for _ in range(numrows)])
        for column, factory in placeholder_factories.items()
    }
)
df = pd.DataFrame(dt)
# tb = pa.Table.from_pydict(dt)

In [None]:
# show the dtype pandas chose for each column of the frame
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# open the user profile file and load its first row group
prof_path = f"{data_path}/US_EDUC/US_EDUC_user_part_1_0_249.parquet"
prof = pq.ParquetFile(prof_path)
prof0 = prof.read_row_group(0)
# user_id stored at row position 237623
picked_row = prof0.take([237623])
picked_row.column("user_id")





In [None]:
# time one iteration (filter + profile update) without touching the DataFrame
expr = pc.field("user_id") == 600272782
thisuser = prof0.filter(mask=expr, null_selection_behavior="drop")

test_prof = rd.user()
# first matching row, handed over as a one-row pandas DataFrame
first_match = thisuser.take([0]).to_pandas()
test_prof.update_value(first_match)



In [None]:
# locate a known user id in the sorted id column and show the object
# currently stored for it in the "user_prof" column
id_index = ids["user_id"].index(3655629).as_py()
print(id_index)
current_prof = df.loc[id_index, "user_prof"]
print(current_prof)


In [None]:
# call update_value on the object stored in the frame, then print that cell again
stored_prof = df.loc[id_index, "user_prof"]
stored_prof.update_value(thisuser.take([0]).to_pandas())
print(df.loc[id_index, "user_prof"])

In [None]:
# test time of just df assignment
# Overwrites the single cell at row label `id_index`, column "user_prof",
# with the `test_prof` object using the label-based .loc setter.
df.loc[id_index, "user_prof"] = test_prof

In [None]:
# user_id values of the first five rows of the profile row group
prof0.slice(0, 5)["user_id"]

In [4]:
# open the user position file and load its first row group
pos_path = f"{data_path}/US_EDUC/US_EDUC_user_position_part_1_00_0_249.parquet"
pos = pq.ParquetFile(pos_path)
pos0 = pos.read_row_group(0)
pos0.schema
# number of rows in this row group (the value the cell displays)
pos0.num_rows

1048576

In [None]:
# first value of the first column, that column's name, and the first user_id
first_pos_row = pos0.slice(0, 1)
print(first_pos_row.column(0)[0])
print(pos0.column_names[0])
print(pos0["user_id"][0])

In [None]:
# pos0.select(["title_raw", "role_k1500", "job_category", "role_k50", "role_k150"])
# Print every column name of the position row group next to its Arrow type.
# The loop header must end with ":" (it ended with "," and did not parse).
# A schema's names and types have the same length, so plain zip is sufficient.
for col_name, col_type in zip(pos0.schema.names, pos0.schema.types):
    print(col_name, ",", col_type)

In [None]:
# open the user skill file and load its first row group
skill_path = f"{data_path}/US_EDUC/US_EDUC_user_skill_part_0000_0049.parquet"
skill = pq.ParquetFile(skill_path)
skill0 = skill.read_row_group(0)
skill0.num_columns
# first user_id in this row group (the value the cell displays)
skill0["user_id"][0]

In [None]:
# first three rows of the skill row group
skill0.slice(0, 3)

In [None]:
# test assigning a value into a column that starts out all-NA
# The frame is built from a dict (column name -> values). Writing the pairs
# with "," instead of ":" creates a set literal containing lists, which raises
# TypeError: unhashable type: 'list'.
df = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": pd.Series([pd.NA] * 3)})
print(df)
# overwrite a single cell of the NA column by row label / column name
df.loc[2, "c"] = 10
print(df)

In [None]:
# count down from 3 to 0 inclusive
for i in reversed(range(4)):
    print(i)

In [1]:
# string slicing check: drop the last six characters (".09867")
test_str = "12345678.09867"
test_str[:-6]

'12345678'