In [1]:
%%capture --no-stdout
# Print an environment report for this run: author, last-updated timestamp,
# Python/IPython versions, machine details, and the installed versions of
# duckdb, polars and scikit-learn.
# NOTE(review): `%%capture --no-stdout` presumably hides stderr/rich output from
# the extension reload while letting the printed report through — confirm.
%reload_ext watermark
%watermark -uniz --author "Prayson W. Daniel" -vm -p duckdb,polars,scikit-learn

Author: Prayson W. Daniel

Last updated: 2024-11-12T15:43:48.982864+01:00

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.29.0

duckdb      : 1.1.3
polars      : 1.13.0
scikit-learn: 1.5.2

Compiler    : Clang 15.0.0 (clang-1500.3.9.4)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 16
Architecture: 64bit



In [2]:
# Step up to the parent directory; the relative path 'data/ratings/*.csv' read
# by a later cell is presumably resolved from there — confirm the expected root.
# NOTE(review): not idempotent — every re-run of this cell moves up one more level.
%cd ..

/Users/pwd/Codes/hadithi/dev.io/knowledge


In [3]:
import duckdb
import polars as pl

In [5]:
# Read every CSV under data/ratings/ through DuckDB (columns unified by name
# across files) and drop rows whose Rating text contains "This user" in SQL,
# before the result is handed to Polars.
ratings_query = """
        SELECT 
        *
        FROM read_csv('data/ratings/*.csv', union_by_name=True) 
        WHERE Rating NOT LIKE '%This user%' 
        """

ratings = duckdb.sql(ratings_query).pl()

# Quick look at the first rows to confirm the ID / Name / Rating layout.
ratings.head(3)

ID,Name,Rating
i64,str,str
1,"""Agile Web Development with Rai…","""it was amazing"""
1,"""The Restaurant at the End of t…","""it was amazing"""
1,"""Siddhartha""","""it was amazing"""


In [6]:
# Number of rows for each distinct Rating phrase (count column is named "len").
ratings.group_by("Rating").agg(pl.len())

Rating,len
str,u32
"""it was ok""",28811
"""really liked it""",132808
"""it was amazing""",92354
"""liked it""",96047
"""did not like it""",7811


In [7]:
# Ordinal encoding of the rating phrases: a phrase's position in the tuple
# below, counted from 1, is its numeric score (1 = worst, 5 = best).
category = {
    phrase: score
    for score, phrase in enumerate(
        (
            "did not like it",
            "it was ok",
            "liked it",
            "really liked it",
            "it was amazing",
        ),
        start=1,
    )
}

In [8]:
# Preview only (nothing is assigned): the Rating phrases swapped for their
# numeric scores from `category`.
ratings.with_columns(Rating=pl.col("Rating").replace_strict(category))


ID,Name,Rating
i64,str,i64
1,"""Agile Web Development with Rai…",5
1,"""The Restaurant at the End of t…",5
1,"""Siddhartha""",5
1,"""The Clock of the Long Now: Tim…",4
1,"""Ready Player One (Ready Player…",4
…,…,…
10978,"""The Foundation: A Great Americ…",3
10986,"""Cosette: The Sequel to Les Mis…",4
10986,"""J. D. Salinger's The Catcher i…",5
10988,"""Facing the Lion: Growing Up Ma…",3


In [9]:
# Minimum number of rating rows an ID needs in order to be kept.
THRESHOLD: int = 10

# Rating rows per ID, then only the IDs that reach the threshold.
ratings_per_user = ratings.group_by("ID").agg(pl.len())
frequent_raters = ratings_per_user.filter(pl.col("len") >= THRESHOLD)

# Set of the IDs that survive the threshold filter.
USERS = set(frequent_raters.get_column("ID").to_list())

In [10]:
# Keep only the rows whose ID is in USERS and turn the Rating phrases into scores.
scored_ratings = ratings.filter(pl.col("ID").is_in(USERS)).with_columns(
    Rating=pl.col("Rating").replace_strict(category)
)

# Wide layout: one row per Name, one column per ID. When the same (Name, ID)
# pair occurs more than once, the first Rating is the one kept.
name_by_id = scored_ratings.pivot(
    on="ID",
    index="Name",
    values="Rating",
    aggregate_function="first",
)

# Missing (Name, ID) combinations become 0; the result is a pandas frame
# indexed by Name.
data = name_by_id.fill_null(0).to_pandas().set_index("Name")

data

Unnamed: 0_level_0,1,2,3,5,6,7,8,9,12,14,...,6822,9196,10391,8936,10100,7921,8989,7391,8057,9127
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agile Web Development with Rails: A Pragmatic Guide,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"The Restaurant at the End of the Universe (Hitchhiker's Guide to the Galaxy, #2)",5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Siddhartha,5,0,0,5,0,4,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
The Clock of the Long Now: Time and Responsibility,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Ready Player One (Ready Player One, #1)",4,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Age of Kali: Indian Travels & Encounters,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Story-Wallah: Short Fiction from South Asian Writers,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Foundation: A Great American Secret: How Private Wealth Is Changing the World,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Facing the Lion: Growing Up Maasai on the African Savanna,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.neighbors import NearestNeighbors

In [12]:
# Settings for the neighbour search: cosine metric, 6 neighbours by default
# (a query may ask for a different number), radius 1.0, n_jobs=-1.
params = dict(
    n_neighbors=6,
    radius=1.0,
    metric="cosine",
    n_jobs=-1,
)

knn = NearestNeighbors(**params)
knn

In [13]:
# Fit on the raw matrix behind `data`: each row of `data` is one sample.
knn.fit(data.to_numpy())

In [14]:
# Single-row query built from the "Brave New World" row of `data`.
query_row = data.loc["Brave New World"].to_numpy().reshape(1, -1)

# Ask for 10 neighbours (overrides the default set on the estimator); both
# results hold one row per query, so take row 0 of each.
neighbor_distances, neighbor_positions = knn.kneighbors(X=query_row, n_neighbors=10)
distance, idx = neighbor_distances[0], neighbor_positions[0]

In [15]:
# Pair each neighbour's row label (taken from the index of `data`) with its
# distance to the query.
neighbors = pl.DataFrame(
    {
        "book": data.index[idx].to_numpy(),
        "distance": distance,
    }
)

# Widen the string/table limits so the labels print without truncation.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(neighbors)

shape: (10, 2)
┌────────────────────────┬──────────┐
│ book                   ┆ distance │
│ ---                    ┆ ---      │
│ str                    ┆ f64      │
╞════════════════════════╪══════════╡
│ Brave New World        ┆ 0.0      │
│ 1984                   ┆ 0.472723 │
│ The Great Gatsby       ┆ 0.517056 │
│ The Catcher in the Rye ┆ 0.533917 │
│ To Kill a Mockingbird  ┆ 0.54796  │
│ Animal Farm            ┆ 0.553966 │
│ Slaughterhouse-Five    ┆ 0.566514 │
│ Of Mice and Men        ┆ 0.580072 │
│ Lord of the Flies      ┆ 0.591027 │
│ Fahrenheit 451         ┆ 0.634567 │
└────────────────────────┴──────────┘


In [16]:
# Distinct Name values that contain the text "Brave New".
distinct_names = ratings.select("Name").unique(keep="first")
matching_names = distinct_names.filter(pl.col("Name").str.contains("Brave New"))

# Widen the string/table limits so the full names print without truncation.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(matching_names)

shape: (10, 1)
┌──────────────────────────────────────────────────────────────────────────────┐
│ Name                                                                         │
│ ---                                                                          │
│ str                                                                          │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Brave New World Revisited                                                    │
│ Frankenstein's Cat: Cuddling Up to Biotech's Brave New Beasts                │
│ Brave New World                                                              │
│ Brave New Worlds                                                             │
│ Brave New World / Brave New World Revisited                                  │
│ Brave New Girl                                                               │
│ Intern Nation: How to Earn Nothing and Learn Little in the Brave New Economy │
│ Strangers i