In [1]:
%%capture --no-stdout
%reload_ext watermark
%watermark -uniz --author "Prayson W. Daniel" -vm -p duckdb,polars,scikit-learn,altair

Author: Prayson W. Daniel

Last updated: 2024-11-18T10:30:30.003782+01:00

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.29.0

duckdb      : 1.1.3
polars      : 1.13.0
scikit-learn: 1.5.2
altair      : 5.4.1

Compiler    : Clang 15.0.0 (clang-1500.3.9.4)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 16
Architecture: 64bit



In [2]:
%cd ..

/Users/pwd/Codes/hadithi/dev.io/knowledge


In [3]:
import duckdb
import polars as pl
from sklearn.neighbors import NearestNeighbors

In [4]:
# Read every ratings CSV through DuckDB (union_by_name=True lines the files'
# columns up by name) and skip rows whose Rating text contains 'This user'.
# NOTE(review): those rows look like placeholder text rather than a rating
# label -- confirm against the raw CSVs.
ratings_relation = duckdb.sql(
        """
        SELECT 
        *
        FROM read_csv('data/ratings/*.csv', union_by_name=True) 
        WHERE Rating NOT LIKE '%This user%' 
        """
)

# Materialise the DuckDB relation as a polars DataFrame.
ratings = ratings_relation.pl()

# Quick look at the first rows (columns: ID, Name, Rating).
ratings.head(3)

ID,Name,Rating
i64,str,str
1,"""Agile Web Development with Rai…","""it was amazing"""
1,"""The Restaurant at the End of t…","""it was amazing"""
1,"""Siddhartha""","""it was amazing"""


In [5]:
# Number of ratings per textual label, most frequent first.
rating_counts = (
    ratings
    .group_by("Rating")
    .agg(pl.count("Rating").alias("Count"))
    .sort("Count", descending=True)
)

# Horizontal bar chart of the label frequencies; as the cell's last
# expression, the chart is what gets rendered.
rating_counts.plot.bar(x="Count", y="Rating").properties(
    title="Distribution",
    width=400,
    height=200,
)

In [6]:
# Ordinal encoding of the textual rating labels (1 = worst, 5 = best).
category = {
    "did not like it": 1,
    "it was ok": 2,
    "liked it": 3,
    "really liked it": 4,
    "it was amazing": 5,
}

# Min-max scale the ordinal scores to [0, 1]. The bounds are read from the
# mapping itself instead of being hard-coded, so they cannot drift out of sync
# with `category` if a label is added or re-scored.
min_score = min(category.values())
max_score = max(category.values())

category_normalized = {
    label: (score - min_score) / (max_score - min_score)
    for label, score in category.items()
}

# NOTE(review): min-max scaling maps "did not like it" to 0.0. The pivot cell
# further down fills missing ratings with 0.0 as well, so an explicit dislike
# and "never rated" end up identical in the matrix -- confirm this is intended.



In [7]:
# Display the scaled label -> score mapping to check it spans 0.0 to 1.0.
category_normalized

{'did not like it': 0.0,
 'it was ok': 0.25,
 'liked it': 0.5,
 'really liked it': 0.75,
 'it was amazing': 1.0}

In [8]:
# Expression that swaps each textual label for its normalized score;
# replace_strict raises if a label is missing from the mapping.
rating_as_score = pl.col("Rating").replace_strict(category_normalized)

# Preview only: the result is displayed, `ratings` itself is left untouched.
ratings.with_columns(rating_as_score)


ID,Name,Rating
i64,str,f64
1,"""Agile Web Development with Rai…",1.0
1,"""The Restaurant at the End of t…",1.0
1,"""Siddhartha""",1.0
1,"""The Clock of the Long Now: Tim…",0.75
1,"""Ready Player One (Ready Player…",0.75
…,…,…
10978,"""The Foundation: A Great Americ…",0.5
10986,"""Cosette: The Sequel to Les Mis…",0.75
10986,"""J. D. Salinger's The Catcher i…",1.0
10988,"""Facing the Lion: Growing Up Ma…",0.5


In [9]:
# Minimum number of ratings a user needs in order to be kept.
THRESHOLD: int = 50

# One row per user with the number of ratings they left (column "len").
ratings_per_user = ratings.group_by("ID").len()

# Users at or above the threshold, collected into a set of IDs.
active_users = ratings_per_user.filter(pl.col("len") >= THRESHOLD)
USERS = set(active_users["ID"].to_list())

In [10]:
# Step 1: keep ratings from the selected users and encode the labels as scores.
active_ratings = ratings.filter(pl.col("ID").is_in(USERS)).with_columns(
    pl.col("Rating").replace_strict(category_normalized)
)

# Step 2: reshape to a book x user matrix (one row per book Name, one column
# per user ID). When a user has several ratings for the same title, the first
# one is kept.
book_user_scores = active_ratings.pivot(
    on="ID",
    index="Name",
    values="Rating",
    aggregate_function="first",
)

# Step 3: books a user never rated become 0.0, then hand over to pandas with
# the book title as the index.
# NOTE(review): 0.0 is also the normalized score of "did not like it", so
# unrated and disliked are indistinguishable here -- confirm this is intended.
data = book_user_scores.fill_null(0.0).to_pandas().set_index("Name")

data

Unnamed: 0_level_0,1,2,5,8,9,14,15,18,21,26,...,10519,7211,9818,9708,10974,10296,10906,7907,9196,8989
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agile Web Development with Rails: A Pragmatic Guide,1.00,0.00,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"The Restaurant at the End of the Universe (Hitchhiker's Guide to the Galaxy, #2)",1.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Siddhartha,1.00,0.00,1.0,0.0,0.0,1.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Clock of the Long Now: Time and Responsibility,0.75,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Ready Player One (Ready Player One, #1)",0.75,0.75,0.0,0.0,0.0,0.0,0.0,0.50,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Tenth Parallel: Dispatches from the Fault Line Between Christianity and Islam,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
India After Gandhi: The History of the World's Largest Democracy,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Age of Kali: Indian Travels & Encounters,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Story-Wallah: Short Fiction from South Asian Writers,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Settings for the neighbour search: cosine distance, 7 neighbours by default,
# n_jobs=-1 to use all available cores.
params = dict(
    n_neighbors=7,
    radius=1,
    metric="cosine",
    n_jobs=-1,
)

# Unfitted estimator; shown as the cell output.
knn = NearestNeighbors(**params)
knn

In [12]:
# Fit on the raw matrix: each row is one book, each column one user's score.
knn.fit(data.to_numpy())

In [13]:
# Query the fitted index with one book's rating vector. The query is a single
# row, so only the first row of each returned array is kept.
query_vector = data.loc["Brave New World"].values
distances, indices = knn.kneighbors(X=[query_vector], n_neighbors=10)
distance, idx = distances[0], indices[0]

In [14]:
# Pair each neighbour's title (looked up by row position) with its distance.
neighbours = pl.DataFrame(
    {
        "book": data.index[idx].values,
        "distance": distance,
    }
)

# Printed inside the Config block so long titles are not truncated.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(neighbours)

shape: (10, 2)
┌────────────────────────┬──────────┐
│ book                   ┆ distance │
│ ---                    ┆ ---      │
│ str                    ┆ f64      │
╞════════════════════════╪══════════╡
│ Brave New World        ┆ 0.0      │
│ 1984                   ┆ 0.439308 │
│ The Great Gatsby       ┆ 0.499797 │
│ The Catcher in the Rye ┆ 0.502797 │
│ Animal Farm            ┆ 0.519706 │
│ Slaughterhouse-Five    ┆ 0.533148 │
│ To Kill a Mockingbird  ┆ 0.537737 │
│ Lord of the Flies      ┆ 0.54628  │
│ Of Mice and Men        ┆ 0.577867 │
│ Fahrenheit 451         ┆ 0.596187 │
└────────────────────────┴──────────┘


In [15]:
def query_neighbors(title: str, n_neighbors: int = 10) -> tuple:
    """Find the books whose rating vectors are closest to ``title``.

    Args:
        title: Book name; must be a row label of ``data``.
        n_neighbors: How many neighbours to return. The queried book itself is
            part of the fitted data, so it is normally the first hit.

    Returns:
        ``(distance, idx)``: the distances to the neighbours and their row
        positions in ``data``, both for the single queried book.

    Raises:
        KeyError: If ``title`` is not a row label of ``data``.
    """
    if title not in data.index:
        raise KeyError(f"{title!r} is not a book in the rating matrix")
    # kneighbors expects a 2-D query, hence the one-element list; the result
    # has one row per query, unpacked here into flat arrays.
    [distance], [idx] = knn.kneighbors(X=[data.loc[title].values], n_neighbors=n_neighbors)
    return distance, idx


def neighbors_table(distance, idx) -> pl.DataFrame:
    """Build a ``book`` / ``distance`` table from a ``query_neighbors`` result."""
    return pl.DataFrame({"book": data.index[idx].values, "distance": distance})


# Same lookup as for "Brave New World", now via the reusable helpers.
# `distance` and `idx` are still rebound at notebook level, as before.
distance, idx = query_neighbors("The Little Prince", n_neighbors=10)

# Printed inside the Config block so long titles are not truncated.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(neighbors_table(distance, idx))

shape: (10, 2)
┌────────────────────────────────────┬──────────┐
│ book                               ┆ distance │
│ ---                                ┆ ---      │
│ str                                ┆ f64      │
╞════════════════════════════════════╪══════════╡
│ The Little Prince                  ┆ 0.0      │
│ The Catcher in the Rye             ┆ 0.579896 │
│ Animal Farm                        ┆ 0.594153 │
│ Franny and Zooey                   ┆ 0.613343 │
│ One Hundred Years of Solitude      ┆ 0.622968 │
│ Jonathan Livingston Seagull        ┆ 0.625331 │
│ The Giving Tree                    ┆ 0.626296 │
│ The Alchemist                      ┆ 0.642426 │
│ The Adventures of Huckleberry Finn ┆ 0.644822 │
│ The Metamorphosis                  ┆ 0.645413 │
└────────────────────────────────────┴──────────┘


In [16]:
# Distinct book names in the raw ratings that contain "The Alchemist", to see
# which different titles share that string.
# maintain_order=True keeps the rows in their original order; without it
# polars does not guarantee the order of unique rows, so the printed table
# could differ between runs.
alchemist_titles = (
    ratings
    .select("Name")
    .unique(keep="first", maintain_order=True)
    .filter(pl.col("Name").str.contains("The Alchemist"))
)

# Printed inside the Config block so long titles are not truncated.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(alchemist_titles)

shape: (6, 1)
┌─────────────────────────────────────────────────────────┐
│ Name                                                    │
│ ---                                                     │
│ str                                                     │
╞═════════════════════════════════════════════════════════╡
│ The Alchemist: A Graphic Novel                          │
│ The Alchemist                                           │
│ The Alchemist's Daughter                                │
│ The Alchemist, and, The Executioness                    │
│ The Alchemist's Secret (Ben Hope #1)                    │
│ The Alchemist's Daughter (Bianca Goddard Mysteries, #1) │
└─────────────────────────────────────────────────────────┘
