In [1]:
%%capture --no-stdout
# Reproducibility stamp for the notebook. `--no-stdout` lets the printed
# watermark through while the rest of the cell's output is captured.
%reload_ext watermark
# Prints the author, a "last updated" timestamp, the Python/IPython versions,
# machine details and the installed versions of the packages listed after -p.
%watermark -uniz --author "Prayson W. Daniel" -vm -p duckdb,polars,scikit-learn,altair

Author: Prayson W. Daniel

Last updated: 2024-11-12T19:16:47.271266+01:00

Python implementation: CPython
Python version       : 3.11.10
IPython version      : 8.29.0

duckdb      : 1.1.3
polars      : 1.13.0
scikit-learn: 1.5.2
altair      : 5.4.1

Compiler    : Clang 15.0.0 (clang-1500.3.9.4)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 16
Architecture: 64bit



In [2]:
# Move the working directory one level up; presumably so the relative
# 'data/ratings/*.csv' glob used when loading the ratings resolves.
# NOTE(review): not idempotent — every re-run of this cell climbs one more
# level, so restart the kernel before running the notebook again.
%cd ..

/Users/pwd/Codes/hadithi/dev.io/knowledge


In [3]:
import duckdb
import polars as pl
from sklearn.neighbors import NearestNeighbors

In [4]:
# Read every ratings CSV in one pass (columns are matched by name across
# files) and drop the rows whose Rating text contains 'This user'.
ratings_query = """
        SELECT 
        *
        FROM read_csv('data/ratings/*.csv', union_by_name=True) 
        WHERE Rating NOT LIKE '%This user%' 
        """

# Run the query in DuckDB and hand the result over as a polars DataFrame.
ratings = duckdb.sql(ratings_query).pl()

ratings.head(3)

ID,Name,Rating
i64,str,str
1,"""Agile Web Development with Rai…","""it was amazing"""
1,"""The Restaurant at the End of t…","""it was amazing"""
1,"""Siddhartha""","""it was amazing"""


In [49]:
# Number of ratings per rating label, most frequent label first.
rating_counts = (
    ratings.group_by("Rating")
    .agg(pl.col("Rating").count().alias("Count"))
    .sort("Count", descending=True)
)

# Bar chart with one bar per rating label; bar length is the rating count.
rating_chart = rating_counts.plot.bar(x="Count", y="Rating")
rating_chart.properties(width=400, height=200, title="Distribution")

In [None]:
# Ordinal score for each textual rating label (1 = lowest, 5 = highest).
category = {
    "did not like it": 1,
    "it was ok": 2,
    "liked it": 3,
    "really liked it": 4,
    "it was amazing": 5,
}

# Scale the scores into (0, 1] by dividing by the top score rather than
# min-max scaling. Min-max sends "did not like it" to 0.0, which is also the
# value the book-by-user matrix uses for "no rating" (fill_null(0.0)), so an
# explicit dislike would become indistinguishable from a missing rating.
# Cosine distance is unaffected by the constant factor, so this keeps the
# geometry of the raw 1-5 scores.
top_score = max(category.values())
category_normalized = {label: score / top_score for label, score in category.items()}

In [None]:
# Display the label -> normalized score mapping for a quick visual check.
category_normalized

In [None]:
# Preview only (the result is not assigned): swap each rating label for its
# numeric score. replace_strict is the strict variant of replace, so a label
# without an entry in the mapping is reported instead of passing through.
rating_as_score = pl.col("Rating").replace_strict(category_normalized)
ratings.with_columns(rating_as_score)


In [None]:
# Minimum number of rating rows a user needs to be kept.
THRESHOLD: int = 50

# Rating rows per user ID; group_by(...).len() names the count column "len".
ratings_per_user = ratings.group_by("ID").len()
active_users = ratings_per_user.filter(pl.col("len") >= THRESHOLD)

# IDs of the users that meet the threshold, as a set for the is_in filter below.
USERS = set(active_users.get_column("ID").to_list())

In [None]:
# Ratings from the retained users only, with labels replaced by numeric scores.
active_scores = ratings.filter(pl.col("ID").is_in(USERS)).with_columns(
    pl.col("Rating").replace_strict(category_normalized)
)

# One row per book title, one column per user ID. When a user has several
# rows for the same title, the first one encountered is kept.
book_by_user = active_scores.pivot(
    on="ID",
    index="Name",
    values="Rating",
    aggregate_function="first",
)

# Books a user never rated become 0.0; the result is converted to a pandas
# frame indexed by title so rows can be looked up by book name.
data = book_by_user.fill_null(0.0).to_pandas().set_index("Name")

data

In [None]:
# Settings for the neighbour search: cosine metric, n_jobs=-1 for parallel
# queries, and 7 neighbours unless a query passes its own n_neighbors.
# `radius` is only the default for radius-based queries.
params = dict(
    n_neighbors=7,
    radius=1,
    metric="cosine",
    n_jobs=-1,
)

knn = NearestNeighbors(**params)
knn

In [None]:
# Index the book-by-user matrix: each row (one book's score vector) is a sample.
knn.fit(data.to_numpy())

In [None]:
# Query the fitted index with the score vector of one book, shaped (1, n_users).
query_vector = data.loc["Brave New World"].to_numpy().reshape(1, -1)
neighbor_distances, neighbor_rows = knn.kneighbors(X=query_vector, n_neighbors=10)

# A single query row was passed, so keep the first (only) row of each result.
distance, idx = neighbor_distances[0], neighbor_rows[0]

In [None]:
# Pair each neighbour's title (looked up by row position) with its distance.
neighbors = pl.DataFrame(
    {
        "book": data.index[idx].values,
        "distance": distance,
    }
)

# Widen the string and table limits so long titles are not truncated in the printout.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(neighbors)

In [None]:
def similar_books(title: str, n_neighbors: int = 10) -> pl.DataFrame:
    """Return the books closest to ``title`` in the fitted neighbour index.

    Parameters
    ----------
    title
        Exact book title; must be a row label of ``data``.
    n_neighbors
        Number of neighbours to return. The queried book is part of the
        fitted matrix, so it normally appears in the result itself.

    Returns
    -------
    pl.DataFrame
        Columns ``book`` (title) and ``distance`` (cosine distance to ``title``).

    Raises
    ------
    KeyError
        If ``title`` is not in ``data``, e.g. because it was only rated by
        users dropped by the THRESHOLD filter.
    """
    if title not in data.index:
        raise KeyError(f"{title!r} is not a book in the rating matrix")

    # One query row goes in, so each returned array has exactly one row to unpack.
    [distance], [idx] = knn.kneighbors(X=[data.loc[title].values], n_neighbors=n_neighbors)
    return pl.DataFrame({"book": data.iloc[idx].index.values, "distance": distance})


# Widen the string and table limits so long titles are not truncated in the printout.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(similar_books("The Little Prince"))

In [None]:
# Distinct book titles whose name contains "The Alchemist", to find the exact
# spelling stored in the data.
alchemist_titles = (
    ratings.select("Name")
    .unique(keep="first")
    .filter(pl.col("Name").str.contains("The Alchemist"))
)

# Widen the string and table limits so long titles are not truncated in the printout.
with pl.Config(fmt_str_lengths=1000, tbl_width_chars=1000):
    print(alchemist_titles)