In [1]:
from scipy import spatial
import pandas as pd

# Toy frame: every row is treated as a 2-d vector over the columns (c1, c2).
df = pd.DataFrame(
    data={
        'c1': [1, 2, 2],
        'c2': [3, 4, 1],
    }
)

# Rows 0 and 1 as Series; these are the two vectors being compared.
f1, f2 = df.iloc[0], df.iloc[1]

# scipy exposes the cosine *distance*, so the similarity is its complement.
cosine_sim = 1.0 - spatial.distance.cosine(f1, f2)
print(cosine_sim)

0.9899494936611665


In [2]:
import numpy as np

# Two users with two ratings each: small enough to check the result by hand.
df = pd.DataFrame(
    data={
        'user': [1, 1, 2, 2],
        'rating': [3, 4, 1, 2],
    }
)

def normalize_ratings(df, rating_col="rating", user_col="user"):
    """Standardize each rating against the other ratings in its group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``rating_col`` and ``user_col``.
    rating_col : str
        Name of the column holding the values to normalize.
    user_col : str
        Name of the column used as the grouping key.

    Returns
    -------
    pandas.Series
        ``(rating - group mean) / group std``, aligned to ``df``'s index.
        The std is pandas' sample standard deviation (``ddof=1``), so a
        group with a single row, or whose values are all identical, yields
        NaN for its rows.
    """
    groups = df.groupby(user_col)[rating_col]
    # Use the string aliases rather than np.mean / np.std: pandas used to
    # silently swap the NumPy callables for its own reductions (sample std,
    # ddof=1) and now warns that the callables will be applied as-is in the
    # future, which would switch the std to ddof=0 and change the results.
    # transform() broadcasts each group statistic back onto that group's rows.
    mean = groups.transform("mean")
    std = groups.transform("std")
    return (df[rating_col] - mean) / std

# Attach the per-user standardized rating as a new column and preview it.
df["rating_normalized"] = normalize_ratings(
    df,
    rating_col="rating",
    user_col="user",
)
df.head()

Unnamed: 0,user,rating,rating_normalized
0,1,3,-0.707107
1,1,4,0.707107
2,2,1,-0.707107
3,2,2,0.707107


In [3]:
import numpy as np

# Four identical ratings on consecutive days, so any difference introduced
# downstream comes from the timestamps alone.
df = pd.DataFrame(
    data={
        'user': [1, 1, 2, 2],
        'rating': [10, 10, 10, 10],
        't': pd.to_datetime(
            ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
        ),
    }
)

def cumsum_days(s, duration='D'):
    """Cumulative elapsed time along a datetime Series, in whole units.

    Parameters
    ----------
    s : pandas.Series
        Datetime-valued Series; differences are taken between consecutive
        rows in the order given.
    duration : str
        NumPy timedelta unit code used as the unit of measure
        (``'D'`` days, ``'h'`` hours, ``'m'`` minutes, ``'s'`` seconds).

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row: the running sum of the
        row-to-row gaps, each gap floored to a whole number of units.
        The first entry is 0 because the first row has no predecessor.
    """
    # Divide by a one-unit timedelta instead of astype('timedelta64[<unit>]'):
    # pandas >= 2.0 rejects that cast for anything coarser than seconds.
    # The legacy cast behaved like floor division, so floor to keep the same
    # values. diff() leaves NaT in the first row, which becomes NaN here and
    # is zeroed before accumulating.
    steps = np.floor(s.diff() / np.timedelta64(1, duration))
    return steps.fillna(0).cumsum().values

def decay_ratings(df, decay=1, rating_col="rating", time_col="t"):
    """Scale ratings by an exponential function of elapsed time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``rating_col`` and ``time_col``.
    decay : float
        Decay constant; each rating is multiplied by
        ``exp(-decay * elapsed)``.
    rating_col : str
        Name of the column holding the ratings.
    time_col : str
        Name of the column passed to ``cumsum_days`` to obtain ``elapsed``.

    Returns
    -------
    pandas.Series
        The weighted ratings, aligned to ``df``'s index.
    """
    # NOTE(review): elapsed time accumulates from the first row, so later
    # rows receive smaller weights - confirm this direction is intended.
    elapsed = cumsum_days(df[time_col])
    scale = np.exp(-elapsed * decay)
    return df[rating_col] * scale

# A half-life of T time units corresponds to a decay constant of ln(2) / T,
# because exp(-(ln 2 / T) * t) == 2 ** (-t / T).
half_life_t = 1
df["rating_decayed"] = decay_ratings(
    df,
    decay=np.log(2) / half_life_t,
    rating_col="rating",
    time_col="t",
)

df.head()

Unnamed: 0,user,rating,t,rating_decayed
0,1,10,2019-01-01,10.0
1,1,10,2019-01-02,5.0
2,2,10,2019-01-03,2.5
3,2,10,2019-01-04,1.25
