In [64]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pprint import pprint

interactions_file = r'C:\Users\spenc\Documents\IS455\Homeworks\ModelingProject\users_interactions.csv'
articles_file = r'C:\Users\spenc\Documents\IS455\Homeworks\ModelingProject\shared_articles.csv'

In [65]:
# Load articles (analogous to movieDF)
# Set contentId as index
articlesDF = pd.read_csv(articles_file, usecols=['contentId', 'title', 'text', 'lang'], index_col='contentId')

# Load interactions (analogous to ratingDF)
ratingDF = pd.read_csv(interactions_file, usecols=['personId', 'contentId', 'eventType', 'timestamp'])

# --- Preprocessing Steps Specific to this Dataset ---
# Map eventType to numerical ratings
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0,
   'BOOKMARK': 3.0,
   'FOLLOW': 4.0,
   'COMMENT CREATED': 5.0,
}
ratingDF['rating'] = ratingDF['eventType'].map(event_type_strength)
ratingDF.dropna(subset=['rating'], inplace=True)
# Keep highest rating per user-item, latest timestamp wins ties
ratingDF = ratingDF.sort_values('timestamp', ascending=False)
ratingDF = ratingDF.loc[ratingDF.groupby(['personId', 'contentId'])['rating'].idxmax()]
# --- End Preprocessing Steps ---

# Rename columns to match class notebook (userId, movieId)
ratingDF.rename(columns={'personId': 'userId', 'contentId': 'movieId'}, inplace=True)
# Reorder columns slightly to match class notebook display if desired
ratingDF = ratingDF[['userId', 'movieId', 'rating', 'timestamp']] # Drop eventType

# Display head (like ratingDF.head())
print(ratingDF.head())


                    userId              movieId  rating   timestamp
9221  -9223121837663643404 -8949113594875411859     1.0  1462452127
53449 -9223121837663643404 -8377626164558006982     1.0  1473938707
38189 -9223121837663643404 -8208801367848627943     1.0  1469706702
23975 -9223121837663643404 -8187220755213888616     1.0  1467823897
63834 -9223121837663643404 -7423191370472335463     1.0  1479376578


In [66]:
print('Min:\t\t', ratingDF['movieId'].value_counts().min())
print('Quartile 1:\t', ratingDF['movieId'].value_counts().quantile(.25))
print('Median:\t\t', ratingDF['movieId'].value_counts().quantile(.5))
print('Quartile 3:\t', ratingDF['movieId'].value_counts().quantile(.75))
print('Max:\t\t', ratingDF['movieId'].value_counts().max())


Min:		 1
Quartile 1:	 4.0
Median:		 9.0
Quartile 3:	 17.0
Max:		 268


In [67]:
min_interactions_threshold = 5 # Using a more descriptive name, adjust value as needed

value_counts = ratingDF['movieId'].value_counts()
keep_list = value_counts[value_counts >= min_interactions_threshold] # Adjusted var name
df_ratings_reduced = ratingDF.loc[ratingDF['movieId'].isin(keep_list.index)].copy() # Use copy to avoid SettingWithCopyWarning
print(df_ratings_reduced) # Print the df


                    userId              movieId  rating   timestamp
9221  -9223121837663643404 -8949113594875411859     1.0  1462452127
53449 -9223121837663643404 -8377626164558006982     1.0  1473938707
38189 -9223121837663643404 -8208801367848627943     1.0  1469706702
23975 -9223121837663643404 -8187220755213888616     1.0  1467823897
63834 -9223121837663643404 -7423191370472335463     1.0  1479376578
...                    ...                  ...     ...         ...
68297  9210530975708218054  8477804012624580461     4.0  1486577729
65892  9210530975708218054  8526042588044002101     1.0  1482887760
60446  9210530975708218054  8856169137131817223     1.0  1476790903
65179  9210530975708218054  8869347744613364434     1.0  1481294993
57936  9210530975708218054  9209886322932807692     1.0  1477409052

[38567 rows x 4 columns]


In [68]:
U = df_ratings_reduced['userId'].nunique()
I = df_ratings_reduced['movieId'].nunique()

# Map existing user and item IDs to set of consecutive integers
user_mapper = dict(zip(np.unique(df_ratings_reduced['userId']), list(range(U))))
item_mapper = dict(zip(np.unique(df_ratings_reduced['movieId']), list(range(I))))

# Reverse mapping for later retreval of reccomended items
user_inv_mapper = dict(zip(list(range(U)), np.unique(df_ratings_reduced['userId'])))
item_inv_mapper = dict(zip(list(range(I)), np.unique(df_ratings_reduced['movieId'])))

# Create a list of user ID and item IDs for every time a rating appears
user_index = [user_mapper[i] for i in df_ratings_reduced['userId']]
item_index = [item_mapper[i] for i in df_ratings_reduced['movieId']]

X = csr_matrix((df_ratings_reduced['rating'], (item_index,user_index)), shape=(I,U))
print(X) # Print matrix info like class nb


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 38567 stored elements and shape (2109, 1885)>
  Coords	Values
  (0, 225)	2.0
  (0, 332)	4.0
  (0, 828)	5.0
  (0, 1135)	5.0
  (0, 1857)	2.0
  (1, 2)	2.0
  (1, 130)	1.0
  (1, 209)	1.0
  (1, 553)	1.0
  (1, 935)	1.0
  (1, 940)	2.0
  (1, 1142)	1.0
  (1, 1228)	5.0
  (1, 1491)	1.0
  (1, 1524)	1.0
  (2, 40)	1.0
  (2, 159)	1.0
  (2, 197)	1.0
  (2, 333)	1.0
  (2, 628)	1.0
  (2, 649)	1.0
  (2, 817)	5.0
  (2, 902)	1.0
  (2, 930)	2.0
  (2, 997)	1.0
  :	:
  (2107, 796)	1.0
  (2107, 904)	1.0
  (2107, 972)	1.0
  (2107, 1072)	1.0
  (2107, 1129)	1.0
  (2107, 1214)	1.0
  (2107, 1241)	1.0
  (2107, 1256)	2.0
  (2107, 1312)	1.0
  (2107, 1327)	1.0
  (2107, 1336)	1.0
  (2107, 1376)	2.0
  (2107, 1397)	1.0
  (2107, 1398)	1.0
  (2107, 1519)	1.0
  (2107, 1589)	1.0
  (2107, 1719)	1.0
  (2107, 1824)	1.0
  (2107, 1864)	1.0
  (2107, 1878)	1.0
  (2108, 170)	1.0
  (2108, 586)	1.0
  (2108, 997)	1.0
  (2108, 1230)	1.0
  (2108, 1589)	1.0


In [69]:
def create_matrix(df, user, item, rating):
    # import numpy as np # Assume imported globally
    # from scipy.sparse import csr_matrix # Assume imported globally

    U = df[user].nunique()  # Number of users for the matrix
    I = df[item].nunique()  # Number of items for the matrix

    # Map user and movie IDs to matrix indices
    user_mapper = dict(zip(np.unique(df[user]), list(range(U))))
    item_mapper = dict(zip(np.unique(df[item]), list(range(I))))

    # Map matrix indices back to IDs
    user_inv_mapper = dict(zip(list(range(U)), np.unique(df[user])))
    item_inv_mapper = dict(zip(list(range(I)), np.unique(df[item])))

    # Create a list of index values for the csr_matrix for users and movies
    user_index = [user_mapper[i] for i in df[user]]
    item_index = [item_mapper[i] for i in df[item]]

    # Build the final matrix which will look like: (movieId, userId) rating
    # Note: Class nb used (item_index, user_index) which is Items x Users
    X = csr_matrix((df[rating], (item_index, user_index)), shape=(I, U))

    return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

In [70]:
X_func, user_mapper_func, item_mapper_func, user_inv_mapper_func, item_inv_mapper_func = create_matrix(
    df_ratings_reduced, 'userId', 'movieId', 'rating'
)

# Reassign global variables to the output of the function for consistency
X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper = \
    X_func, user_mapper_func, item_mapper_func, user_inv_mapper_func, item_inv_mapper_func

# Print the tuple directly like class nb cell 8 output
print((X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper))


(<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 38567 stored elements and shape (2109, 1885)>, {-9223121837663643404: 0, -9212075797126931087: 1, -9207251133131336884: 2, -9199575329909162940: 3, -9196668942822132778: 4, -9188188261933657343: 5, -9172914609055320039: 6, -9156344805277471150: 7, -9150583489352258206: 8, -9120685872592674274: 9, -9109785559521267180: 10, -9099478998637725255: 11, -9083704948999852989: 12, -9063420486253202900: 13, -9060214117327732109: 14, -9048557723087354030: 15, -9047547311469006438: 16, -9016528795238256703: 17, -9012030317377670760: 18, -9009798162809551896: 19, -9001583565812478106: 20, -8994220765455693336: 21, -8985529623369322698: 22, -8984152171875293281: 23, -8965597312806628567: 24, -8965390025652957482: 25, -8961797610770561652: 26, -8909668725653743114: 27, -8891033171626175843: 28, -8860671864164757449: 29, -8859417914913317336: 30, -8854674432071487111: 31, -8853658195208337106: 32, -8845298781299428018: 33, -88302500907363

In [71]:
from sklearn.neighbors import NearestNeighbors # Keep import here like class nb

def recommend(itemId, X_matrix, item_mapper_dict, item_inv_mapper_dict, k, metric='cosine', show_messages=True):
    # from sklearn.neighbors import NearestNeighbors # Moved outside

    rec_ids = []                # Make a list for the recommended item IDs we'll get later
    # Assuming itemId exists in mapper, like class nb implicitly does
    item = item_mapper_dict[itemId]  # Get the index of the movie ID passed into the function
    item_vector = X_matrix[item]       # Get the vector of user ratings for the movie ID passed into the function

    # Ensure item_vector is 2D
    if isinstance(item_vector, csr_matrix): # Keep this necessary check
        item_vector = item_vector.reshape(1, -1)

    # Fit the clustering algorithm based on the user-item matrix X (fit inside function like class nb)
    knn = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    knn.fit(X_matrix)

    # Call the trained knn cluster model to return the nearest neighbors of the item_vector passed in
    distances, indices = knn.kneighbors(item_vector) # Renamed from 'rec'

    rec_indeces = indices[0]     # Parse out the list of indeces of the recommended items
    rec_distances = distances[0]   # Parse out the recommendation strength

    # Remove distance to self for final output
    final_rec_distances = np.delete(rec_distances, 0)

    # We need to replace the recommended item indeces with their original item IDs
    for i in range(1, k + 1): # Use k+1 in range to match logic of class nb loop limit
        rec_ids.append(item_inv_mapper_dict[rec_indeces[i]])

    if show_messages: # Renamed messages -> show_messages for clarity
        print(f'List of recommended item indeces:\n{rec_indeces}\n')
        print(f'List of recommended item IDs:\n{rec_ids}\n')
        print(f'List of recommended item similarity to selected item:\n{final_rec_distances}\n')

    # Return two lists: the original item IDs of the recommendations and their similarity scores
    return rec_ids, final_rec_distances

# Test Recommendation Function
# Use a valid article/movie id here to get recommendations
# Find one programmatically from the reduced set
valid_ids = list(item_mapper.keys())
movie_id = valid_ids[0] if valid_ids else None # Use first valid ID if available
# Or set manually if known: movie_id = -4029704725707465084

if movie_id is not None:
    rec_ids, rec_distances = recommend(movie_id, X, item_mapper, item_inv_mapper, k=5, metric='cosine', show_messages=True)

    # Use articlesDF (analogous to movieDF) for title lookup
    # Ensure articlesDF has movieId (contentId) as index
    try:
        source_title = articlesDF.loc[movie_id, 'title']
        print(f"If you liked {source_title}, you might also like:")
        # Print the series of titles directly like class nb
        print(articlesDF.loc[rec_ids, 'title'])
    except KeyError:
         print(f"Could not find title for source movie ID: {movie_id}")
         # Still try to print recommended titles
         try:
             print(articlesDF.loc[rec_ids, 'title'])
         except KeyError:
             print("Could not retrieve titles for recommended items.")
else:
    print("No valid movie IDs found in the reduced dataset to test recommendations.")


List of recommended item indeces:
[   0 2102  239  562 1318 1634]

List of recommended item IDs:
[9209629151177723638, -7174595923544719736, -4293262855938209141, 2541499697677936774, 5152069678055228801]

List of recommended item similarity to selected item:
[0.27217466 0.29089605 0.31426601 0.39969977 0.44156441]

If you liked Uber lança serviço de helicóptero em SP com preços a partir de R$66 - IDG Now!, you might also like:
contentId
 9209629151177723638    Google search now provides access to basic hea...
-7174595923544719736                    Learning at the speed of business
-4293262855938209141    Globant Acquires L4 to Strengthen its Lead Pos...
 2541499697677936774         In defense of the Uber-ization of everything
 5152069678055228801       IBM makes a big shift into cognitive computing
Name: title, dtype: object


In [None]:
# Cell 10: DEPLOYMENT Example - Pre-calculate recommendations
print("\n--- Simulating Deployment: Pre-calculating All Recommendations ---")
k = 5 # Use k consistent with class nb example
# articlesDF index was set earlier

# Fit the model once for the deployment example loop
print(f"\n--- Fitting KNN model once for deployment example (k={k})... ---")
final_knn_model = NearestNeighbors(n_neighbors=k + 1, algorithm="brute", metric='cosine')
final_knn_model.fit(X)
print("Model fitted.")

df_reccomendations = pd.DataFrame(columns=['If you liked'], index=item_mapper.keys()) # Use item_mapper keys for index
for i in range(1,k+1):
    df_reccomendations[f"Reccomendation {i}"] = None

print(f"\n--- Generating top {k} recommendations for all {len(item_mapper)} items... ---")
# Iterate using df_reccomendations index (which are the original movieIds)
for row_index in df_reccomendations.index: # row_index is the movieId (contentId)
    # --- Replicate recommend logic using pre-fitted model ---
    item = item_mapper[row_index]
    item_vector = X[item].reshape(1, -1)
    distances, indices = final_knn_model.kneighbors(item_vector)
    rec_indeces = indices[0]
    rec_ids = []
    for i in range(1, k + 1):
        rec_ids.append(item_inv_mapper[rec_indeces[i]])
    # --- End replication ---

    # Use .at for setting scalar values, ensure index exists in articlesDF
    try:
        # Get the title first
        source_title = articlesDF.at[row_index, 'title']
        # Then set it using .at
        df_reccomendations.at[row_index, 'If you liked'] = source_title
    except KeyError:
        df_reccomendations.at[row_index, 'If you liked'] = f"ID: {row_index} (Title N/A)" # Use .at here too

    for i, r in enumerate(rec_ids):
        try:
            # Get the recommendation title first
            rec_title = articlesDF.at[r, 'title']
            # Then set it using .at
            df_reccomendations.at[row_index, f"Reccomendation {i+1}"] = rec_title
        except KeyError:
            df_reccomendations.at[row_index, f"Reccomendation {i+1}"] = f"ID: {r} (Title N/A)" # Use .at here too

print("\nPre-calculation complete. Sample:")
print(df_reccomendations.head(10)) # Display head like class nb


--- Fitting KNN model once for deployment example (k=5)... ---
Model fitted.

--- Generating top 5 recommendations for all 2109 items... ---


ValueError: Incompatible indexer with Series