# Feature based filtering using tf-idf

In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, vstack, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', None)
pd.set_option('max_colwidth', None)

In [3]:
# load the dataset

behavior = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

### Data Preprosessing

In [4]:
#Fill missing abstracts with placeholder
news['Abstract'].fillna('No abstract available', inplace=True)

# if there are rows with no impressions, drop them
behavior = behavior.dropna(subset=['Impressions']) # this looses some user information, could instead manually overwrite and fill in the missing values based on the typo combining the impression and history columns

### Feature Engineering

In [6]:
#Feature extraction for item vectors

def create_all_item_vectors(news):
    # Text vectorization
    #could be useful to test different stop words and max features
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, norm=None, sublinear_tf=True) 
    # before tuning: accuracy of logistic regression model was 0.70, and precision of 1 was 0.46
    #increased accuracy of logistic regression model by 0.13% and precision of 1 by 0.08%  when increased max features from 1000 to 5000
    #increased accuracy of logistic regression model by 0.02% and precision of 1 by 0.05%  when added sublinear_tf=True and norm=None, but it reduced recall of bu .09%

    title_vectors = tfidf_vectorizer.fit_transform(news['Title'])
    abstract_vectors = tfidf_vectorizer.fit_transform(news['Abstract'])

    # One-hot encoding of categorical variables
    category_vectors = pd.get_dummies(news[['Category', 'Subcategory']]) #tried scaling the category vectors but it reduced the accuracy of the logistic regression model

    # Combine all features, creating sparse item vectors
    item_vectors = hstack([title_vectors, abstract_vectors, category_vectors])
    item_vectors = item_vectors.tocsr()

    # Map NewsID to the index in the item vectors matrix, to allow for quick lookups
    news_id_to_index = {news_id: index for index, news_id in enumerate(news['News ID'])}
    
    return item_vectors, news_id_to_index

item_vectors, news_id_to_index = create_all_item_vectors(news)

In [7]:
# create user vector 
def create_user_vector(user_id, item_vectors, news_id_to_index, behavior):
    user_behavior = behavior[behavior['User ID'] == user_id]

    #check if user has any behavior
    if user_behavior.empty:
        print('No user behavior')
        return None

    # initialize empty weighted vector sum and counter for total weight
    user_vector_sum = csr_matrix((1, item_vectors.shape[1]))
    total_weight = 0

    # base values for the weights
    history_weight = 0.3 
    impression_weight = 1

    # find most recent time 
    most_recent_time = user_behavior['Time'].max()

    # Find indices of the articles in the user's history
    if not user_behavior['History'].isna().all():
        for history in user_behavior['History'].str.split().tolist():
            for item in history:
                if item in news_id_to_index:
                    idx = news_id_to_index[item]
                    user_vector_sum += history_weight * item_vectors[idx]
                    total_weight += history_weight
    
    #find impressions so we can wight the items by the time they were clicked
    for _, row in user_behavior.iterrows():
        time_str = row['Time']
        try:
            impression_time = pd.to_datetime(time_str)
            time_diff = (most_recent_time - impression_time).total_seconds() / 3600 # hours passed
            decay_factor = np.exp(-time_diff / (24 * 7)) #one week half life
        except:
            decay_factor = 1 # if time is not available, assume it was clicked recently, could use average time between clicks instead 

        for impression in row["Impressions"].split(" "):
            article_id, clicked = impression.split("-")
            if clicked == "1" and article_id in news_id_to_index:
                idx = news_id_to_index[article_id]
                weight = impression_weight * decay_factor
                user_vector_sum += weight * item_vectors[idx]
                total_weight += weight
 
    # normalize the user vector
    if total_weight > 0:
        user_vector_sum /= total_weight
    
    return user_vector_sum if total_weight > 0 else None

user_vector = create_user_vector('U13740', item_vectors, news_id_to_index, behavior)

In [8]:
###cosine similarity between user and items
def find_cosine_similarity(user_vector, item_vectors, news_id_to_index):
    # make sure the user vector is a csr matrix
    if not isinstance(user_vector, csr_matrix):
        user_vector = csr_matrix(user_vector)

    cosine_similarities = cosine_similarity(user_vector, item_vectors).flatten()

    # Map the indices to NewsIDs
    news_id_similarity = {news_id: cosine_similarities[idx] for news_id, idx in news_id_to_index.items() if idx < len(cosine_similarities)}

    return news_id_similarity

### find top k articles for a user with rating as well 
def find_top_k_articles(user_id, item_vectors, k, news_id_to_index, behavior):
    # get the user vector
    user_vector = create_user_vector(user_id, item_vectors, news_id_to_index, behavior)

    # find the cosine similarity between the user vector and all item vectors
    news_id_similarity = find_cosine_similarity(user_vector, item_vectors, news_id_to_index)

    # find the top k articles
    top_k_articles = sorted(news_id_similarity.items(), key=lambda x: x[1], reverse=True)[:k]

    return top_k_articles

###this should find user similarity matrix with users and item vectors
## this code takes 50 minutes to run, fyi 
#def create_user_similarity_matrix(behavior, item_vectors):
#    user_ids = behavior['User ID'].unique()
#    user_vectors = []
#    for user_id in user_ids:
#        #print every 1000th user
#        if user_ids.tolist().index(user_id) % 1000 == 0:
#            print(user_id)
#        user_vector = create_user_vector(user_id, item_vectors)
#        if user_vector is not None:
#            user_vectors.append(user_vector)
#    
#    user_vectors = vstack(user_vectors)
#    user_similarity_matrix = cosine_similarity(user_vectors)
#    return user_similarity_matrix

In [9]:
print(find_top_k_articles('U13740', item_vectors, 5, news_id_to_index, behavior))

[('N55189', 0.3841355837068144), ('N55689', 0.36281955783195496), ('N58133', 0.34969751968448914), ('N10414', 0.34912880116767203), ('N45794', 0.3192042228644848)]


### Logistic regression

In [16]:
# Construct dataset for training

def find_clicked_items(behavior):
    # Expand and collect items for each user
    behavior_expanded = behavior.copy()
    behavior_expanded['clicked_items'] = behavior_expanded['Impressions'].apply(
        lambda x: [impression.split("-")[0] for impression in x.split(" ") if impression.split("-")[1] == "1"]
    )
    behavior_expanded['not_clicked_items'] = behavior_expanded['Impressions'].apply(
        lambda x: [impression.split("-")[0] for impression in x.split(" ") if impression.split("-")[1] == "0"]
    )
    behavior_expanded['history_items'] = behavior_expanded['History'].apply(
        lambda x: x.split(" ") if isinstance(x, str) else []
    )

    #aggregate for each user
    aggregate_data = behavior_expanded.groupby('User ID').agg({
        'clicked_items': 'sum',
        'not_clicked_items': 'sum',
        'history_items': 'sum'
    }).reset_index()

    #remove duplicates
    aggregate_data['clicked_items'] = aggregate_data['clicked_items'].apply(lambda x: list(set(x)))
    aggregate_data['not_clicked_items'] = aggregate_data['not_clicked_items'].apply(lambda x: list(set(x)))
    aggregate_data['history_items'] = aggregate_data['history_items'].apply(lambda x: list(set(x)))
    
    return aggregate_data


#prepare the training data for the model
#should not iterate through this many users one by one, but dont have time for paralell processing or batch processig
def prepare_training_data(behavior, item_vectors, news_id_to_index):
    X, y = [], []
    # batch size for stacking
    batch_size = 1000
    # list that will hold the stacked batches
    X_final = []

    # set of all news IDs
    news_set = set(news_id_to_index.keys())

    # find the clicked and ignored items for each user
    user_aggregated_data = find_clicked_items(behavior)

    #comment out the following line to use all users
    user_aggregated_data = user_aggregated_data.head(10 000)

    user_count = 0 # counter for the number of users processed
    for user_data in user_aggregated_data.itertuples(index=False):

        #print every 1000th user
        user_count += 1

        user_id = user_data[0]
        clicked_items = user_data[1]
        not_clicked_items = user_data[2]
        history_items = user_data[3]

        # find the user vector
        user_vector = create_user_vector(user_id, item_vectors, news_id_to_index, behavior)

        if user_vector is None:
            continue
        
        # add the combined user and item vector to the training data
        # history items are treated as clicked items with a rating of 1, could differentiate between clicked and history items
        # but we have so many non-clicked items that it might not be necessary 

        # positive example
        for item in clicked_items + history_items:
            if item in news_set:
                item_index = news_id_to_index[item]
                item_vector = item_vectors[item_index]
                combined_vector = hstack([user_vector, item_vector]).tocsr()
                #this is very slow but doing it all at once crashes the kernel
                X.append(combined_vector)
                y.append(1)

        # negative example
        for item in not_clicked_items:
            if item in news_set:
                item_index = news_id_to_index[item]
                item_vector = item_vectors[item_index]
                combined_vector = hstack([user_vector, item_vector]).tocsr()
                X.append(combined_vector)
                y.append(0)

        # batck stacking to avoid memory issues and reduce processing time
        if user_count % batch_size == 0 or user_count == len(user_aggregated_data):
            if X:
                batch_stack = vstack(X)
                X_final.append(batch_stack)
                X = [] # clear the list
            print(f'Processed and stacked {user_count} users')

    print('Done processing users')

    X_final = vstack(X_final) if X_final else None
    y = np.array(y) if y else None

    return X_final, y


X, y = prepare_training_data(behavior, item_vectors, news_id_to_index)

Processed and stacked 1000 users
Processed and stacked 2000 users
Done processing users


In [17]:
# Set up and train the model

#simple logistic regression model, could try other models
def train_tuning_model(X, y):
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    # should do more to prevent overfitting, but not enough time
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    return model, X_test, y_test

model, X_test, y_test = train_tuning_model(X, y)

# Used for parameter tuning
y_pred = model.predict(X_test)
classification = classification_report(y_test, y_pred)
print(classification)

In [None]:
def train_final_model(X, y):
    model = LogisticRegression(max_iter=1000, class_weight='balanced')
    model.fit(X, y)
    return model

#### Some personal testing

In [11]:
#just a test to see if it works

aggregate_data = find_clicked_items(behavior)
user_behavior = aggregate_data[aggregate_data['User ID'] == 'U13740']
clicked_items = user_behavior['clicked_items'].values[0]
not_clicked_items = user_behavior['not_clicked_items'].values[0]
history_items = user_behavior['history_items'].values[0]
cosine_similarities = find_cosine_similarity(user_vector, item_vectors, news_id_to_index)

print(clicked_items)
print(history_items)

#print top 20 values of the cosine similarities
top_recommendations = sorted(cosine_similarities.items(), key=lambda x: x[1], reverse=True)[:20]
print(top_recommendations)

print('Similarity for clicked items:\n')
for item in clicked_items:
    if item in cosine_similarities:
        print(item, cosine_similarities[item])

print('Similarity for history items:\n')
for item in history_items:
    if item in cosine_similarities:
        print(item, cosine_similarities[item])

print('Similarity for not clicked items:\n')
for item in not_clicked_items:
    if item in cosine_similarities:
        print(item, cosine_similarities[item])

#print news rows for clicked items
print('clicked_items: \n')
print(news[news['News ID'].isin(clicked_items)])
#print news rows for history items
print('historic_items: \n')
print(news[news['News ID'].isin(history_items)])

#print news rows for top recommendations
print('top_recommendations: \n')
print(news[news['News ID'].isin([item[0] for item in top_recommendations])])


['N28910', 'N55689', 'N58133']
['N10414', 'N19347', 'N34694', 'N31801', 'N55189', 'N42782', 'N45794', 'N63302', 'N18445']
[('N55189', 0.3841355837068144), ('N55689', 0.36281955783195496), ('N58133', 0.34969751968448914), ('N10414', 0.34912880116767203), ('N45794', 0.3192042228644848), ('N19347', 0.30008150459066246), ('N31801', 0.2818796265077977), ('N63302', 0.274451216821001), ('N28910', 0.27282505503045756), ('N42782', 0.26067007578113904), ('N55161', 0.2576529158009171), ('N47415', 0.23778911423579085), ('N17741', 0.22835923317606924), ('N18445', 0.2272258619248637), ('N59426', 0.20883883060110842), ('N31755', 0.19467886264151046), ('N25821', 0.18402837659498872), ('N42154', 0.1668482861900963), ('N34694', 0.1663599830420959), ('N628', 0.16584849515525646)]
Similarity for clicked items:

N28910 0.27282505503045756
N55689 0.36281955783195496
N58133 0.34969751968448914
Similarity for history items:

N10414 0.34912880116767203
N19347 0.30008150459066246
N34694 0.1663599830420959
N3180

### Old code

In [6]:
#Find category and subcategory of an article from an articleID
def article_tags(articleID):
    tags = []
    article = news.loc[news["News ID"] == articleID]
    tags.append(article["Category"].values[0])
    tags.append(article["Subcategory"].values[0])
    return tags[0], tags[1]

def add_feature_to_xi(x_i, feature, weight):
    if feature in x_i:
        x_i[feature] += weight
    else:
        x_i[feature] = weight

#Create user vector from a userID
def create_x_i(userID):
    x_i = {}

    # Weights
    w_cat = 0.3  
    w_subcat = 0.8
    w_history = 0.5
    w_impression = 1.0

    article_count = 0

    # Add categories and subcategories
    for index, row in behavior.iterrows():
        if row["User ID"] == userID:
            # Adding features from articles in history
            history_count = 0
            # Assuming history starting with oldest click - must be checked, if not, remove reverse operator
            history = row["History"].split(" ")
            history.reverse()
            for articleID in history:
                history_count += 1
                article_count += 1

                category, subcategory = article_tags(articleID)
                add_feature_to_xi(x_i, category, w_cat*w_history/history_count)
                add_feature_to_xi(x_i, subcategory, w_subcat*w_history/history_count)

                # TODO Add entities from articles in history here. We are already iterating through all articles in a users history, need method for extracting entities
                # entities = find_entities(articleID)   -   to be implemented
                # for entity in entities:
                #   add_feature_to_xi(x_i, entity, w_entity*w_history)
    
            # Adding features from articles in impressions
            for impression in row["Impressions"].split(" "):
                if impression.split("-")[1] == "1":
                    article_count += 1

                    category, subcategory = article_tags(impression.split("-")[0])
                    add_feature_to_xi(x_i, category, w_cat*w_impression)
                    add_feature_to_xi(x_i, subcategory, w_subcat*w_impression)

                    # TODO Add entities from articles in impressions here. We are already iterating through all articles in a users impressions, need method for extracting entities
                    # entities = find_entities(articleID)   -   to be implemented
                    # for entity in entities:
                    #   add_feature_to_xi(x_i, entity, w_entity*w_history)

    
    # Normalizing
    for trait, score in x_i.items():
        x_i[trait] = score/article_count

    return x_i

exID = behavior["User ID"].values[1]

x_i = create_x_i(exID)
print(x_i)

{'lifestyle': 0.010714285714285714, 'lifestyleroyals': 0.028571428571428574, 'health': 0.005357142857142857, 'fitness': 0.014285714285714287, 'finance': 0.008027597402597402, 'finance-companies': 0.021406926406926405, 'weather': 0.0026785714285714286, 'weathertopstories': 0.0071428571428571435, 'news': 0.004871794871794871, 'newscrime': 0.0047619047619047615, 'music': 0.001530612244897959, 'music-celebrity': 0.004081632653061225, 'newsus': 0.005372405372405373, 'newsscienceandtechnology': 0.002857142857142857, 'travel': 0.0008928571428571428, 'travelnews': 0.0023809523809523807, 'sports': 0.02142857142857143, 'football_nfl': 0.05714285714285715}


In [3]:
def create_all_x_j():
    all_x_j = []
    for index, row in news.iterrows():
        x_j = {}
        x_j["News ID"] = row["News ID"]
        x_j[row["Category"]] = 1.0
        x_j[row["Subcategory"]] = 1.0
        all_x_j.append(x_j)

        # TODO add entities to item vector x_j

    return all_x_j

X_j = create_all_x_j()
print(X_j[1])

{'News ID': 'N18955', 'health': 1.0, 'medical': 1.0}


In [8]:
def score(userID):
    x_i = create_x_i(userID)
    X_j = create_all_x_j()

    scores = {}

    for x_j in X_j:
        for c, s in x_j.items():
            article_score = 0
            if c in x_i.keys():
                article_score += s * x_i[c]
        scores[x_j["News ID"]] = article_score
    
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

print(score(exID))

[('N2073', 0.05714285714285715), ('N16587', 0.05714285714285715), ('N29120', 0.05714285714285715), ('N64723', 0.05714285714285715), ('N27190', 0.05714285714285715), ('N9035', 0.05714285714285715), ('N41277', 0.05714285714285715), ('N42921', 0.05714285714285715), ('N19888', 0.05714285714285715), ('N27334', 0.05714285714285715)]
