# News Recommender System

This a Google Colab for our project for the AI Course at UCU, 2021.

**Authors**: Dmytro Lopushanskyy, Volodymyr Savchuk.

The report for this project will be attached separately on CMS.

Here is a list of materials that helped us create this project:

* [MIND Data set](https://msnews.github.io/)
* [Build Recommendation Engine](https://realpython.com/build-recommendation-engine-collaborative-filtering/)
* [Recommender Systems in Python](https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101#Recommender-Systems-in-Python-101)
* [MIND Recommendation Notebook](https://www.kaggle.com/accountstatus/mind-microsoft-news-recommendation-v2/notebook#Text-Preprocessing)
* [Evaluating Recommender Systems](http://fastml.com/evaluating-recommender-systems/)

## Imports

In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [4]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/vozak16/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vozak16/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vozak16/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading Data

In [40]:
filtered_behaviors = pd.read_csv('files/filtered_behaviours.csv', sep='\t')

filtered_articles = pd.read_csv('files/filtered_articles.csv', sep='\t')

behaviours_train_indexed_df = pd.read_csv('files/train_filtered_behaviours.csv', sep='\t')
behaviours_test_indexed_df = pd.read_csv('files/test_filtered_behaviours.csv', sep='\t')

## Text Preprocessing

In [6]:
# This function is to remove stopwords from a particular column and to tokenize it
def rem_stopwords_tokenize(data,name):
      
    def getting(sen):
        example_sent = sen

        stop_words = set(stopwords.words('english')) 

        word_tokens = word_tokenize(example_sent) 

        filtered_sentence = [w for w in word_tokens if not w in stop_words] 

        filtered_sentence = [] 

        for w in word_tokens: 
            if w not in stop_words: 
                filtered_sentence.append(w) 
        return filtered_sentence
    x=[]
    for i in data[name].values:
        x.append(getting(i))
    data[name]=x

In [7]:
# Making a function to lemmatize all the words
lemmatizer = WordNetLemmatizer() 
def lemmatize_all(data,name):
    arr = data[name]
    a = []
    for i in arr:
        b = []
        for j in i:
            x = lemmatizer.lemmatize(j,pos='a')
            x = lemmatizer.lemmatize(x)
            b.append(x)
        a.append(b)
    data[name] = a

In [8]:
def convert_to_string(data,name):
    t=data[name].values
    p=[]
    for i in t:
        listToStr = ' '.join(map(str, i))
        p.append(listToStr)
    data[name]=p

In [9]:
# Removing Stop words from Title Column
rem_stopwords_tokenize(filtered_articles, 'Title')

# Lemmatizing the Title column
lemmatize_all(filtered_articles, 'Title')

# Back to string
convert_to_string(filtered_articles, 'Title')

In [10]:
# Removing Stop words from Abstract Column
rem_stopwords_tokenize(filtered_articles, 'Abstract')

# Lemmatizing the Abstract column
lemmatize_all(filtered_articles, 'Abstract')

# Back to string
convert_to_string(filtered_articles, 'Abstract')

## Content-Based Filtering model

Create a vectorizer on behaviours train dataset

In [20]:
# Ignoring stopwords (words with no semantics) from English
stopwords_list = stopwords.words('english')

# Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

item_ids = filtered_articles['NewsID'].tolist()
tfidf_matrix = vectorizer.fit_transform(filtered_articles['Title'] + "" + filtered_articles['Abstract'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix


<39726x1596 sparse matrix of type '<class 'numpy.float64'>'
	with 586288 stored elements in Compressed Sparse Row format>

In [21]:
def get_item_profile(item_id):
    # get a single item feature list by ID
    try:
        idx = item_ids.index(item_id)
    except:
        return None
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    # get item vector given all item IDs that user clicked
    item_profiles_list = [get_item_profile(x) for x in ids if x]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_df):
    # build feature vector for a single user

    # aggregate all news a user has clicked
    interactions_person_df = interactions_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df['All_History'])

    if user_item_profiles.shape[1] == 0:
        return None
    user_item_strengths = np.array([1] * user_item_profiles.shape[1])
    # Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm


def build_users_profiles(): 
    # build a global martix of features for all users on TRAIN data set

    user_profiles = {}
    for person_id in history_train_indexed_df.index.unique():
        user_profile = build_users_profile(person_id, history_train_indexed_df)
        if user_profile is not None:
            user_profiles[person_id] = user_profile
        else:
            print(f"No data for user {person_id}")
    return user_profiles

In [22]:
user_profiles = build_users_profiles()

In [23]:
len(user_profiles)

39718

In [29]:
my_user_id = 'U1014'

myprofile = user_profiles[my_user_id]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[my_user_id].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 1596)


Unnamed: 0,token,relevance
0,car,0.300643
1,ford,0.231713
2,look,0.166949
3,celebrity,0.162159
4,sold,0.147832
5,year,0.14077
6,speed,0.132169
7,star,0.130971
8,home,0.129266
9,halloween,0.116225


In [30]:
class ContentBasedRecommender:
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        # Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        # Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        # Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, ignore_interacted=True, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id, topn)
        # Ignores items the user has already interacted
        if ignore_interacted:
            similar_items = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items, columns=['NewsID', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(filtered_articles)

### Recommendations for a single user

In [31]:
recs = content_based_recommender_model.recommend_items(my_user_id)
recs.set_index('NewsID')
pd.merge(recs, filtered_articles)

Unnamed: 0.1,NewsID,recStrength,Unnamed: 0,Category,SubCategory,Title,Abstract
0,N1375,0.354892,35773,autos,autosnews,Those classic Shelby race car 'Ford v Ferrari ...,But n't call `` replica '' `` kit car . ''
1,N58544,0.317736,21382,video,science,"Tiny Electric Car Cost $ 420,000 To Build",A MAN claim created car might solve world 's t...
2,N1942,0.30493,28957,autos,autossports,2020 Ford Mustang Shelby GT500 Is 760-HP Thril...,Ford 's ultimate pony car finely honed blast r...
3,N35346,0.302405,31031,autos,autosenthusiasts,This 1965 Ford Mustang Fastback On A Ford Bron...,The Frankensteinian result may pinnacle four-w...
4,N3345,0.300671,21047,autos,autosclassics,Man Reveals Big American Muscle Car Barn Find,These classic car definitely showroom condition .
5,N25845,0.300362,12264,news,newsus,"1 killed , 3 seriously hurt racing car crash u...",One person killed three others seriously injur...
6,N42208,0.289972,10898,autos,autosclassics,U.S . Marshals Auctioning Off 149 Vehicles Fro...,The large single car collection USMS ever sold !
7,N37859,0.288046,34764,autos,autosclassics,Icon 's electric 1949 Mercury V8-powered 1949 ...,These no-expense-spared custom car California ...
8,N15719,0.285456,35746,autos,autosenthusiasts,2020 Ford Mustang Shelby GT350 vs. GT500 : Whi...,"Is $ 12,460 costly GT500 good sport car alread..."
9,N44712,0.285223,34756,news,newscrime,"Car , evidence seized fatal hit-and-run invest...",A 57-year-old Methuen man struck killed car ri...


## Evaluation Class

In [32]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the news information.
    interacted_items = interactions_df.loc[person_id]['NewsID']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [33]:
filtered_articles.shape

(39726, 6)

In [34]:
# Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, behaviours_full_indexed_df)
        all_items = set(filtered_articles['NewsID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):      
        try:
            index = next(i for i, c in enumerate(recommended_items) if c == item_id)
        except Exception as e:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        # Getting the items in test set
        interacted_values_testset = behaviours_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['NewsID']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['NewsID'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['NewsID']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        # Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(
            person_id, 
            items_to_ignore=get_items_interacted(person_id, behaviours_train_indexed_df), 
            topn=10000000000, ignore_interacted=False)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # Getting a random sample (100) items the user has not interacted 
            # (to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=random.randint(0, 2**32))

            # Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['NewsID'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['NewsID'].values
            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count, 
                          'hits@10_count': hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(behaviours_test_indexed_df.index.unique().values)):
            if idx % 100 == 0 and idx > 0:
               print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    

Reduce the test data set to 3000 elements to reduce computation time

In [35]:
behaviours_test_indexed_df_full = behaviours_test_indexed_df.copy()
behaviours_test_indexed_df = behaviours_test_indexed_df[:3000]

In [36]:
len(set(behaviours_test_indexed_df.index.values.tolist()))
behaviours_test_indexed_df.shape

(3000, 1)

In [37]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)

Evaluating Content-Based Filtering model...
Running evaluation for users
100 users processed
200 users processed
300 users processed
400 users processed
500 users processed
600 users processed
700 users processed
800 users processed
900 users processed
1000 users processed
1100 users processed
1200 users processed
1300 users processed
1400 users processed
1500 users processed
1600 users processed
1700 users processed
1800 users processed
1900 users processed
2000 users processed
2100 users processed
2200 users processed
2300 users processed
2400 users processed
2500 users processed
2600 users processed
2700 users processed
2800 users processed
2839 users processed


In [38]:
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.sort_values('recall@10', ascending=False).head(20)


Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.16, 'recall@10': 0.238}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
2313,0,1,1,0.0,1.0,U863
1560,0,1,1,0.0,1.0,U57567
1687,1,1,1,1.0,1.0,U1484
1688,0,1,1,0.0,1.0,U22992
2328,1,1,1,1.0,1.0,U24722
2326,1,1,1,1.0,1.0,U82289
2323,0,1,1,0.0,1.0,U70006
2310,1,1,1,1.0,1.0,U26140
2348,1,1,1,1.0,1.0,U69737
1574,1,1,1,1.0,1.0,U54436
