# News Recommender System with Content-based filtering

## 1. Get the data from sqlite database

### 1.1 Import the packages needed to connect to the database

In [24]:
import numpy as np
import pandas as pd
import sqlite3

# Open the SQLite file and list the names of the tables it contains
connection_db = sqlite3.connect('db.sqlite3')
cursor = connection_db.cursor()
table_rows = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table';"
).fetchall()
print(table_rows)

[('user_history',), ('article_database',), ('user_ratings',), ('sim_user_results',), ('rec_results',)]


### 1.2 Select the `article_database` table and preview the data

In [20]:
# Load every row of article_database into a DataFrame indexed by articleID,
# then preview the first ten articles
article_df = pd.read_sql_query(
    sql="SELECT * FROM article_database",
    con=connection_db,
    index_col="articleID",
)
data = article_df
data.head(10)

Unnamed: 0_level_0,source,author,title,description,url,date
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Huffington Post,Melissa Jeltsen,There Were 2 Mass Shootings In Texas Last Week...,She left her husband. He killed their children...,https://www.huffingtonpost.com/entry/texas-ama...,5/26/18
2,Huffington Post,Andy McDonald,Will Smith Joins Diplo And Nicky Jam For The 2...,Of course it has a song.,https://www.huffingtonpost.com/entry/will-smit...,5/26/18
3,Huffington Post,Ron Dicker,Hugh Grant Marries For The First Time At Age 57,The actor and his longtime girlfriend Anna Ebe...,https://www.huffingtonpost.com/entry/hugh-gran...,5/26/18
4,Huffington Post,Ron Dicker,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,The actor gives Dems an ass-kicking for not fi...,https://www.huffingtonpost.com/entry/jim-carre...,5/26/18
5,Huffington Post,Ron Dicker,Julianna Margulies Uses Donald Trump Poop Bags...,"The ""Dietland"" actress said using the bags is ...",https://www.huffingtonpost.com/entry/julianna-...,5/26/18
6,Huffington Post,Ron Dicker,Morgan Freeman 'Devastated' That Sexual Harass...,"""It is not right to equate horrific incidents ...",https://www.huffingtonpost.com/entry/morgan-fr...,5/26/18
7,Huffington Post,Ron Dicker,Donald Trump Is Lovin' New McDonald's Jingle I...,"It's catchy, all right.",https://www.huffingtonpost.com/entry/donald-tr...,5/26/18
8,Huffington Post,Todd Van Luling,What To Watch On Amazon Prime That’s New This ...,There's a great mini-series joining this week.,https://www.huffingtonpost.com/entry/amazon-pr...,5/26/18
9,Huffington Post,Andy McDonald,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,"Myer's kids may be pushing for a new ""Powers"" ...",https://www.huffingtonpost.com/entry/mike-myer...,5/26/18
10,Huffington Post,Todd Van Luling,What To Watch On Hulu That’s New This Week,You're getting a recent Academy Award-winning ...,https://www.huffingtonpost.com/entry/hulu-what...,5/26/18


## 2. Build the recommender system

### 2.1 Import the relevant packages

In [25]:
# import relevant package
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import multiprocessing

# Concatenate the title and description into a single 'text' field
data = data.fillna(" ")
data['text'] = data['title'] + " " + data['description']

# Check that every text and url actually has a value. The fillna(" ") above
# turns missing entries into blank strings, so a plain isnull() test can no
# longer flag them; a value that is empty after stripping whitespace is
# therefore treated as missing too. Each line prints True if anything is missing.
text_missing = data["text"].isnull() | data["text"].str.strip().eq("")
url_missing = data["url"].isnull() | data["url"].str.strip().eq("")
print(text_missing.values.any())
print(url_missing.values.any())

False
False


### 2.2 Define the `Recommendation_System` class

In [16]:
class Recommendation_System():
    """Content-based news recommender using TF-IDF-weighted Word2Vec vectors.

    Call ``fit`` with a DataFrame that has ``title``, ``url`` and ``text``
    columns, then ``get_recommendation`` with one of the fitted titles.
    """

    # English stop words, loaded from NLTK on first use and then shared, so the
    # corpus file is not re-read for every document.
    _stop_words = None

    def text_preprocessing_function(self, text):
        """Turn one raw string into a list of cleaned tokens.

        The text is lower-cased and split on runs of word characters (which
        drops punctuation). Stop words, purely numeric tokens and tokens of
        two characters or fewer are discarded.

        Parameters
        ----------
        text : str
            Raw document text.

        Returns
        -------
        list of str
            The surviving tokens, in their original order.
        """
        if Recommendation_System._stop_words is None:
            Recommendation_System._stop_words = frozenset(stopwords.words("english"))
        stop_words = Recommendation_System._stop_words

        tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
        return [
            token for token in tokens
            if token not in stop_words
            and not token.isdigit()
            and len(token) > 2
        ]

    def text_preprocessing(self, text):
        """Clean every document in ``text``.

        Parameters
        ----------
        text : iterable of str
            Raw documents (for example a pandas Series).

        Returns
        -------
        numpy.ndarray
            Object array holding, for each document, its cleaned tokens joined
            by single spaces. The input is not modified.
        """
        cleaned = [" ".join(self.text_preprocessing_function(t)) for t in text]
        return np.array(cleaned, dtype=object)

    def get_tfidf_matrix(self, text):
        """Fit a TF-IDF vectoriser on ``text``.

        Returns
        -------
        tuple
            ``(tfidf_matrix, tfidf_words)``: the dense document-by-term weight
            array and the array of vocabulary terms, where
            ``tfidf_matrix[:, j]`` holds the weights of ``tfidf_words[j]``.
        """
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform(text).toarray()
        tfidf_words = np.array(tfidf.get_feature_names_out())
        return tfidf_matrix, tfidf_words

    def train_w2v(self, w2v_df):
        """Train a skip-gram (``sg=1``) Word2Vec model on tokenised documents.

        Parameters
        ----------
        w2v_df : list of list of str
            One token list per document.

        Returns
        -------
        gensim.models.Word2Vec
            The trained model.
        """
        # cpu_count() - 1 evaluates to 0 on a single-core machine; always keep
        # at least one worker thread.
        workers = max(1, multiprocessing.cpu_count() - 1)
        w2v_model = Word2Vec(min_count=3,
                             window=4,
                             vector_size=300,
                             alpha=0.03,
                             min_alpha=0.0007,
                             sg=1,
                             workers=workers)

        w2v_model.build_vocab(w2v_df, progress_per=10000)
        w2v_model.train(w2v_df, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)
        return w2v_model

    def fit(self, data):
        """Build the article-to-article similarity matrix.

        Each document becomes the TF-IDF-weighted average of the Word2Vec
        vectors of its tokens; ``self.similarity_matrix`` then holds the
        pairwise cosine similarity of those document vectors, with rows and
        columns in the row order of ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a ``text`` column; ``title`` and ``url`` are read
            later by ``get_recommendation``.
        """
        self.data = data

        # Text preprocessing: tokenise each document once and reuse the tokens
        # for both TF-IDF (joined back into strings) and Word2Vec.
        w2v_df = [self.text_preprocessing_function(t) for t in self.data['text']]
        text = pd.Series([" ".join(tokens) for tokens in w2v_df])

        # TF-IDF (fitted once; both results come from the same call)
        tfidf_matrix, tfidf_words = self.get_tfidf_matrix(text)
        # term -> column index, so each weight lookup is a dict access instead
        # of a scan over the whole vocabulary
        tfidf_column = {word: col for col, word in enumerate(tfidf_words)}

        # Word2Vec
        w2v_model = self.train_w2v(w2v_df)
        word_vectors = w2v_model.wv
        w2v_vocab = word_vectors.key_to_index

        # Combination: TF-IDF-weighted mean of the word vectors per document.
        # The width comes from the trained model rather than a second literal.
        tfidf_sent_vectors = np.zeros((len(w2v_df), word_vectors.vector_size))
        for i, sent in enumerate(w2v_df):
            weight_sum = 0.0
            for word in sent:
                col = tfidf_column.get(word)
                # Skip words unknown to either model (e.g. below min_count).
                if col is None or word not in w2v_vocab:
                    continue
                tf_idf = tfidf_matrix[i, col]
                tfidf_sent_vectors[i] += word_vectors[word] * tf_idf
                weight_sum += tf_idf
            if weight_sum:
                tfidf_sent_vectors[i] /= weight_sum

        # Cosine Similarity
        self.similarity_matrix = cosine_similarity(tfidf_sent_vectors)

    def get_recommendation(self, text_input, n_recommendations = 10):
        """Return the articles most similar to the one titled ``text_input``.

        The query article is located by its row position in the fitted data,
        so the result does not depend on how the index (articleID) is
        numbered. If several rows share the title, the first one is used.

        Parameters
        ----------
        text_input : str
            Title of an article present in the fitted data.
        n_recommendations : int, default 10
            Maximum number of articles to return.

        Returns
        -------
        pandas.DataFrame
            ``title`` and ``url`` of the recommended articles, most similar
            first, never including the query row itself.

        Raises
        ------
        KeyError
            If no fitted article has the title ``text_input``.
        """
        matches = np.flatnonzero((self.data['title'] == text_input).to_numpy())
        if matches.size == 0:
            raise KeyError(text_input)
        position = int(matches[0])

        scores = np.asarray(self.similarity_matrix[position])
        # Stable sort keeps the original row order among equal scores.
        ranked = np.argsort(-scores, kind="stable")
        # Drop the query row explicitly instead of assuming it sorts first.
        ranked = ranked[ranked != position][:n_recommendations]

        news_recommendation = self.data.iloc[ranked][['title', 'url']]
        return news_recommendation

### 2.3 Train the Recommendation System

In [17]:
# Fit the recommender on the first 10,000 articles
training_articles = data.iloc[:10000]
RS = Recommendation_System()
RS.fit(training_articles)

### 2.4 Input the news you are interested in and the number of recommendations

In [46]:
# Pick the article to base the recommendations on (index label 1, i.e. articleID 1)
n_recommend = 20
selected_news = data.loc[1, "title"]
print('The selected_news is: "{}" '.format(selected_news))

# Ask the fitted recommender for the closest articles and display them
result = RS.get_recommendation(selected_news, n_recommendations=n_recommend)
result

The selected_news is: "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV" 


Unnamed: 0_level_0,title,url
articleID,Unnamed: 1_level_1,Unnamed: 2_level_1
5127,Why Schools Are Flooded With Threats After Mas...,https://www.huffingtonpost.com/entry/schools-t...
2977,Vegas Golden Knights Retire Jersey No. 58 To H...,https://www.huffingtonpost.com/entry/las-vegas...
5560,Sen. Chris Murphy: Mass Shootings Are Congress...,https://www.huffingtonpost.com/entry/chris-mur...
5602,Here’s What You Need To Know About The Weapons...,https://www.huffingtonpost.com/entry/ar-15-sty...
743,7 Killed In What May Be Australia's Worst Mass...,https://www.huffingtonpost.com/entry/7-dead-au...
4136,Her Husband Killed 49 People In Orlando. Now S...,https://www.huffingtonpost.com/entry/noor-salm...
3989,"7,000 Pairs Of Shoes On Capitol Lawn Are Power...",https://www.huffingtonpost.com/entry/shoes-cap...
5146,Michelle Obama Tells Students To Keep Fighting...,https://www.huffingtonpost.com/entry/michelle-...
5255,James Corden Makes Powerful Call For Gun Contr...,https://www.huffingtonpost.com/entry/james-cor...
3457,73 Teens Shot To Death In The 37 Days Since Th...,https://www.huffingtonpost.com/entry/teens-kil...
