# IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import nltk
import gensim
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stemmer=PorterStemmer()
from gensim import corpora, models
import re
stemmed_tokens=[]

# IMPORT DATA

In [2]:
df=pd.read_csv('news_articles.csv')
df=df[['Article_Id','Title','Content']].dropna()
doc_set=list(df['Content'])

# DATA PRE-PROCESSING

In [3]:
for i in range(4831):
    raw=doc_set[i].lower()
    raw=re.sub(r'\d+', '', raw)
    word_tokens = tokenizer.tokenize(raw)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    lemmatised_words=[lemmatizer.lemmatize(w) for w in filtered_sentence]
    stemmed_words=[stemmer.stem(i) for i in lemmatised_words]
    stemmed_tokens.append(stemmed_words)   

In [4]:
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

# LDA MODEL

In [5]:
dictionary=corpora.Dictionary(stemmed_tokens)
doc_term_matrix=[dictionary.doc2bow(d) for d in stemmed_tokens]

In [6]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=20,minimum_probability=0.0)

In [7]:
#ldamodel=gensim.models.ldamodel.LdaModel(doc_term_matrix,num_topics=5,id2word=dictionary,passes=20,minimum_probability=0.0)

In [8]:
item=ldamodel[doc_term_matrix[1]]
item

[(0, 0.5316874),
 (1, 0.0017189463),
 (2, 0.0017448085),
 (3, 0.0017009544),
 (4, 0.4631479)]

In [9]:
topics=[]
for i in range(4831):
    item=ldamodel[doc_term_matrix[i]]
    item=np.array(item).T
    item=item[1]
    topics.append(item) 

np.shape(topics)


(4831, 5)

In [10]:
def get_lda_topics(model,num_topics):
    word_dict={};
    for i in range(num_topics):
        words=model.show_topic(i,topn=10);
        word_dict['Topic#' + '{:02d}'.format(i+1)]=[i[0] for i in words]
    return pd.DataFrame(word_dict);

In [11]:
get_lda_topics(ldamodel,5)

Unnamed: 0,Topic#01,Topic#02,Topic#03,Topic#04,Topic#05
0,film,india,india,devic,said
1,movi,olymp,said,featur,polic
2,khan,game,year,android,report
3,salman,rio,r,smartphon,attack
4,actor,world,govern,step,state
5,releas,indian,price,note,also
6,also,one,compani,new,peopl
7,star,win,market,instal,isi
8,day,team,modi,gb,kill
9,actress,event,crore,phone,year


In [12]:
topics=np.array(topics)
np.shape(topics)

(4831, 5)

# K-Means CLUSTERING

In [13]:
import sklearn
from sklearn.cluster import KMeans
kmeans=sklearn.cluster.KMeans(5).fit(topics)
clusters=kmeans.labels_

In [14]:
clusters

array([1, 0, 1, ..., 3, 3, 3])

# User Profile

In [15]:
df=pd.read_csv('news_articles.csv')
df=df[['Article_Id','Title','Content']].dropna()
content=list(df['Content'])
title=list(df["Title"])

length_article=([len(x) for x in df['Content']] ) #length of articles
user_speed=5
expected_time=[int((x/5)) for x in length_article]# expected time

possion = np.random.poisson(5, 4831)/10 

actual_time=expected_time*possion # 
possion        

array([1.1, 0.2, 0.5, ..., 0.3, 0.2, 0.8])

In [16]:
expected_time1=np.array(expected_time)
ratio=actual_time.reshape(4831,1)/expected_time1.reshape(4831,1)
ratio=ratio.reshape(4831)
click_through= np.random.binomial(1, .7, 4831).reshape(4831)

In [17]:
topics=np.array(topics)
topics=topics.reshape(4831,5)
Topic=pd.DataFrame(topics)

In [18]:
data=pd.DataFrame({"Title":title,"ratio":ratio,"click":click_through,"estimated":expected_time,"actual":actual_time})

In [19]:
weights=pd.DataFrame({"Weights":(data["click"]*data["ratio"])})
new_data=pd.concat([Topic,weights],axis=1)
#weights=np.array(weights).reshape(4831,1)

In [20]:
user_prof=pd.DataFrame()
n=1
for j in range(n):# number of users
    for i in range(5):
        user=(new_data[clusters==i][j:1+j])
        user_prof=user_prof.append(user)   


In [21]:
user_prof

Unnamed: 0,0,1,2,3,4,Weights
1,0.531688,0.001719,0.001745,0.001701,0.463147,0.2
0,0.001621,0.001593,0.038464,0.00159,0.956732,1.1
6,0.009832,0.019433,0.919275,0.050422,0.001038,0.8
870,0.001477,0.00146,0.001464,0.994143,0.001455,1.0
87,0.254641,0.625318,0.040014,0.040014,0.040014,0.3


In [22]:
for i in range(5):
    user_prof[i]=user_prof["Weights"]*user_prof[i]  

In [23]:
user_prof=user_prof.drop("Weights",axis=1)


In [24]:
user_prof=np.array(user_prof)
user_prof

array([[1.06337583e-01, 3.43789277e-04, 3.48953088e-04, 3.40190926e-04,
        9.26294804e-02],
       [1.78346663e-03, 1.75195184e-03, 4.23103616e-02, 1.74901985e-03,
        1.05240524e+00],
       [7.86532685e-03, 1.55466959e-02, 7.35419989e-01, 4.03379619e-02,
        8.30002595e-04],
       [1.47745607e-03, 1.45954441e-03, 1.46418239e-03, 9.94143486e-01,
        1.45529676e-03],
       [7.63923436e-02, 1.87595344e-01, 1.20040990e-02, 1.20041091e-02,
        1.20041069e-02]])

In [25]:
user_prof=np.array(user_prof)
user_prof=user_prof.reshape(n,5,5)
new_user=np.sum(user_prof,axis=1)

In [26]:
np.shape(new_user)

(1, 5)

In [27]:
#new_user=user_prof.sum(axis=0)

In [28]:
#new_user=np.array(new_user).reshape(1,5)

In [29]:
new_user

array([[0.19385618, 0.20669733, 0.79154758, 1.04857477, 1.15932412]])

# Cosine similarity

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
similarity=cosine_similarity(new_user,topics)

In [32]:
np.shape(similarity)

(1, 4831)

In [33]:
similarity

array([[0.67190864, 0.51418251, 0.73603957, ..., 0.59223338, 0.59210953,
        0.62546805]])

In [34]:
#similarity=np.array(similarity).reshape(-1,1)

In [35]:
#np.shape(similarity)

In [36]:
#similarity=(similarity).flatten()

In [37]:
top=np.argsort(similarity,axis=1)
np.shape(top)

(1, 4831)

In [38]:
print(top)

[[ 574   53 1903 ... 4663 1253 1836]]


In [39]:
top=top[:,-5:]
top
np.shape(top)

(1, 5)

In [40]:
new_recom=[]
for j in range(n):
    for i in range(5):
        unu=title[top[j][i]]
        new_recom.append(unu)   
 
    

In [41]:
new_recom

['4chan founder Christopher Poole joins Google',
 'All mobile phones in India to feature SOS button from March 2016  Govt',
 'Uber Ride Gets Bumpier With Ban in New Delhi  Death of Ride Share Model in India ',
 'Impossible to unlock new iPhones  Apple tells US court',
 'Godrej HIT Announces Nationwide Awareness Campaign on Dengue  Malaria']