# IMPORT LIBRARIES

In [406]:
import numpy as np
import pandas as pd
import nltk
import gensim
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stemmer=PorterStemmer()
from gensim import corpora, models
import re
stemmed_tokens=[]

# IMPORT DATA

In [407]:
df=pd.read_csv('news_articles.csv')
df=df[['Article_Id','Title','Content']].dropna()
doc_set=list(df['Content'])

# DATA PRE-PROCESSING

In [408]:
for i in range(4831):
    raw=doc_set[i].lower()
    raw=re.sub(r'\d+', '', raw)
    word_tokens = tokenizer.tokenize(raw)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    lemmatised_words=[lemmatizer.lemmatize(w) for w in filtered_sentence]
    stemmed_words=[stemmer.stem(i) for i in lemmatised_words]
    stemmed_tokens.append(stemmed_words)   

In [409]:
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

# LDA MODEL

In [410]:
dictionary=corpora.Dictionary(stemmed_tokens)
doc_term_matrix=[dictionary.doc2bow(d) for d in stemmed_tokens]

In [411]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=20,minimum_probability=0.0)

In [412]:
#ldamodel=gensim.models.ldamodel.LdaModel(doc_term_matrix,num_topics=5,id2word=dictionary,passes=20,minimum_probability=0.0)

In [413]:
item=ldamodel[doc_term_matrix[1]]
item

[(0, 0.02867984),
 (1, 0.6476206),
 (2, 0.0017051487),
 (3, 0.32026228),
 (4, 0.0017320783)]

In [414]:
topics=[]
for i in range(4831):
    item=ldamodel[doc_term_matrix[i]]
    item=np.array(item).T
    item=item[1]
    topics.append(item) 

np.shape(topics)


(4831, 5)

In [415]:
def get_lda_topics(model,num_topics):
    word_dict={};
    for i in range(num_topics):
        words=model.show_topic(i,topn=10);
        word_dict['Topic#' + '{:02d}'.format(i+1)]=[i[0] for i in words]
    return pd.DataFrame(word_dict);

In [416]:
get_lda_topics(ldamodel,5)

Unnamed: 0,Topic#01,Topic#02,Topic#03,Topic#04,Topic#05
0,india,said,devic,film,india
1,year,report,featur,movi,olymp
2,said,polic,android,khan,game
3,r,state,smartphon,salman,world
4,price,also,step,actor,one
5,govern,minist,note,releas,rio
6,compani,attack,instal,also,indian
7,market,peopl,gb,star,win
8,crore,govern,new,day,year
9,bank,india,phone,actress,team


In [417]:
topics=np.array(topics)
np.shape(topics)

(4831, 5)

# K-Means CLUSTERING

In [418]:
import sklearn
from sklearn.cluster import KMeans
kmeans=sklearn.cluster.KMeans(5).fit(topics)
clusters=kmeans.labels_

In [419]:
clusters

array([0, 0, 0, ..., 1, 1, 1])

# User Profile

In [420]:
df=pd.read_csv('news_articles.csv')
df=df[['Article_Id','Title','Content']].dropna()
content=list(df['Content'])
title=list(df["Title"])

length_article=([len(x) for x in df['Content']] ) #length of articles
user_speed=5
expected_time=[int((x/5)) for x in length_article]# expected time

possion = np.random.poisson(5, 4831)/10 

actual_time=expected_time*possion # 
possion        

array([0.5, 0.3, 0.5, ..., 0.5, 0.7, 0.3])

In [421]:
expected_time1=np.array(expected_time)
ratio=actual_time.reshape(4831,1)/expected_time1.reshape(4831,1)
ratio=ratio.reshape(4831)
click_through= np.random.binomial(1, .7, 4831).reshape(4831)

In [422]:
topics=np.array(topics)
topics=topics.reshape(4831,5)
Topic=pd.DataFrame(topics)

In [423]:
data=pd.DataFrame({"Title":title,"ratio":ratio,"click":click_through,"estimated":expected_time,"actual":actual_time})

In [424]:
weights=pd.DataFrame({"Weights":(data["click"]*data["ratio"])})
new_data=pd.concat([Topic,weights],axis=1)
#weights=np.array(weights).reshape(4831,1)

In [425]:
user_prof=pd.DataFrame()
for j in range(4):
    for i in range(5):
        user=(new_data[clusters==i][:j])
        user_prof=user_prof.append(user)   


In [426]:
user_prof

Unnamed: 0,0,1,2,3,4,Weights
0,0.035946,0.959247,0.001589,0.001617,0.0016,0.5
388,0.106103,0.106103,0.575447,0.106151,0.106196,0.4
6,0.9578,0.001038,0.001051,0.011293,0.028819,0.2
27,0.002333,0.249755,0.043607,0.616855,0.087449,0.0
249,0.000875,0.000859,0.000858,0.224441,0.772967,0.8
0,0.035946,0.959247,0.001589,0.001617,0.0016,0.5
1,0.028683,0.647619,0.001705,0.320261,0.001732,0.3
388,0.106103,0.106103,0.575447,0.106151,0.106196,0.4
870,0.001467,0.001456,0.994141,0.001476,0.001459,0.0
6,0.9578,0.001038,0.001051,0.011293,0.028819,0.2


In [427]:
for i in range(5):
    user_prof[i]=user_prof["Weights"]*user_prof[i]  

In [428]:
user_prof=user_prof.drop("Weights",axis=1)

In [429]:
user_prof

Unnamed: 0,0,1,2,3,4
0,0.017973,0.479623,0.000795,0.000809,0.0008
388,0.042441,0.042441,0.230179,0.042461,0.042479
6,0.19156,0.000208,0.00021,0.002259,0.005764
27,0.0,0.0,0.0,0.0,0.0
249,0.0007,0.000687,0.000687,0.179553,0.618373
0,0.017973,0.479623,0.000795,0.000809,0.0008
1,0.008605,0.194286,0.000512,0.096078,0.00052
388,0.042441,0.042441,0.230179,0.042461,0.042479
870,0.0,0.0,0.0,0.0,0.0
6,0.19156,0.000208,0.00021,0.002259,0.005764


In [430]:
new_user=user_prof.sum(axis=0)

In [431]:
new_user=np.array(new_user).reshape(1,5)

In [432]:
new_user

array([[2.05400611, 2.13304939, 1.17033128, 1.79877269, 3.34384042]])

# Cosine similarity

In [433]:
from sklearn.metrics.pairwise import cosine_similarity

In [434]:
similarity=cosine_similarity(new_user,topics)
topics

array([[0.03594604, 0.95924693, 0.00158927, 0.00161741, 0.00160035],
       [0.02868252, 0.64761883, 0.00170515, 0.32026142, 0.00173207],
       [0.0014611 , 0.83627737, 0.00144919, 0.00145057, 0.15936181],
       ...,
       [0.05873723, 0.00110028, 0.93796504, 0.00109831, 0.00109914],
       [0.01601425, 0.0010091 , 0.98096669, 0.00100374, 0.00100626],
       [0.03374894, 0.00172708, 0.7209729 , 0.06950273, 0.17404833]])

In [435]:
similarity=np.array(similarity).reshape(-1,1)

In [436]:
similarity=(similarity).flatten()

In [437]:
top=np.argsort(similarity)

In [438]:
top=top[-5:]

In [439]:
top

array([3391, 1023, 1647, 1323, 1876], dtype=int32)

# New Recommendations

In [440]:
new_recom=[]
for i in range(5):
    unu=title[top[i]]
    new_recom.append(unu)    

In [441]:
unu

'Net Neutrality  TRAI Receives over 2 Lakh Mails after AIB s  Save the Internet  Campaign Goes Viral'

In [442]:
new_recom

['Sri Lanka may be LCA Tejas  first export customer  Report',
 'After The Pirate Bay  KickassTorrents  KAT  goes offline  Here s 6 best alternate BitTorrent sites',
 'Sania Mirza disinvited by MP govt after she demands make-up kit worth Rs 75 000',
 'Google wants to represent gender equality through emoji',
 'Net Neutrality  TRAI Receives over 2 Lakh Mails after AIB s  Save the Internet  Campaign Goes Viral']