In [109]:
# importing libraries and dependencies
import pandas as pd

# preprocessing
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
en_stopwords = stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer

# modeling
from sklearn.neighbors import NearestNeighbors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Data Science and Analytics/WTF Data Science and AI/Datasets/Selected Topics /shared_articles.csv', index_col=0)

In [None]:
# checking first 5 rows of the dataframe
df.head()

Unnamed: 0,url,title
0,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
1,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact..."
2,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...
3,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour
4,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc..."


In [None]:
# check dataframe information
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3122 entries, 0 to 3121
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     3122 non-null   object
 1   title   3122 non-null   object
dtypes: object(2)
memory usage: 73.2+ KB


__Data Cleaning__

In [None]:
# check if there are null values
df.isna().sum()

Unnamed: 0,0
url,0
title,0


In [None]:
# check for duplicated entries
df.duplicated().sum()

90

In [None]:
# drop duplicates and reset index
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

### Preprocessing

In [None]:
stemmer = PorterStemmer()

In [None]:
def pre_process(article):
    article = article.lower() #convert articles to lower case
    clean_article = re.sub('[^a-z1-9 ]', '', article).strip() #remove punctuations and special characters

    tokens = word_tokenize(clean_article) #tokenization
    temp = []

    for token in tokens:
        if token not in en_stopwords: # remove stopwords
            stemmed_tokens = stemmer.stem(token) #stemming
            temp.append(stemmed_tokens)

    clean_article = ' '.join(temp)
    return clean_article

In [None]:
# defining x and applying the defined function to preprocess articles
x = df.title.apply(pre_process)
x

Unnamed: 0,title
0,ethereum virtual currenc enabl transact rival ...
1,bitcoin futur gbpcoin branson win usdcoin trump
2,googl data center 36 tour
3,ibm want evolv internet blockchain technolog
4,ieee talk blockchain cloud comput oxfordcon co...
...,...
3027,conhea liga iot plataforma de inovao aberta qu...
3028,amazon take skype gotomeet chime video confere...
3029,codeorg 216 annual report
3030,jpmorgan softwar second took lawyer 36 hour


### Vectorization

In [None]:
tf_idf = TfidfVectorizer()

In [None]:
x_trans = tf_idf.fit_transform(x).toarray()

In [None]:
x_trans.shape

(3032, 5908)

### Model

In [None]:
# model training
model =  NearestNeighbors()
model.fit(x_trans)

### Recommend

In [None]:
article_to_recommend = df.title.loc[0]
article_to_recommend

"Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's"

In [None]:
pre_process(article_to_recommend)

'ethereum virtual currenc enabl transact rival bitcoin'

In [None]:
def predict(article):
  clean_article = pre_process(article)
  clean_article_trans = tf_idf.transform([clean_article])
  indices = model.kneighbors(clean_article_trans, n_neighbors= 5, return_distance=False)[0]
  for index in indices:
    print(df.title.loc[index])

In [None]:
# predicting the nearest 5 articles to the selected article
predict(article_to_recommend)

Ethereum, a Virtual Currency, Enables Transactions That Rival Bitcoin's
Microsoft Continues to Embrace Ethereum & Bitcoin - Bitcoin News
Ethereum and Bitcoin Are Market Leaders But Not Competitors
Blockchain open sources Thunder network, paving the way for instant bitcoin transactions
Bitcoin Accepted! German Energy Giant Enables Payments - CCN: Financial Bitcoin & Cryptocurrency News


__OOP Class for Recommender System__

In [None]:
class NewsRS:
    def choose_article(self, article_index):
         article = df.title.loc[article_index]
         return article

    def pre_process(self, article_index):
        article_title = df.title.loc[article_index]
        article = article_title.lower()
        clean_article = re.sub('[^a-z1-9 ]', '', article).strip()

        tokens = word_tokenize(clean_article)
        temp = []

        for token in tokens:
           if token not in en_stopwords:
                stemmed_tokens = stemmer.stem(token)
                temp.append(stemmed_tokens)

        clean_article = ' '.join(temp)
        return clean_article

    def predict(self, article_index):
        article = df.title.loc[article_index]
        clean_article = pre_process(article)
        clean_article_trans = tf_idf.transform([clean_article])
        indices = model.kneighbors(clean_article_trans, n_neighbors= 5, return_distance=False)[0]
        for index in indices:
           print(df.title.loc[index])

In [None]:
recommender = NewsRS()

In [None]:
recommender.choose_article(101)

'What Happens When You Combine Artificial Intelligence and Satellite Imagery'

In [None]:
recommender.pre_process(101)

'happen combin artifici intellig satellit imageri'

In [None]:
recommender.predict(101)

What Happens When You Combine Artificial Intelligence and Satellite Imagery
Artificial Intelligence Can Now Design Realistic Video and Game Imagery
This startup uses machine learning and satellite imagery to predict crop yields
SpaceNet satellite imagery repository launched by DigitalGlobe, CosmiQ Works and NVIDIA on AWS
Artificial Intelligence Software Is Booming. But Why Now?


In [107]:
recommender.choose_article(120)

'New blog from Dries: How should you decouple Drupal?'

In [108]:
recommender.predict(120)

New blog from Dries: How should you decouple Drupal?
New blog from Dries: A "MAP" for accelerating Drupal 8 adoption
There will be a Drupal 9, and here is why
The Risks and Rewards of Fully Decoupling Drupal | Acquia
Who sponsors Drupal development? | Dries Buytaert
