In [None]:
# Standard Libraries
import pandas as pd
import numpy as np
import json

# Data Preprocessing & NLP
import nltk
import re
import string
import gensim
from textblob import Word

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
# Fetch only the NLTK resources this notebook uses (sentence/word tokenizers,
# the English stop-word list and WordNet for lemmatization) instead of the
# complete 'all' collection, which downloads every corpus NLTK ships.
# 'punkt_tab' and 'omw-1.4' are companions required by some NLTK releases;
# nltk.download() just reports and returns False for ids it does not know.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import linear_kernel

# Performance metrics
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\ARYAN\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\A

In [None]:
# File type transformation: JSON -> CSV.
df = pd.read_json('data/arxivData.json')
# NOTE(review): `index` is a boolean flag in DataFrame.to_csv; the string 'id'
# is merely truthy, so the row index is written as an unnamed first column
# (presumably the 'Unnamed: 0' column dropped after the CSV is read back).
# `index_label='id'` or `index=False` may have been intended - confirm before
# changing, because the later drop of 'Unnamed: 0' depends on this behavior.
df.to_csv('data/arxivData.csv', index='id')

In [None]:
arxivData = pd.read_csv('data/arxivData.csv')

In [None]:
arxivData.columns

In [None]:
# Remove the columns that are irrelevant for the analysis (modifies arxivData in place).
columns_to_delete = ['Unnamed: 0', 'id', 'day', 'month']
arxivData.drop(columns=columns_to_delete, inplace=True)

In [None]:
arxivData.head()

In [None]:
from ast import literal_eval
# These columns come back from the CSV as the string form of Python literals;
# parse each cell into the actual Python object it spells out.
features = ['author', 'link', 'tag']
for feature in features:
    arxivData[feature] = arxivData[feature].map(literal_eval)

In [None]:
arxivData.head()

In [None]:
def get_names(x):
    """Return the 'name' value of at most the first three entries of ``x``.

    ``x`` is expected to be a list of mappings that each have a 'name' key.
    Any non-list input yields None.
    """
    if not isinstance(x, list):
        return None
    # Read every entry's name first (a missing key fails even past the third
    # entry), then keep no more than three.
    collected = [entry['name'] for entry in x]
    return collected[:3]

def get_link(x):
    """Return the 'href' of the first link entry in ``x``.

    ``x`` is expected to be a list (or tuple) of mappings with an 'href' key.
    Returns None when ``x`` is empty or is not a list/tuple (for example a
    missing value), mirroring how get_names and get_tag treat non-list input
    instead of raising TypeError while iterating.
    """
    if not isinstance(x, (list, tuple)):
        return None
    for entry in x:
        # Only the first entry is ever used.
        return entry['href']
    return None
    
def get_tag(x):
    """Return the 'term' value of at most the first five entries of ``x``.

    ``x`` is expected to be a list of mappings that each have a 'term' key.
    Any non-list input yields None.
    """
    if not isinstance(x, list):
        return None
    # Read every entry's term first, then keep no more than five.
    collected = [entry['term'] for entry in x]
    return collected[:5]


In [None]:
# list transformation
arxivData['author'] = arxivData['author'].apply(get_names)
arxivData['link'] = arxivData['link'].apply(get_link)
arxivData['tag'] = arxivData['tag'].apply(get_tag)

In [None]:
arxivData.head()

In [None]:
arxivData.shape

In [None]:
# Data Cleaning
def clean_text(text):
    """Reduce ``text`` to lower-case ASCII letters separated by single spaces.

    Every character outside a-z/A-Z is turned into a space, runs of
    whitespace are collapsed and leading/trailing whitespace is removed.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return ' '.join(letters_only.lower().split())

In [None]:
# creating clean text feature
features = ['title', 'summary']
for feature in features:
    arxivData['clean_' + feature] = arxivData[feature].apply(clean_text)

In [None]:
arxivData.head()

In [None]:
# Join title and summary with a space: plain concatenation fuses the last word
# of the title with the first word of the summary into one bogus token.
arxivData['soup'] = arxivData['clean_title'] + ' ' + arxivData['clean_summary']

In [None]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    """Return an HSL colour string with fixed hue/saturation and random lightness.

    The signature matches the keyword arguments passed to a WordCloud
    ``color_func``; only ``random_state`` is used.

    Parameters
    ----------
    random_state : object with ``randint(a, b)``, optional
        Source of randomness for the lightness. When omitted, a fresh
        ``random.Random()`` is used instead of failing on ``None.randint``.

    Returns
    -------
    str
        A string of the form ``"hsl(H, S%, L%)"``.
    """
    if random_state is None:
        import random
        random_state = random.Random()
    h = int(360.0 * 55.0 / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)

def freq_words(x, terms = 30):
    """Plot a word cloud and a bar chart of the most frequent words in ``x``.

    Parameters
    ----------
    x : iterable of str
        Texts; they are joined with spaces and split on whitespace to obtain
        the words that are counted.
    terms : int, default 30
        How many of the most frequent words the bar chart shows.
    """
    tokens = ' '.join(x).split()

    freq_dist = nltk.FreqDist(tokens)
    words_df = pd.DataFrame(list(freq_dist.items()), columns=['word', 'count'])

    fig = plt.figure(figsize=(21,16))

    # Upper panel: word cloud built directly from the frequency table.
    cloud_ax = fig.add_subplot(2,1,1)
    cloud = WordCloud(
        width=1000,
        height=300,
        background_color='black',
        max_words=1628,
        relative_scaling=1,
        color_func=random_color_func,
        normalize_plurals=False,
    ).generate_from_frequencies(freq_dist)
    cloud_ax.imshow(cloud, interpolation="bilinear")
    cloud_ax.axis('off')

    # Lower panel: the `terms` most frequent words as horizontal bars.
    top_words = words_df.nlargest(columns="count", n=terms)
    fig.add_subplot(2,1,2)
    bar_ax = sns.barplot(data=top_words, palette=sns.color_palette('BuGn_r'),
                         x="count", y="word")
    bar_ax.set(ylabel='Word')
    plt.show()

In [None]:
# plot 25 most frequent words including stop words
freq_words(arxivData['soup'], 25)

In [None]:
# stopwords-to compare text data with and without stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    Matching is exact (case-sensitive); the kept tokens are re-joined with
    single spaces.
    """
    kept = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept)
  
arxivData['soup'] = arxivData['soup'].apply(lambda x: remove_stopwords(x))

In [None]:
# plot 25 most frequent words without stopwords
freq_words(arxivData['soup'], 25)

In [None]:
# get all genre tags in one flat list
# (a nested comprehension is linear in the number of tags, whereas
# sum(lists, []) copies the growing accumulator once per row)
all_tags = [tag for tags in arxivData['tag'] for tag in tags]
len(set(all_tags))

In [None]:
all_tags = nltk.FreqDist(all_tags)
all_tags_df = pd.DataFrame({'Tag': list(all_tags.keys()), 'Count': list(all_tags.values())})

In [None]:
sorted_tags = all_tags_df.sort_values(by='Count', ascending=False)
sorted_tags.head(7)

In [None]:
arxivData[['tag', 'year']].head()

In [None]:
g = all_tags_df.nlargest(columns="Count", n = 25) 
plt.figure(figsize=(12,15))
ax = sns.barplot(data=g, x= "Count", y = "Tag")
ax.set(ylabel = 'Tags')
plt.show()

In [None]:
text = " ".join(review for review in g.Tag)
wordcloud = WordCloud(width=1000, height=500,max_font_size=200).generate(text)

plt.figure(figsize = (12, 10))
plt.imshow(wordcloud, interpolation='lanczos')
plt.axis("off")
plt.show()

In [None]:
text = " ".join(review for review in arxivData.clean_title)
wordcloud = WordCloud(width=1600, height=800,max_font_size=200, colormap='magma').generate(text)

plt.figure(figsize = (12, 10))
plt.imshow(wordcloud, interpolation='spline36')
plt.axis("off")
plt.show()

In [None]:
text = " ".join(review for review in arxivData.clean_summary)
wordcloud = WordCloud(width=1600, height=800,max_font_size=200, colormap='magma').generate(text)

plt.figure(figsize = (12, 10))
plt.imshow(wordcloud, interpolation='spline36')
plt.axis("off")
plt.show()

In [None]:
wordcloud.to_file("first_review.png")

In [None]:
# Lemmatization process
'''
Words in the third person are changed to first person and verbs in past and future tenses are changed into the present by the 
lemmatization process. 
'''
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Split ``text`` into lower-cased word tokens and lemmatize each of them.

    The text is first split into sentences and then into words, so that
    punctuation becomes its own token; tokens that contain no ASCII letter
    are discarded before lemmatization.

    Returns
    -------
    list of str
        The lemmatized tokens, in their order of appearance.
    """
    lemmas = []
    for sentence in nltk.sent_tokenize(text):
        for raw_token in nltk.word_tokenize(sentence):
            token = raw_token.lower()
            # Keep only tokens with at least one letter.
            if re.search('[a-zA-Z]', token):
                lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Defining a Count Vectorizer object
count_vec = CountVectorizer(stop_words='english', max_features=10000)
# Defining a TF-IDF Vectorizer
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), tokenizer=tokenize_and_lemmatize, max_features=10000, use_idf=True)

In [None]:
arxivData.columns

In [None]:
df2 = pd.DataFrame(df.author.str.split('}').tolist(),index = df.index).stack()
df2.head()

In [None]:
def rem_unwanted(line):
    """Strip dict/list punctuation and known key names from a stringified record.

    Removes the quoted keys 'term', 'rel', 'href', 'type', 'title' and 'name',
    every remaining single quote, square bracket, curly brace and comma, then
    trims surrounding spaces followed by surrounding colons.

    The pattern is a raw string so that ``\\[``, ``\\{`` and friends are regex
    escapes rather than invalid string escapes (a SyntaxWarning on recent
    Python versions). The order of the alternatives matters: the quoted keys
    must be tried before the bare quote.
    """
    cleaned = re.sub(r"'term'|'rel'|'href'|'type'|'title'|\[|\{|'name'|'|\]|,|\}", '', line)
    # No quote can survive the substitution, so only spaces and colons are trimmed.
    return cleaned.strip(' ').strip(':')

In [None]:
df2 = pd.DataFrame(df2.apply(rem_unwanted))
df2.head()

In [None]:
df2 = pd.DataFrame(df2.unstack().iloc[:,0:2].to_records()).drop(columns={'index'})
df2.head()

In [None]:
df2.columns = ['Author1','Author2']
df2.Author1 = df2.Author1.str.strip(' ')
df2.Author2 = df2.Author2.str.strip(' ')

In [None]:
# Renumber the rows 0..n-1 without keeping the old index as a column.
# (The previous first statement, `df2[df2.Author2 == '']`, built a filtered
# frame and discarded it - it was neither displayed nor assigned.)
df2 = df2.reset_index(drop=True)
df2.head()

In [None]:
arxivData = pd.merge(arxivData,df2,how = 'inner',left_index=True,right_index=True).drop('author',axis=1)
arxivData.head()

In [None]:
# Learn the set of tags and encode every paper's tag list as one indicator
# row, in a single fit-and-transform step.
mb = MultiLabelBinarizer()
y = mb.fit_transform(arxivData['tag'])

In [None]:
len(y)

In [None]:
len(arxivData['soup'])

## Recommender based on summary

In [None]:
arxivData['clean_summary'].head()

In [None]:
# TfIdf matrix transformation on clean_summary column
# (fits tfidf_vec's vocabulary on the summaries and returns one row per paper)
tfidf_matrix = tfidf_vec.fit_transform(arxivData['clean_summary'])
# Compute the cosine similarity
# NOTE(review): linear_kernel is a plain dot product; it equals cosine
# similarity only when the rows are L2-normalised. tfidf_vec is created
# without a `norm` argument, so this presumably relies on TfidfVectorizer's
# default normalisation - confirm if that vectorizer is ever reconfigured.
# NOTE(review): the result compares every paper with every other one, so its
# size grows quadratically with the number of rows - verify memory headroom.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Map each title to the row position of its paper. Positions (not index
# labels) are stored because the similarity matrices and the `.iloc` lookup in
# get_recommendations are positional.
indices = pd.Series(range(len(arxivData)), index=arxivData['title'])
# Keep only the first occurrence of a repeated title. Series.drop_duplicates()
# would compare the *values* (row numbers, always unique) and therefore leave
# duplicated titles in the index, making `indices[title]` return a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, similarity, top_n=10):
    """Return the articles most similar to the one called ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series
        (title -> row position).
    similarity : 2-D array-like
        Pairwise similarity matrix; ``similarity[i]`` holds the scores of
        article ``i`` against every article.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The 'link' and 'title' columns of ``arxivData`` for the ``top_n``
        highest-scoring articles, best first, never including the query
        article itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A repeated title yields several positions; use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores against every *other* article. The query is
    # excluded by position: dropping "whatever sorts first" is wrong when
    # another article ties with it (e.g. an identical summary).
    ranked = sorted(
        ((i, score) for i, score in enumerate(similarity[idx]) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    article_indices = [i for i, _ in ranked[:top_n]]
    # Return the top_n most related articles
    return arxivData[['link', 'title']].iloc[article_indices]

In [None]:
get_recommendations('Towards Bayesian Deep Learning: A Survey', cosine_sim)

## Recommender based on tags, author, and title

In [None]:
# convert all strings to lower case & strip names of spaces
def clean_lists(text):
    """Lower-case a string, or each string of a list, and remove its spaces.

    A list is processed element by element and returned as a new list; a str
    is returned as a single cleaned str; any other input yields ''.
    """
    def _squash(value):
        return value.replace(" ", "").lower()

    if isinstance(text, list):
        return [_squash(item) for item in text]
    if isinstance(text, str):
        return _squash(text)
    return ''

# Normalise the tags and *both* author columns. Leaving 'Author1' out would
# keep first authors in "First Last" form while second authors become
# "firstlast", so the same person would produce different tokens depending on
# the column they appear in.
features = ['tag', 'Author1', 'Author2']
for feature in features:
    arxivData[feature] = arxivData[feature].apply(clean_lists)

In [None]:
# create soup to vectorization process
def create_soup(text):
    """Build the space-separated metadata string of one row.

    Parameters
    ----------
    text : mapping (e.g. a DataFrame row)
        Must provide 'tag', 'Author1', 'Author2' and 'title'.

    Returns
    -------
    str
        ``tag``, ``Author1``, ``Author2`` and ``title`` joined by single
        spaces, in that order.

    Only list/tuple fields are joined element-wise. Calling ``' '.join`` on a
    plain string would insert a space between every character, producing
    one-letter tokens that a word-level vectorizer typically discards.
    Fields that are neither a sequence nor a str (e.g. a missing author)
    contribute an empty string.
    """
    def _as_text(value):
        if isinstance(value, (list, tuple)):
            return ' '.join(value)
        return value if isinstance(value, str) else ''

    fields = ('tag', 'Author1', 'Author2', 'title')
    return ' '.join(_as_text(text[field]) for field in fields)

arxivData['soup2'] = arxivData.apply(create_soup, axis=1)

In [None]:
# Vectorise the combined metadata string with `count_vec`, the CountVectorizer
# object created in an earlier cell of this notebook, then compute the cosine
# similarity between every pair of rows.
count_matrix = count_vec.fit_transform(arxivData['soup2']) 
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
get_recommendations('Towards Bayesian Deep Learning: A Survey', cosine_sim2)

In [None]:
get_recommendations('A Deep Reinforcement Learning Chatbot', cosine_sim2)