## Import the relevant libraries 

In [None]:
import pandas as pd
import guidedlda
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import csv

## Define function for preprocessing text

In [None]:
def preprocess_text(text):
    """Normalise a raw document before the vectorizer tokenises it.

    The text is lower-cased, every run of digits is deleted, and the
    citation marker "et al" (with or without a trailing period) is removed.

    Args:
        text: Raw document string.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    # Word boundaries stop the pattern from firing inside ordinary phrases:
    # without them "set all" or "target alignment" lose their middle.
    # \s+ also catches the marker when it is split across a line break.
    text = re.sub(r'\bet\s+al\b\.?', '', text)  # remove "et al."
    return text

## Read the data

In [None]:
# Load the papers table and the authors table from the working directory.
data = pd.read_csv('papers.csv')
authors = pd.read_csv("authors.csv")

# One entry per row of `data`, in row order: the full text of each paper.
corpus = data['paper_text'].tolist()

## Fit the count vectorizer on the corpus by applying stopword removal and preprocessing

In [None]:
# Count bigrams only (ngram_range=(2, 2)), capped at 10,000 features, with
# English stop words removed; each document is cleaned by preprocess_text
# before tokenisation.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    max_features=10000,
    stop_words='english',
    preprocessor=preprocess_text,
)
X = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# The result is kept as a plain list because later cells call vocab.index().
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

## Fit the unguided LDA model

In [None]:
# Baseline: a 5-topic model fitted without any seed words, with a fixed
# random_state.
model = guidedlda.GuidedLDA(n_topics=5, n_iter=200, random_state=42, refresh=20)
# The sparse count matrix is converted to a dense array before fitting.
model.fit(X.toarray())

In [None]:
# How many of the highest-weight vocabulary entries to list per topic, and
# the index of the topic that the later cells single out.
n_top_words = 50
topic_to_pick = 4
topic_word_unseeded = model.topic_word_

for i, topic_dist in enumerate(topic_word_unseeded):
    # argsort is ascending: reverse it, then keep the first n_top_words.
    top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = np.array(vocab)[top_idx]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))
    


In [None]:
# Top n_top_words vocabulary entries of the selected unseeded topic,
# highest weight first.
topic_words_unseeded_final = np.array(vocab)[
    np.argsort(topic_word_unseeded[topic_to_pick])[::-1][:n_top_words]
]

## Fit the topic model with seed keywords

In [None]:
# Seed bigrams for the guided model, organised as a list of keyword groups.
seed_keywords = [
    [
        "image processing",
        "convolutional neural",
        "deep convolutional",
        "object detection",
        "object recognition",
        "computer vision",
    ],
]


In [None]:
model_guided = guidedlda.GuidedLDA(n_topics=5, n_iter=200, random_state=42, refresh=20)

# Map each bigram to its column in X once, instead of scanning the whole
# vocabulary for every seed keyword.
word2id = {word: idx for idx, word in enumerate(vocab)}

# seed_topics maps a vocabulary column to the topic it is seeded into.
# Every seed keyword targets topic_to_pick, the topic the other cells inspect.
seed_topics = {}
missing_seeds = []
for st in seed_keywords:
    for word in st:
        word_id = word2id.get(word)
        if word_id is None:
            # The vocabulary is capped by max_features, so a seed bigram may
            # not have survived; skip it rather than abort the whole run.
            missing_seeds.append(word)
        else:
            seed_topics[word_id] = topic_to_pick

if missing_seeds:
    print('Seed keywords not in the vocabulary (ignored): {}'.format(', '.join(missing_seeds)))

model_guided.fit(X.toarray(), seed_topics=seed_topics, seed_confidence=0.7)

In [None]:
# List the n_top_words highest-weight vocabulary entries of every topic of
# the guided model.
n_top_words = 50
topic_word_seeded = model_guided.topic_word_

for i, topic_dist in enumerate(topic_word_seeded):
    # argsort is ascending: reverse it, then keep the first n_top_words.
    top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
    topic_words = np.array(vocab)[top_idx]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))

In [None]:
# Top n_top_words vocabulary entries of the selected seeded topic,
# highest weight first.
topic_words_seeded_final = np.array(vocab)[
    np.argsort(topic_word_seeded[topic_to_pick])[::-1][:n_top_words]
]

## Create word cloud for the unseeded topic 5

In [None]:
# Word-cloud tokens: join the words of each bigram with "_" so that it is
# drawn as a single unit.
text = ["_".join(x.split()) for x in topic_words_unseeded_final]

# topic_words_unseeded_final runs from most to least prominent, so the weight
# has to fall with rank. Using the rank itself as the weight would give the
# top bigram a weight of 0 and the largest weight to the least prominent one.
word_priorities = {k: len(text) - idx for idx, k in enumerate(text)}

# Create a word cloud object and size the words by the rank-based weights.
wordcloud = WordCloud(width=2000, height=2000, background_color='white', min_font_size=10)
wordcloud.generate_from_frequencies(word_priorities)

# Display the word cloud without axes or padding.
plt.figure(figsize=(8,8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

## Create word cloud for the seeded topic 5

In [None]:
# Word-cloud tokens: join the words of each bigram with "_" so that it is
# drawn as a single unit.
text = ["_".join(x.split()) for x in topic_words_seeded_final]

# topic_words_seeded_final runs from most to least prominent, so the weight
# has to fall with rank. Using the rank itself as the weight would give the
# top bigram a weight of 0 and the largest weight to the least prominent one.
word_priorities = {k: len(text) - idx for idx, k in enumerate(text)}

# Create a word cloud object and size the words by the rank-based weights.
# A second WordCloud built with .generate(",".join(text)) used to replace this
# one; each token occurs only once in that string, so the ranking was lost and
# the figure (1000x1000) did not match the unseeded one (2000x2000).
wordcloud = WordCloud(width=2000, height=2000, background_color='white', min_font_size=10)
wordcloud.generate_from_frequencies(word_priorities)

# Display the word cloud without axes or padding.
plt.figure(figsize=(8,8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

## Save the built models

In [None]:
import os

# exist_ok=True lets this cell be re-run: os.mkdir raised FileExistsError as
# soon as the folder was already there.
os.makedirs("saved_models", exist_ok=True)

# Persist both fitted models so they can be reloaded without refitting.
with open('saved_models/guidedlda_model.pickle', 'wb') as file_handle:
    pickle.dump(model_guided, file_handle)

with open('saved_models/unguidedlda_model.pickle', 'wb') as file_handle:
    pickle.dump(model, file_handle)

## OPTIONAL: Load the built model

In [None]:
# # open a file, where you stored the pickled data
# file = open('saved_models/guidedlda_model.pickle', 'rb')

# # load the pickled model back from that file
# guided_prev_version = pickle.load(file)

## Plot papers picked across years for topic 5

In [None]:
# Group the full text of every paper under its publication year.
year_to_paper = {}
for row_label, paper in data.iterrows():
    year_to_paper.setdefault(paper["year"], []).append(paper["paper_text"])

# Holds a single, empty entry for the first year range.
categories = {"1987-1990": []}

In [None]:
def _years(first, last):
    """Return every year from first to last, both ends included."""
    return list(range(first, last + 1))


# Year-bucket label -> the individual years that fall into that bucket.
years_dict = {
    "1987-1990": _years(1987, 1990),
    "1991-1995": _years(1991, 1995),
    "1996-2000": _years(1996, 2000),
    "2001-2005": _years(2001, 2005),
    "2006-2010": _years(2006, 2010),
    "2011-2015": _years(2011, 2015),
    "2016-2017": _years(2016, 2017),
}

In [None]:
# Reverse lookup: every individual year points back to the label of the
# bucket that contains it.
inv_years_dict = {
    year: label
    for label, years in years_dict.items()
    for year in years
}

In [None]:
# Run the guided model over the same count matrix it was fitted on (densified
# first). The result is iterated alongside the rows of `data` further down,
# and each entry is indexed to read off a topic score.
predicted_topics = model_guided.transform(X.toarray())

In [None]:
# Year-bucket label -> list of {"title", "cv_topic_score"} records; starts
# empty and is filled by the loop in the next cell.
year_prominent_paper = {}

## Pick the top 5 papers in each year bin that have the highest computer-vision (CV) topic score under the guided topic model

In [None]:
# Each row is read by position below: index 1 as the year, index 2 as the title.
# NOTE(review): this assumes the column order of papers.csv — confirm.
paper_rows = data.values.tolist()

# zip() has no length, so tqdm needs an explicit total to draw a progress bar.
for prediction, df_item in tqdm(zip(predicted_topics, paper_rows), total=len(paper_rows)):

    year = df_item[1]
    title = df_item[2]

    # Raises KeyError for a year that no bucket in years_dict covers.
    year_bucket = inv_years_dict[year]

    # Read the score of the topic selected by topic_to_pick rather than
    # "the last column", so this stays correct if the topic count changes.
    year_prominent_paper.setdefault(year_bucket, []).append(
        {"title": title, "cv_topic_score": prediction[topic_to_pick]}
    )
    
    

In [None]:
# Number of titles kept per year bucket; also the number of data rows written.
n_top_papers = 5

# Year-bucket label -> titles of its highest-scoring papers, best first.
top_papers = {}

for key in year_prominent_paper:
    # Sort the records of this bucket by cv_topic_score in descending order.
    sorted_list = sorted(year_prominent_paper[key], key=lambda x: x['cv_topic_score'], reverse=True)
    # Keep the titles of the first n_top_papers records (fewer if the bucket is small).
    top_papers[key] = [d['title'] for d in sorted_list[:n_top_papers]]

# One CSV column per year bucket.
fieldnames = list(top_papers.keys())

# newline='' is what the csv module requires of files handed to csv.writer
# (otherwise blank rows appear on Windows); an explicit encoding keeps
# non-ASCII titles from depending on the platform default.
with open('output.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(fieldnames)
    for i in range(n_top_papers):
        # A bucket holding fewer than n_top_papers papers gets empty cells
        # instead of raising IndexError.
        row = [
            top_papers[key][i] if i < len(top_papers[key]) else ''
            for key in fieldnames
        ]
        writer.writerow(row)