In [None]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
import nltk
from collections import Counter

# Download the NLTK "punkt" and "stopwords" resources by name.
# NOTE(review): presumably required by the text helpers used later in this
# notebook (tokenizing / stop-word filtering) — confirm against those helpers.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
from scholar_sense.viz.utils import plot_length_distribution, plot_top_words

In [None]:
# Data directory: <parent of the current working directory>/artifacts/data
PATH_DATA_BASE = Path.cwd().parent.joinpath("artifacts", "data")

In [None]:
# Render DataFrame cell contents in full instead of truncating long strings
pd.set_option('display.max_colwidth', None)

# Read arxiv.csv from the data directory and preview the first rows
arxiv_csv_path = PATH_DATA_BASE / "arxiv.csv"
arxiv_data = pd.read_csv(arxiv_csv_path)
arxiv_data.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
arxiv_data.info()

In [None]:
# Report how many rows were loaded
n_rows = arxiv_data.shape[0]
print(f"There are {n_rows} rows in the dataset.")

In [None]:
# Count rows whose title repeats an earlier row's title. The "title" column is
# checked (not "id") so that the number matches both the message below and the
# title-based deduplication performed later in this notebook.
total_duplicate_titles = int(arxiv_data["title"].duplicated().sum())
print(f"There are {total_duplicate_titles} duplicate titles.")

In [None]:
# Parse the string-encoded Python literals in "categories" into real objects
# (the values are iterated as per-paper lists of terms further down).
# Non-string values are passed through unchanged, so re-running this cell after
# the column has already been parsed no longer raises inside ast.literal_eval.
arxiv_data["categories"] = arxiv_data["categories"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
arxiv_data.head(1)


In [None]:
# Parse the string-encoded Python literals in "authors" into real objects.
# Non-string values are passed through unchanged, so re-running this cell after
# the column has already been parsed no longer raises inside ast.literal_eval.
arxiv_data["authors"] = arxiv_data["authors"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
arxiv_data.head()

In [None]:
# Flatten the per-paper category lists into one long list of terms
all_terms = []
for paper_terms in arxiv_data["categories"]:
    all_terms.extend(paper_terms)

# Tally how often each term occurs
terms_count = Counter(all_terms)

# One row per term: the term itself plus its count
df_terms = pd.DataFrame.from_dict(terms_count, orient='index').reset_index()
df_terms.columns = ['Term', 'Count']

# Keep the ten most frequent terms
df_terms_top10 = df_terms.sort_values(by='Count', ascending=False).iloc[:10]

# Horizontal bar chart drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_terms_top10, x='Count', y='Term', color='cornflowerblue', ax=ax)

# Drop the top and right spines for a cleaner look
sns.despine(fig=fig)

ax.set_title('Top 10 Terms by Paper Count')
plt.show()

In [None]:
from wordcloud import WordCloud

# Merge every title into a single space-separated string
all_titles = ' '.join(arxiv_data['title'])

# Configure the generator, then build the cloud from the merged titles
cloud_generator = WordCloud(background_color='white', width=800, height=400)
wordcloud = cloud_generator.generate(all_titles)

# Show the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Keep only the first row for each distinct title; copy so that later column
# assignments operate on an independent frame
arxiv_data = arxiv_data.drop_duplicates(subset="title", keep="first").copy()
remaining_rows = len(arxiv_data)
print(f"There are {remaining_rows} rows in the deduplicated dataset.")

In [None]:
# Write the deduplicated frame next to the input data, without the index column
filtered_csv_path = PATH_DATA_BASE / 'filtered_data.csv'
arxiv_data.to_csv(filtered_csv_path, index=False)

# Titles

In [None]:
# Mean title length in characters, truncated to a whole number.
# `.str.len()` is vectorized and yields NaN for a missing title (which `.mean()`
# skips) instead of raising the way `apply(len)` does on a non-string value.
average_title_length = int(arxiv_data['title'].str.len().mean())
print(f"The average text length of a title is {average_title_length} characters.")

In [None]:
# Character count of every title, stored as a new column.
# `.str.len()` is the vectorized form of `apply(len)` and leaves a missing
# title as NaN rather than raising.
arxiv_data['title_length'] = arxiv_data['title'].str.len()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the title lengths
arxiv_data["title_length"].describe()

In [None]:
# Call the project helper from scholar_sense.viz.utils on the "title_length" column.
# NOTE(review): presumably draws the distribution of title lengths — confirm against the helper.
plot_length_distribution(arxiv_data, 'title_length')

In [None]:
# Call the project helper from scholar_sense.viz.utils on the "title" column.
# NOTE(review): presumably plots the most frequent words in the titles — confirm against the helper.
plot_top_words(arxiv_data, 'title')

# Abstract

In [None]:
# Mean abstract length in characters, truncated to a whole number.
# `.str.len()` is vectorized and yields NaN for a missing abstract (which
# `.mean()` skips) instead of raising the way `apply(len)` does on a non-string value.
average_abstract_length = int(arxiv_data["abstract"].str.len().mean())
print(f"The average text length of an abstract is {average_abstract_length} characters.")

In [None]:
# Character count of every abstract, stored as a new column.
# `.str.len()` is the vectorized form of `apply(len)` and leaves a missing
# abstract as NaN rather than raising.
arxiv_data['abstract_length'] = arxiv_data["abstract"].str.len()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the abstract lengths
arxiv_data["abstract_length"].describe()

In [None]:
# Call the project helper from scholar_sense.viz.utils on the "abstract_length" column.
# NOTE(review): presumably draws the distribution of abstract lengths — confirm against the helper.
plot_length_distribution(arxiv_data, "abstract_length")

In [None]:
# Call the project helper from scholar_sense.viz.utils on the "abstract" column.
# NOTE(review): presumably plots the most frequent words in the abstracts — confirm against the helper.
plot_top_words(arxiv_data, "abstract")