In [None]:
# Core imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import nltk
# Download the NLTK stop-word list so the lookup below can find it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words as a set; reused later when tokenizing titles and when
# building the word cloud.
stop_words = set(stopwords.words('english'))
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots.
sns.set_style('whitegrid')

In [None]:
# Locate the metadata file: the full metadata.csv wins, the sample is the fallback.
base = os.path.join(os.getcwd(), '..')  # notebook sits inside /notebooks
paths = [
    os.path.join(base, 'data', file_name)
    for file_name in ('metadata.csv', 'metadata_sample.csv')
]
# First candidate that exists on disk, or None when neither is present.
data_path = next((candidate for candidate in paths if os.path.exists(candidate)), None)
if data_path is None:
    raise FileNotFoundError('No metadata file found in data/. Place metadata.csv or metadata_sample.csv in the data/ folder')
print('Loading', data_path)
df = pd.read_csv(data_path, low_memory=False)
# Last expression of the cell: shows (rows, columns) of the loaded frame.
df.shape

In [None]:
# Quick preview and info
# Rich (HTML) rendering of the first rows.
display(df.head())
# The leading '\n' puts a blank line between the sections of the text output.
print('\nDataframe info:')
print(df.dtypes)
print('\nMissing value counts for key columns:')
# Null count per column for the columns the rest of the notebook relies on.
print(df[['title','abstract','publish_time','journal','source_x']].isnull().sum())

## Cleaning and feature engineering
We'll convert the publish time into datetime, extract a `year` column and add an `abstract_word_count`. We'll keep only rows where we have at least a title or abstract for analysis tasks.

In [None]:
# Parse publish_time as datetime; values that cannot be parsed are coerced to NaT.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
df['year'] = df['publish_time'].dt.year

# Missing abstracts become '' so that they count as zero words below.
df['abstract'] = df['abstract'].fillna('')
df['abstract_word_count'] = df['abstract'].str.split().map(len)

# Keep a row when it has a title, or an abstract with at least one word.
has_title = df['title'].notna()
has_abstract_text = df['abstract_word_count'] > 0
df_clean = df[has_title | has_abstract_text].copy()

print('Before:', len(df), 'After cleaning:', len(df_clean))
preview_columns = ['title','abstract','publish_time','year','abstract_word_count']
df_clean[preview_columns].head()

### Publications by year
Count papers by publication year and visualize a simple bar chart.

In [None]:
# Papers per publication year, ordered chronologically by the index.
year_counts = df_clean['year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8,4))
sns.barplot(
    x=year_counts.index.astype('Int64'),  # cast the year labels to nullable integers for the axis
    y=year_counts.values,
    palette='Blues_d',
    ax=ax,
)
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of papers')
plt.show()

### Top publishing journals
Show the most frequent journals in the dataset.

In [None]:
# The ten journals with the most papers.
journal_counts = df_clean['journal'].value_counts()
top_journals = journal_counts.nlargest(10)

# Horizontal bars: journal names on the y axis stay readable.
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(
    y=top_journals.index,
    x=top_journals.values,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top journals by paper count')
ax.set_xlabel('Number of papers')
ax.set_ylabel('Journal')
plt.show()

### Most frequent words in titles
We’ll tokenize the titles (lowercased, letters only, so punctuation and digits are dropped), remove stop words, and show a frequency table and a word cloud.

In [None]:
import re

def simple_tokens(text):
    text = (text or '').lower()
    text = re.sub(r'[^a-z]', ' ', text)
    tokens = [t for t in text.split() if t and t not in stop_words]
    return tokens

# Flatten the tokens of every title into one list; missing titles are treated
# as empty strings and therefore contribute no tokens.
all_title_tokens = [
    token
    for title in df_clean['title'].fillna('')
    for token in simple_tokens(title)
]
cnt = Counter(all_title_tokens)
# Keep the 30 most frequent words, display the first 15 as a table.
most_common = cnt.most_common(30)
pd.DataFrame(most_common, columns=['word','count']).head(15)

In [None]:
# Word cloud for titles
# Build the cloud from the already-tokenized title words joined into one string.
joined_title_words = ' '.join(all_title_tokens)
cloud = WordCloud(
    width=900,
    height=400,
    background_color='white',
    stopwords=stop_words,
)
wc = cloud.generate(joined_title_words)

fig, ax = plt.subplots(figsize=(12,5))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')  # an image needs no ticks or frame
ax.set_title('Wordcloud of title words')
plt.show()

### Distribution of papers by source (source_x)

In [None]:
# Number of papers per value of the source_x column, most frequent first.
src_counts = df_clean['source_x'].value_counts()

fig, ax = plt.subplots(figsize=(6,4))
sns.barplot(
    x=src_counts.index,
    y=src_counts.values,
    palette='magma',
    ax=ax,
)
ax.set_title('Paper counts by source')
ax.set_xlabel('Source')
ax.set_ylabel('Count')
plt.show()

### Wrap up / next steps
This notebook demonstrates the analysis pipeline requested in the assignment. For the full dataset you may: explore abstracts with TF-IDF, do topic modeling, include more interactive plots, or add filtering by author or journal.

## TF-IDF and topic modeling (starter)

Below are example steps you can use to vectorize titles/abstracts with TF-IDF and run a lightweight LDA topic model on the collection. This is intentionally simple so it will run quickly on the sample dataset; for the full `metadata.csv` you may want to sample or increase compute resources.

In [None]:
# Additional imports for TF-IDF and LDA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Prepare a corpus - prefer abstract if available, otherwise title
# Strip BEFORE choosing between the two: an abstract made only of whitespace
# must count as missing so that the row falls back to its title instead of
# being dropped by the empty-text filter below.
abstracts = df_clean['abstract'].fillna('').astype(str).str.strip()
titles = df_clean['title'].fillna('').astype(str).str.strip()
# Keep the abstract where it has text, otherwise take the title of the same row.
corpus = abstracts.where(abstracts.str.len() > 0, titles)
# Drop rows that have neither abstract nor title text.
corpus = corpus[corpus.str.len() > 0]
len(corpus)

In [None]:
# TF-IDF — show top features for the corpus (small demo)
import numpy as np

tfidf = TfidfVectorizer(
    max_df=0.9,
    min_df=1,
    max_features=1000,
    stop_words='english',
)
X = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()

# Mean tf-idf weight of each term over all documents, flattened to 1-D.
avg_tfidf = np.asarray(X.mean(axis=0)).ravel()
# Indices of the 30 terms with the highest mean weight, best first.
top_idx = avg_tfidf.argsort()[::-1][:30]
top_terms = pd.DataFrame({'term': feature_names[top_idx], 'avg_tfidf': avg_tfidf[top_idx]})
top_terms.head(15)

In [None]:
# LDA topic modeling using CountVectorizer (small demo with 3 topics)
n_topics = 3

# LDA is fitted on raw term counts, hence CountVectorizer here.
cv = CountVectorizer(
    max_df=0.95,
    min_df=1,
    max_features=1000,
    stop_words='english',
)
X_counts = cv.fit_transform(corpus)

# random_state is fixed so that repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    learning_method='batch',
).fit(X_counts)

# show top words per topic
def print_top_words(model, feature_names, n_top_words=8):
    """Print one line per topic listing its highest-weighted terms.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each row holds the term
        weights of one topic.
    feature_names : sequence of str
        Vocabulary, indexable by the column positions of ``components_``.
    n_top_words : int, default 8
        Number of terms printed per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Column positions ordered from largest to smallest weight.
        ranked = weights.argsort()[::-1]
        words = (feature_names[pos] for pos in ranked[:n_top_words])
        print(f'Topic {topic_idx}:', ' '.join(words))

# Show the 10 highest-weighted vocabulary terms of each fitted topic.
topic_vocab = cv.get_feature_names_out()
print_top_words(lda, topic_vocab, n_top_words=10)

### Notes

- This is a minimal TF-IDF + LDA workflow for demonstration. For the full dataset, tune `min_df`, `max_df`, and `n_components`, and consider using a larger vocabulary, n-grams, and more robust preprocessing.
- For larger corpora, prefer incremental or online LDA, or use libraries like gensim for scalability.