In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud

%matplotlib inline

# Load dataset
df = pd.read_csv("metadata.csv")
print(df.head())

# Explore dataset
print(df.shape)
print(df.info())
print(df.isnull().sum().head(20))

# Clean
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
df['year'] = df['publish_time'].dt.year
df = df.dropna(subset=['title', 'abstract'])
df['abstract_word_count'] = df['abstract'].apply(lambda x: len(str(x).split()))

# Visualization 1 - Publications per year
year_counts = df['year'].value_counts().sort_index()
plt.bar(year_counts.index, year_counts.values)
plt.title("Publications by Year")
plt.show()

# Visualization 2 - Top Journals
top_journals = df['journal'].value_counts().head(10)
sns.barplot(x=top_journals.values, y=top_journals.index)
plt.title("Top Journals")
plt.show()

# Visualization 3 - Word frequency
words = " ".join(df['title'].dropna()).lower()
words = re.findall(r'\b\w+\b', words)
common_words = Counter(words).most_common(20)
print("Top words:", common_words)

# Visualization 4 - WordCloud
wordcloud = WordCloud(width=800, height=400).generate(" ".join(words))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


                                               PK-   £å*]&Ë X  \
0  jO5Þ÷Y±öÎã´ÍÕ§ì÷CÝèªÎ½÷²¬wYCkõ¾Óiª6¾w¥Ú4óÎ+...  ¢³éN1   
1  !SÈ=ûÐÿ·ÿ&â6lýYLgóy´L¢p2V³Y:Ïu¨h5¯¢4Ñ...        NaN   
2  Ñ±ÉwE¬rCrâP`ë\ßð_4Ë´G#½5x'oÞþöËÿ&ê%+H )0...        NaN   
3                                                NaN        NaN   
4                                                NaN        NaN   

        ç«Ø¿¼xâ?xgE¨Ü[iÕ´®½rå%mvUy£ïb»\Õ[åí  
0  åç6«h*ÝfuMt%ÒuÓ§óÉRáñr»m¢ðH%òèqGo_<µÂ¿0d...  
1                                                NaN  
2                                                NaN  
3                                                NaN  
4                                                NaN  
(10000, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 

KeyError: 'publish_time'