In [4]:
import pandas as pd
from habanero import Crossref
import re
from titlecase import titlecase
from gensim.summarization import keywords
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tabulate import tabulate
import tkinter as tk
from tkinter import simpledialog

# Download NLTK resources (if not already installed)
nltk.download('stopwords')
nltk.download('wordnet')
# nltk.word_tokenize (called in preprocess_text) relies on the Punkt tokenizer
# data, which the two downloads above do not provide.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

# Initialize NLTK tools
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text, regex_patterns):
    """Apply a sequence of regex substitutions to ``text``.

    Args:
        text: Value to clean. Non-string values are converted with ``str``
            before substitution. ``None`` is returned unchanged so that a
            missing value stays missing instead of turning into the literal
            string ``"None"``.
        regex_patterns: Iterable of ``(pattern, replacement)`` or
            ``(pattern, replacement, flags)`` tuples, applied in order.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None``. When
        ``regex_patterns`` is empty, ``text`` is returned as given.
    """
    if text is None:
        return None

    for pattern in regex_patterns:
        # Only a 3-tuple carries regex flags; everything else uses the default.
        flags = pattern[2] if len(pattern) == 3 else 0
        text = re.sub(pattern[0], pattern[1], str(text), flags=flags)
    return text

# Function to get papers data
def get_papers_data(keywords, from_date):
    """Query Crossref for works matching ``keywords``.

    Args:
        keywords: Free-text query string passed to ``Crossref.works``.
        from_date: Value used for the ``from-pub-date`` filter.

    Returns:
        The ``items`` list found under ``message`` in the Crossref response.
    """
    # Restrict results to works that have an abstract and were published on or
    # after ``from_date``.
    search_filter = {'has-abstract': True, 'from-pub-date': from_date}

    client = Crossref()
    # NOTE(review): ``type`` is forwarded as an extra keyword argument; confirm
    # that habanero actually applies it as a work-type restriction.
    response = client.works(
        query=keywords,
        limit=30,
        filter=search_filter,
        sort='relevance',
        type='journal-article'
    )
    message = response['message']
    return message['items']

# Function to extract summary from abstract
def extract_summary(abstract):
    """Return at most the first three sentences of ``abstract``.

    Sentences are delimited by the literal separator ``'. '``.

    Args:
        abstract: Abstract text as a string.

    Returns:
        ``abstract`` unchanged when it holds three sentences or fewer;
        otherwise the first three sentences joined with ``'. '`` and closed
        with a single period.
    """
    sentences = abstract.split('. ')
    # With three or fewer pieces nothing is cut off, so the text is returned
    # as-is. Re-joining and appending '.' here would duplicate the final
    # period of a three-sentence abstract ("A. B. C." -> "A. B. C..").
    if len(sentences) <= 3:
        return abstract
    # The split consumed the third sentence's terminating '. ', so restore
    # the period.
    return '. '.join(sentences[:3]) + '.'

# Function to preprocess text
def preprocess_text(text):
    """Tokenize ``text``, drop English stop words, and lemmatize the rest.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        # Stop-word matching is case-insensitive; the token keeps its case.
        if token.lower() in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Function to extract keywords
def extract_keywords(text):
    """Extract keywords from ``text`` as a comma-separated string.

    The text is first normalised with ``preprocess_text`` and then handed to
    the ``keywords`` function imported from ``gensim.summarization``, whose
    result is split on newlines.

    Args:
        text: Raw text (e.g. an abstract) to analyse.

    Returns:
        The keywords joined with ``', '``, or ``'N/A'`` when none were found.
    """
    # Preprocess text
    processed_text = preprocess_text(text)
    # Extract keywords. Blank entries are discarded because ''.split('\n')
    # yields [''], which is truthy and would otherwise bypass the 'N/A'
    # fallback and produce an empty keyword string.
    key_words = [
        word for word in keywords(processed_text).split('\n') if word.strip()
    ]
    return ', '.join(key_words) if key_words else 'N/A'

# Function to process papers data
def process_papers_data(papers_data):
    """Convert Crossref work records into a cleaned ``DataFrame``.

    Args:
        papers_data: Iterable of dict-like work records. The keys read are
            ``title``, ``publisher``, ``abstract``, ``published``,
            ``is-referenced-by-count``, ``URL`` and ``DOI``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``Date``,
        ``Abstract_Summary``, ``DOI``, ``URL``, ``Publisher``,
        ``Cites count`` and ``Key Words``. ``Title`` and
        ``Abstract_Summary`` are stripped of markup residue, and ``Title``
        is title-cased. Missing titles remain ``None`` so they can be
        dropped with ``dropna``.
    """
    columns = ['Title', 'Date', 'Abstract_Summary', 'DOI', 'URL', 'Publisher', 'Cites count', 'Key Words']
    papers_dict = {col: [] for col in columns}

    for item in papers_data:
        # ``or`` also covers a key that is present but empty/None: an empty
        # title list would raise IndexError on ``[0]`` and a None abstract
        # would break the string handling further down.
        titles = item.get('title') or [None]
        abstract = item.get('abstract') or ''

        papers_dict['Title'].append(titles[0])
        papers_dict['Publisher'].append(item.get('publisher'))
        papers_dict['Abstract_Summary'].append(extract_summary(abstract))

        # Only the first element of the first date-part (the year) is kept.
        date = item.get('published')
        if date and 'date-parts' in date:
            year = date['date-parts'][0][0] if date['date-parts'] and date['date-parts'][0] else None
            papers_dict['Date'].append(year)
        else:
            papers_dict['Date'].append(None)

        papers_dict['Cites count'].append(item.get('is-referenced-by-count'))
        papers_dict['URL'].append(item.get('URL'))
        papers_dict['DOI'].append(item.get('DOI'))
        papers_dict['Key Words'].append(extract_keywords(abstract))

    df = pd.DataFrame(papers_dict)

    # Applied in order. Raw strings keep the backslashes intact for the regex
    # engine and avoid invalid-escape warnings from the Python parser.
    regex_patterns = [
        (r"[\[\]']", ""),
        (r"<i>", ""),
        (r"\t", ""),
        (r"[\[\]'{}]", ""),
        (r"date-parts:", ""),
        (r"[\<\[].*?[\>\]]", ""),  # anything enclosed in <...> or [...]
        (r"&#", ""),
        (r"\s+", " "),
        (r"summary", "", re.IGNORECASE)
    ]

    for col in ['Title', 'Abstract_Summary']:
        # Missing values are passed through untouched; running them through
        # clean_text would stringify None into the text "None".
        df[col] = df[col].apply(
            lambda x: clean_text(x, regex_patterns) if pd.notna(x) else None
        )

    df['Title'] = df['Title'].apply(lambda x: titlecase(x) if x else x)

    return df

def get_user_input():
    """Ask the user for search keywords and a starting year via dialogs.

    Returns:
        A ``(keywords, from_year)`` tuple holding the two values returned by
        the ``askstring`` dialogs, in the order they were asked.
    """
    dialog_root = tk.Tk()
    # Hide the main window so only the two prompts are shown.
    dialog_root.withdraw()

    search_terms = simpledialog.askstring("Input", "Enter keywords for search:")
    start_year = simpledialog.askstring("Input", "Enter the starting year for search (e.g., 2014):")

    dialog_root.destroy()
    return search_terms, start_year

# Main execution
def main():
    """Prompt for a search, query Crossref, and report the results.

    The user is asked for keywords and a starting year. Matching papers are
    fetched, cleaned, sorted by citation count (descending), written to a
    text file named after the first two keywords, and printed as a table.
    Nothing is fetched or written when the keywords are missing or the year
    is not a four-digit number.
    """
    keywords, from_year = get_user_input()  # receive from_year here

    # Either value can be None/empty when a dialog is dismissed; stop before
    # the network request instead of failing later on ``keywords.split()`` or
    # sending a 'None-01-01' date filter.
    if not keywords or not keywords.strip():
        print('No search keywords were entered; nothing to search for.')
        return
    from_year = (from_year or '').strip()
    if not re.fullmatch(r'[0-9]{4}', from_year):
        print(f'Invalid starting year {from_year!r}: expected four digits, e.g. 2014.')
        return

    from_date = f'{from_year}-01-01'
    papers_data = get_papers_data(keywords, from_date)
    df = process_papers_data(papers_data)

    df = df.dropna(subset=['Title', 'Abstract_Summary'])
    df = df.sort_values(by='Cites count', ascending=False)
    df.reset_index(drop=True, inplace=True)  # Reset index after sorting
    df = df[['Title', 'Date', 'Abstract_Summary', 'DOI', 'URL', 'Publisher', 'Cites count', 'Key Words']]

    # Build the file name from the first two keywords, removing characters
    # that are path separators or otherwise not allowed in file names.
    name_parts = [
        re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', word)
        for word in keywords.split()[:2]
    ]
    filename = ("_".join(part for part in name_parts if part) or 'results') + '.txt'

    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f'Search words: {keywords}\n')
        file.write(f'Start date of search: {from_year}\n\n')

        # The index was reset above, so ``i`` is the zero-based rank.
        for i, row in df.iterrows():
            file.write(f"{i + 1}. {row['Title']} ({row['Date']})\n")
            file.write(f"Cites count: {row['Cites count']}\n")
            file.write(f"{row['Abstract_Summary']}\n")
            file.write(f"keywords: {row['Key Words']}\n")
            file.write(f"url: {row['URL']}\n\n")

    print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False))

    print(f'Results written to {filename}')

# Run the interactive search only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


+-------------------------------------------------------------------------------------------------------------------------------------------------------+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+-------------------------------------------------