In [1]:
import pandas as pd

# Read the CSV file into a pandas DataFrame
paper_df = pd.read_csv('paper_list.csv')

# Display the DataFrame (optional, can be removed if not needed)
paper_df.head()


Unnamed: 0,Author Name,Paper Name,Publisher,Year
0,"C. H. Lampert, et al.",Attribute-based classification for zero-shot v...,T-PAMI,2014
1,"Y. Xian, et al.",Latent embeddings for zero-shot classification,CVPR,2016
2,"E. Kodirov, et al.",Semantic Autoencoder for Zero-Shot Learning,CVPR,2017
3,"Y. Xian, et al.",Feature Generating Networks for Zero-Shot Lear...,CVPR,2018
4,"J. Snell, K. Swersky, and R. Zemel",Prototypical networks for few-shot learning,NIPS,2017


In [14]:
import os

# Specify the path to the subfolder for downloaded PDFs
subfolder_path = '/Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs'

used_papers = []
unused_papers = []

# Iterate over each row in the DataFrame
for index, row in paper_df.iterrows():
    paper_name = row['Paper Name']
    found = False  # Flag to track if the paper name was found

    # Iterate through all PDF files in the subfolder
    for pdf_file in os.listdir(subfolder_path):
        if pdf_file.endswith('.pdf'):
            pdf_file_path = os.path.join(subfolder_path, pdf_file)
            # print(pdf_file_path)
            from PyPDF2 import PdfReader

            # Read the PDF file
            with open(pdf_file_path, 'rb') as file:
                reader = PdfReader(file)
                # Get the text from the first page
                first_page_text = reader.pages[0].extract_text()

            # Check if the paper name is in the first page text
            if len(paper_name.split()) > 3:
                title_words = paper_name.split()
                match_count = sum(1 for word in title_words if word in first_page_text)
                if match_count / len(title_words) >= 0.8:
                    used_papers.append(paper_name)
                    found = True
                    new_file_path = os.path.join(subfolder_path, paper_name + '.pdf')
                    os.rename(pdf_file_path, new_file_path)
                    print(f"Renamed {pdf_file_path} to {new_file_path}.")
                    break  # Exit the loop once the paper is found
            else:
                if all(word in first_page_text for word in paper_name.split()):
                    used_papers.append(paper_name)
                    found = True
                    new_file_path = os.path.join(subfolder_path, paper_name + '.pdf')
                    os.rename(pdf_file_path, new_file_path)
                    print(f"Renamed {pdf_file_path} to {new_file_path}.")
                    break  # Exit the loop once the paper is found

    if not found:
        unused_papers.append(paper_name)

# Create a DataFrame to show used vs unused papers
# Determine the maximum length between used_papers and unused_papers
max_length = max(len(used_papers), len(unused_papers))

# Extend both lists to the maximum length by adding empty strings
used_papers.extend([''] * (max_length - len(used_papers)))
unused_papers.extend([''] * (max_length - len(unused_papers)))

summary_df = pd.DataFrame({
    'Used Papers': used_papers,
    'Unused Papers': unused_papers
})

# Display the summary DataFrame
# Check for duplicates in used and unused papers
duplicate_used = set([paper for paper in used_papers if used_papers.count(paper) > 1])
duplicate_unused = set([paper for paper in unused_papers if unused_papers.count(paper) > 1])

# Display the summary DataFrame

# Show duplicates if any
if duplicate_used:
    print("Duplicate Used Papers:", duplicate_used)
else:
    print("No duplicate used papers found.")

if duplicate_unused:
    print("Duplicate Unused Papers:", duplicate_unused)
else:
    print("No duplicate unused papers found.")

summary_df




Renamed /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/1703.03400v3.pdf to /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/Prototypical networks for few-shot learning.pdf.
Renamed /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/1606.04080v2.pdf to /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/Matching Networks for One Shot Learning.pdf.
Renamed /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/Prototypical networks for few-shot learning.pdf to /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks.pdf.
Renamed /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/1711.06025v2.pdf to /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/Learning to Compare: Relation Network for Few-Shot Learning.pdf.
Renamed /Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs/1710.10196v3.pdf to /Users/Sloan/Desktop/Project_Des

Unnamed: 0,Used Papers,Unused Papers
0,Prototypical networks for few-shot learning,Attribute-based classification for zero-shot v...
1,Matching Networks for One Shot Learning,Latent embeddings for zero-shot classification
2,Model-Agnostic Meta-Learning for Fast Adaptati...,Semantic Autoencoder for Zero-Shot Learning
3,Learning to Compare: Relation Network for Few-...,Feature Generating Networks for Zero-Shot Lear...
4,Image-to-image translation with conditional ad...,Unpaired image-to-image translation using cycl...
5,A style-based generator architecture for gener...,Progressive Growing of GANs for Improved Quali...
6,Unsupervised deep embedding for clustering ana...,Stargan: Unified generative adversarial networ...
7,BERTopic: Neural topic modeling with a class-b...,Towards k-means-friendly spaces: Simultaneous ...
8,Learning without Forgetting,Spectralnet: Spectral clustering using deep ne...
9,Unsupervised Visual Domain Adaptation Using Su...,Grad-cam: Visual explanations from deep networ...


In [15]:
import PyPDF2

# Initialize a list to store PDF names and their lengths
pdf_info = []

# Iterate through each PDF file in the subfolder
for filename in os.listdir(subfolder_path):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(subfolder_path, filename)
        
        # Open the PDF file and count the number of pages
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            num_pages = len(reader.pages)
            pdf_info.append((filename, num_pages))

# Calculate the total number of pages
total_pages = sum(num_pages for _, num_pages in pdf_info)

# Print the total number of pages
print(f"Total number of pages across all PDFs: {total_pages}")

# Create a DataFrame to display the PDF names and their lengths
pdf_summary_df = pd.DataFrame(pdf_info, columns=['PDF Name', 'Number of Pages'])

# Display the PDF summary DataFrame
pdf_summary_df


Total number of pages across all PDFs: 658


Unnamed: 0,PDF Name,Number of Pages
0,CyCADA: Cycle-Consistent Adversarial Domain Ad...,15
1,A Unified Approach to Interpreting Model Predi...,10
2,BERTopic: Neural topic modeling with a class-b...,10
3,2010.11929v2.pdf,22
4,1610.02391v4.pdf,23
5,Learning transferable visual models from natur...,48
6,Improving language understanding by generative...,12
7,Matching Networks for One Shot Learning.pdf,12
8,Instructpix2pix: Learning to follow image edit...,15
9,2106.09685v2.pdf,26


In [9]:
summary_df

Unnamed: 0,Used Papers,Unused Papers
0,Matching Networks for One Shot Learning,Attribute-based classification for zero-shot v...
1,Model-Agnostic Meta-Learning for Fast Adaptati...,Latent embeddings for zero-shot classification
2,Learning to Compare: Relation Network for Few-...,Semantic Autoencoder for Zero-Shot Learning
3,BERTopic: Neural topic modeling with a class-b...,Feature Generating Networks for Zero-Shot Lear...
4,Learning without Forgetting,Prototypical networks for few-shot learning
5,Unsupervised Visual Domain Adaptation Using Su...,Image-to-image translation with conditional ad...
6,A Simple Framework for Contrastive Learning of...,Unpaired image-to-image translation using cycl...
7,Momentum Contrast for Unsupervised Visual Repr...,A style-based generator architecture for gener...
8,,Progressive Growing of GANs for Improved Quali...
9,,Stargan: Unified generative adversarial networ...


In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize the Selenium WebDriver for Safari
driver = webdriver.Safari()

# Navigate to the arXiv website
driver.get("https://arxiv.org")



In [None]:
# Optionally, wait for the page to load and check for a specific element
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.title')))
    print("Successfully navigated to arXiv.")
except Exception as e:
    print(f"Error navigating to arXiv: {e}")


In [6]:
import requests
import os

# Specify the path to the subfolder for downloaded PDFs
subfolder_path = '/Users/Sloan/Desktop/Project_Desktop/School/5.2/ML Papers/PDFs'

# Create the subfolder if it doesn't exist
if not os.path.exists(subfolder_path):
    os.makedirs(subfolder_path)

# Iterate over each row in the DataFrame
for index, row in paper_df.iterrows():
    paper_name = row['Paper Name']
    # Construct the arXiv API URL for the paper
    api_url = f"http://export.arxiv.org/api/query?search_query=title:{paper_name.replace(' ', '+')}&start=0&max_results=1"
    # print(api_url)
    # Make a request to the arXiv API
    response = requests.get(api_url)
    # print(response.text)
    if response.status_code == 200:
        # # Check if any entries were returned
        # if '<entry>' not in response.text:
        #     print(f"No papers found for {paper_name}.")
        #     continue
        
        # Parse the response to find the PDF link
        pdf_link = None
        if '<link rel="alternate"' in response.text:
            start_index = response.text.index('<link rel="alternate"') + len('<link rel="alternate" href="')
            end_index = response.text.index('"', start_index)
            pdf_link = response.text[start_index:end_index]
        
        print(pdf_link)
        if pdf_link:
            # Extract the filename from the PDF link
            filename = f"{paper_name}.pdf"
            save_path = os.path.join(subfolder_path, filename)
            
            # Download the PDF file
            pdf_response = requests.get(pdf_link)
            if pdf_response.status_code == 200:
                with open(save_path, "wb") as file:
                    file.write(pdf_response.content)
                print(f"Downloaded: {filename}")
            else:
                print(f"Failed to download PDF for {paper_name}. Status code: {pdf_response.status_code}")
        else:
            print(f"No PDF link found for {paper_name}.")
    else:
        print(f"Failed to fetch data for {paper_name}. Status code: {response.status_code}")


None
No PDF link found for Attribute-based classification for zero-shot visual object categorization.
None
No PDF link found for Latent embeddings for zero-shot classification.
None
No PDF link found for Semantic Autoencoder for Zero-Shot Learning.
None
No PDF link found for Feature Generating Networks for Zero-Shot Learning.
None
No PDF link found for Prototypical networks for few-shot learning.
None
No PDF link found for Matching Networks for One Shot Learning.
None
No PDF link found for Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks.
None
No PDF link found for Learning to Compare: Relation Network for Few-Shot Learning.
None
No PDF link found for Image-to-image translation with conditional adversarial networks.
None
No PDF link found for Unpaired image-to-image translation using cycle-consistent adversarial networks.
None
No PDF link found for A style-based generator architecture for generative adversarial networks.
None
No PDF link found for Progressive Growing o