In [50]:
# !pip install beautifulsoup4 requests selenium

In [45]:
import requests
from bs4 import BeautifulSoup
import os

def scrape_github_repo(repo_url):
    
    visited_links = set()
    file_structure = []
    
    
    def dfs(folder_url):
        # Maintaining a variable to avoid duplicate calls
        if folder_url in visited_links:
            return
        visited_links.add(folder_url)
        
        print(folder_url)
        folder_response = requests.get(folder_url)
        if folder_response.status_code == 200:
            folder_html = folder_response.content
            folder_soup = BeautifulSoup(folder_html, 'html.parser')
            folder_links = folder_soup.find_all('a', class_='Link--primary')
            
#             print(folder_soup)

            for link in folder_links:
                name = link.get_text()
                if '.' in name:  
                    file_structure.append(name)
                else:  
#                     dfs(os.path.join(folder_url, name))
                    dfs(folder_url+'/'+name)


    # Get the HTML content of the repository page
    #     repo_url = repo_url.replace('https://', '').replace('http://', '')
    response = requests.get(repo_url)
    if response.status_code == 200:
        visited_links.add(repo_url)
        
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        
#         print(soup)
        
        # Find all file and folder links
        file_links = soup.find_all('a', class_='Link--primary')
        
#         print(file_links)

        # Start DFS from the root of the repository
        for link in file_links:
            
            name = link.get_text()
#             print(name)
            if '.' in name: 
                file_structure.append(name)
            else: 
#                 dfs(os.path.join(repo_url,'tree/master', name))
                dfs(repo_url + '/tree/master/' + name)
                break

        return file_structure

    else:
        print("Failed to retrieve repository content. Please check the URL and try again.")
        return None

# Example usage
repo_url = "https://github.com/ultralytics/yolov5"
file_structure = scrape_github_repo(repo_url)

# if file_structure:
#     with open('github_file_structure.txt', 'w') as f:
#         for item in file_structure:
#             f.write("%s\n" % item)
#     print("GitHub repository file structure saved to github_file_structure.txt")


https://github.com/ultralytics/yolov5/tree/master/classify


In [41]:
print(file_structure)

['.github', '.github']


## Now Selenium!!
- When you fetch the page content using the requests library, you're getting the initial HTML structure without the dynamically rendered content.
- To scrape such dynamically rendered content, you'll need to use a tool that can execute JavaScript, such as Selenium WebDriver, which can automate web browser interactions.

In [42]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--profile-directory=Default")

def scrape_github_repo(repo_url):
    visited_links = set()
    file_structure = {}

    # Initialize Chrome WebDriver
    service = Service("C:/Users/vikas/Downloads/chromedriver-win64/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)

    def dfs(folder_url, indent, file, file_names, folder_names):
        # Maintaining a variable to avoid duplicate calls
        if folder_url in visited_links:
            return
        visited_links.add(folder_url)

        driver.get(folder_url)
        page_source = driver.page_source
        folder_soup = BeautifulSoup(page_source, 'html.parser')
        folder_links = folder_soup.select('a.Link--primary:not([class*=" "])')

        for link in folder_links[::2]:
            name = link.get_text()
            
            print("| " * indent + "- " + name)
#             file.write("| " * indent + "- " + name + "\n")
#             if '.' not in name:
#                 dfs(folder_url + '/' + name, indent+1, file)
            
            if '.' in name:
                file_names.append(name)
            else:
                folder_names.append(name)
                file_structure[name] = {'files': [], 'folders': []}
                dfs(folder_url + '/' + name, indent + 1, file, file_structure[name]['files'], file_structure[name]['folders'])

    # Get the HTML content of the repository page
    driver.get(repo_url)
    visited_links.add(repo_url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    file_links = soup.select('a.Link--primary:not([class*=" "])')
    
    file_structure['root'] = {'files': [], 'folders': []}
    
    # Start DFS from the root of the repository.
    with open('file_structure.txt', 'w') as f:
        for link in file_links[::2]:
            name = link.get_text()
            print("- " + name)
#             f.write("- " + name + "\n")
            if '.' in name: 
                file_structure['root']['files'].append(name)
            else:
                file_structure['root']['folders'].append(name)
                file_structure[name] = {'files': [], 'folders': []}
                dfs(repo_url + '/tree/master/' + name, 1, f, file_structure[name]['files'], file_structure[name]['folders'])

    # Close the WebDriver
    
    return file_structure
    
    driver.quit()
    
# Example usage
# repo_url = "https://github.com/Sidd-R/attendance_app"
repo_url = "https://github.com/Vikas-Rajpurohit/Chat-Bot"
# repo_url = "https://github.com/Vikas-Rajpurohit/ML13-RNN-LSTM-Projects"
file_str = scrape_github_repo(repo_url)

- mobile_app
| - .bundle
| - __tests__
| | - App.test.tsx
| - android
| | - app
| | | - src
| | | | - debug
| | | | | - AndroidManifest.xml
| | | | - main
| | | | | - java/com/attendanceapp
| | | | | | - MainActivity.kt
| | | | | | - MainApplication.kt
| | | | | - res
| | | | | | - drawable
| | | | | | | - rn_edit_text_material.xml
| | | | | | | - splash.png
| | | | | | - layout
| | | | | | | - launch_screen.xml
| | | | | | - mipmap-hdpi
| | | | | | | - ic_launcher.png
| | | | | | | - ic_launcher_round.png
| | | | | | - mipmap-mdpi
| | | | | | | - ic_launcher.png
| | | | | | | - ic_launcher_round.png
| | | | | | - mipmap-xhdpi
| | | | | | | - ic_launcher.png
| | | | | | | - ic_launcher_round.png
| | | | | | - mipmap-xxhdpi
| | | | | | | - ic_launcher.png
| | | | | | | - ic_launcher_round.png
| | | | | | - mipmap-xxxhdpi
| | | | | | | - ic_launcher.png
| | | | | | | - ic_launcher_round.png
| | | | | | - values
| | | | | | | - colors.xml
| | | | | | | - strings.xml
| | | | | | | - styles

In [44]:
file_str

{'root': {'files': [], 'folders': ['mobile_app', 'server']},
 'mobile_app': {'files': ['.bundle',
   '.env',
   '.eslintrc.js',
   '.gitignore',
   '.prettierrc.js',
   '.watchmanconfig',
   'README.md',
   'app.json',
   'babel.config.js',
   'index.js',
   'jest.config.js',
   'metro.config.js',
   'package.json',
   'tsconfig.json'],
  'folders': ['__tests__', 'android', 'ios', 'src', 'types', 'Gemfile']},
 '__tests__': {'files': ['StyledText-test.js'], 'folders': []},
 'android': {'files': ['build.gradle',
   'gradle.properties',
   'gradlew.bat',
   'settings.gradle'],
  'folders': ['app', 'gradle/wrapper', 'gradlew']},
 'app': {'files': ['build.gradle', 'debug.keystore', 'proguard-rules.pro'],
  'folders': ['src']},
 'src': {'files': ['App.tsx'],
  'folders': ['assets',
   'components',
   'functions',
   'navigation',
   'screens',
   'utils']},
 'debug': {'files': ['AndroidManifest.xml'], 'folders': []},
 'main': {'files': ['AndroidManifest.xml'],
  'folders': ['java/com/attend

## Description for folders!

In [54]:
import os
from langchain.llms import GooglePalm
from IPython.display import Markdown
from langchain.prompts import PromptTemplate

api_key = 'AIzaSyB5MJwoZfpm3PLNLOrhhbzhuVYw0GvTXEQ'

llm = GooglePalm(google_api_key=api_key, temperature=0.2)

prompt = PromptTemplate.from_template("""Generate a concise description for a folder containing items like
{folder}

Description: """)

In [31]:
msg = prompt.format(folder="['.github', '.dockerignore', '.gitattributes', '.gitignore', 'CITATION.cff', 'CONTRIBUTING.md', 'README.md', 'README.zh-CN.md', 'benchmarks.py', 'detect.py', 'export.py', 'hubconf.py', 'pyproject.toml', 'requirements.txt', 'train.py', 'tutorial.ipynb', 'val.py', 'classify', 'data', 'models', 'segment', 'utils', 'LICENSE']")

response = llm.invoke(msg)

display(Markdown(response))

This folder contains the source code, config files, and training/evaluation scripts for mmdetection, including classify, data, models, segment, utils.

In [45]:
for name in file_str.keys():
    data = file_str[name]
    merged_names = data['files'] + data['folders']
    
    msg = prompt.format(folder=merged_names)

    response = llm.invoke(msg)
    
    print(f'One line description for folder {name}\n{response}\n\n')


One line description for folder root
Contains mobile app and server code


One line description for folder mobile_app
Contains React Native project files and folders, including android, ios, src, and types.


One line description for folder __tests__
Unit tests for StyledText component


One line description for folder android
Android Studio project with Gradle build files and app folder


One line description for folder app
Android app source code with build files


One line description for folder src
Contains all the source code for the React Native app, including the app's entry point, assets, components, functions, navigation, screens, and utils.


One line description for folder debug
Contains the AndroidManifest.xml file


One line description for folder main
Android app project with Java source code and resource files


One line description for folder java/com/attendanceapp
Kotlin source code for the main activity and application


One line description for folder res
Contains la

In [48]:
# Example usage
repo_url = "https://github.com/1SatyamJaiswal/event-ease"
# repo_url = "https://github.com/Vikas-Rajpurohit/ML13-RNN-LSTM-Projects"
file_str = scrape_github_repo(repo_url)

- client
| - public
| | - images
| | | - art-exhibition.jpg
| | | - college-fest.jpg
| | | - datahack-hackathon.jpg
| | | - garba-night.jpg
| | - pfps
| | | - 1.jpg
| | | - 2.jpg
| | | - 3.jpg
| | | - 4.jpg
| | | - 5.jpg
| | - next.svg
| | - vercel.svg
| - src
| | - app
| | | - auth
| | | | - page.tsx
| | | - events
| | | | - [id]
| | | | | - page.tsx
| | | | - page.tsx
| | | - profile
| | | | - page.tsx
| | | - favicon.ico
| | | - globals.css
| | | - layout.tsx
| | | - page.tsx
| | - components
| | | - Events.tsx
| | | - EventsList.tsx
| | | - EventsListYour.tsx
| | | - Footer.tsx
| | | - Navbar.tsx
| | - global.d.ts
| - .gitignore
| - README.md
| - next.config.js
| - package-lock.json
| - package.json
| - postcss.config.js
| - tailwind.config.ts
| - tsconfig.json
- server
| - config
| | - db.js
| | - secret.js
| - controllers
| | - event-controller.js
| | - registration-controller.js
| | - user-conroller.js
| - model
| | - Event.js
| | - Registration.js
| | - User.js
| - node_modules

| | | - LICENSE
| | | - README.md
| | | - README.md~
| | | - index.js
| | | - package.json
| | - content-disposition
| | | - HISTORY.md
| | | - LICENSE
| | | - README.md
| | | - index.js
| | | - package.json
| | - content-type
| | | - HISTORY.md
| | | - LICENSE
| | | - README.md
| | | - index.js
| | | - package.json
| | - cookie-signature
| | | - .npmignore
| | | - History.md
| | | - Readme.md
| | | - index.js
| | | - package.json
| | - cookie
| | | - HISTORY.md
| | | - LICENSE
| | | - README.md
| | | - SECURITY.md
| | | - index.js
| | | - package.json
| | - cors
| | | - lib
| | | | - index.js
| | | - CONTRIBUTING.md
| | | - HISTORY.md
| | | - LICENSE
| | | - README.md
| | | - package.json
| | - debug
| | | - src
| | | | - browser.js
| | | | - debug.js
| | | | - index.js
| | | | - inspector-log.js
| | | | - node.js
| | | - .coveralls.yml
| | | - .eslintrc
| | | - .npmignore
| | | - .travis.yml
| | | - CHANGELOG.md
| | | - LICENSE
| | | - Makefile
| | | - README.md
| | | - component.jso

KeyboardInterrupt: 

In [None]:
for name in file_str.keys():
    data = file_str[name]
    merged_names = data['files'] + data['folders']
    
    msg = prompt.format(folder=merged_names)

    response = llm.invoke(msg)
    
    print(f'One line description for folder {name}\n{response}\n\n')

In [7]:
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--profile-directory=Default")

def scrape_github_repo(repo_url):
    visited_links = set()
    file_structure = {}

    # Initialize Chrome WebDriver
    service = Service("C:/Users/vikas/Downloads/chromedriver-win64/chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=options)

    def dfs(folder_url, indent, file, file_names, folder_names):
        # Maintaining a variable to avoid duplicate calls
        if folder_url in visited_links:
            return
        visited_links.add(folder_url)

        driver.get(folder_url)
        page_source = driver.page_source
        folder_soup = BeautifulSoup(page_source, 'html.parser')
        folder_links = folder_soup.select('a.Link--primary:not([class*=" "])')

        for link in folder_links[::2]:
            name = link.get_text()
            
            print("| " * indent + "- " + name)
#             file.write("| " * indent + "- " + name + "\n")
#             if '.' not in name:
#                 dfs(folder_url + '/' + name, indent+1, file)
            
            if '.' in name:
                file_names.append(name)
            else:
                folder_names.append(name)
                file_structure[name] = {'files': [], 'folders': []}
                dfs(folder_url + '/' + name, indent + 1, file, file_structure[name]['files'], file_structure[name]['folders'])

    # Get the HTML content of the repository page
    driver.get(repo_url)
    visited_links.add(repo_url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    file_links = soup.select('a.Link--primary:not([class*=" "])')
    
    file_structure['root'] = {'files': [], 'folders': []}
    
    # Start DFS from the root of the repository.
    with open('file_structure.txt', 'w') as f:
        for link in file_links[::2]:
            name = link.get_text()
            print("- " + name)
#             f.write("- " + name + "\n")
            if '.' in name: 
                file_structure['root']['files'].append(name)
            else:
                file_structure['root']['folders'].append(name)
                file_structure[name] = {'files': [], 'folders': []}
                dfs(repo_url + '/tree/master/' + name, 1, f, file_structure[name]['files'], file_structure[name]['folders'])

    # Close the WebDriver
    
    return file_structure
    
    driver.quit()
    
# Example usage
# repo_url = "https://github.com/Sidd-R/attendance_app"
repo_url = "https://github.com/Vikas-Rajpurohit/Plant_Disease"
start = time.time()

# repo_url = "https://github.com/Vikas-Rajpurohit/ML13-RNN-LSTM-Projects"
file_str = scrape_github_repo(repo_url)

end = time.time()

print(f'\nTotal time for File Structure: {round(end-start, 2)} secs')

- images
| - Apple
| | - apple_black.jpeg
| | - apple_rust.jpeg
| | - apple_scab.jpeg
| - Bell Pepper
| | - Healthy.jpeg
| | - Spot.jpeg
| - Citrus
| | - CitrusBlackSpot(105).png
| | - CitrusCanker(100).png
| | - CitrusGreening(100).jpeg
| | - CitrusHealthy(11).png
| - Corn
| | - CornGrayLeafSpot(1).JPG
| | - CornHealthy(104).jpg
| | - CornNorthernLeafBlight(63).JPG
| - Grape
| | - GrapeBlackRot(1004).jpeg
| | - GrapeHealthy(1004).jpeg
| | - GrapeIsariopsisLeafSpot(1011).jpeg
- models
| - apple.h5
| - bellpepper.h5
| - citrus.h5
| - corn.h5
| - grape.h5
- .gitignore
- Crop Disease.ipynb
- MobileNetV2.ipynb
- app.py

Total time for File Structure: 10.13 secs


In [2]:
file_str

{'root': {'files': ['.gitignore',
   'Crop Disease.ipynb',
   'MobileNetV2.ipynb',
   'app.py'],
  'folders': ['images', 'models']},
 'images': {'files': [],
  'folders': ['Apple', 'Bell Pepper', 'Citrus', 'Corn', 'Grape']},
 'Apple': {'files': ['apple_black.jpeg', 'apple_rust.jpeg', 'apple_scab.jpeg'],
  'folders': []},
 'Bell Pepper': {'files': ['Healthy.jpeg', 'Spot.jpeg'], 'folders': []},
 'Citrus': {'files': ['CitrusBlackSpot(105).png',
   'CitrusCanker(100).png',
   'CitrusGreening(100).jpeg',
   'CitrusHealthy(11).png'],
  'folders': []},
 'Corn': {'files': ['CornGrayLeafSpot(1).JPG',
   'CornHealthy(104).jpg',
   'CornNorthernLeafBlight(63).JPG'],
  'folders': []},
 'Grape': {'files': ['GrapeBlackRot(1004).jpeg',
   'GrapeHealthy(1004).jpeg',
   'GrapeIsariopsisLeafSpot(1011).jpeg'],
  'folders': []},
 'models': {'files': ['apple.h5',
   'bellpepper.h5',
   'citrus.h5',
   'corn.h5',
   'grape.h5'],
  'folders': []}}

In [55]:
for name in file_str.keys():
    data = file_str[name]
    merged_names = data['files'] + data['folders']
    
    msg = prompt.format(folder=merged_names)

    response = llm.invoke(msg)
    
    print(f'One line description for folder {name}\n{response}\n\n')


One line description for folder root
Contains code, images, and models for a plant disease detection app


One line description for folder images
Fruits and vegetables


One line description for folder Apple
3 images of apples with different diseases


One line description for folder Bell Pepper
Pictures of a cat


One line description for folder Citrus
4 images of citrus diseases and healthy citrus


One line description for folder Corn
Images of corn with different diseases


One line description for folder Grape
3 images of grapes with different diseases


One line description for folder models
5 food items in h5 format


