In [6]:
import base64
import requests
import pandas as pd

from env import github_token, github_username
from acquire import search_github_repositories
from prepare import basic_clean, tokenize, remove_stopwords, preprocess_text_in_dataframe

[
    "awesome-artificial-intelligence",
    "Artificial-Intelligence",
    "opencog",
    "Artificial-Intelligence-Deep-Learning-Machine-Learning-Tutorials",
    "artificial-intelligence",
    "artificial-intelligence-for-trading",
    "Artificial-Intelligence-Terminology-Database",
    "stanford-cs-221-artificial-intelligence",
    "DataScience_ArtificialIntelligence_Utils",
    "mycroft-core",
    "Artificial-Intelligence-with-Python",
    "AI-Expert-Roadmap",
    "aifh",
    "Artificial-Intelligence-and-Machine-Learning",
    "simpleai",
    "awesome-artificial-intelligence-guidelines",
    "Artificial-Intelligence-Projects",
    "dreamtime",
    "snake",
    "Grokking-Artificial-Intelligence-Algorithms",
    "paip-lisp",
    "ruby-warrior",
    "warriorjs",
    "aiac",
    "Python-Artificial-Intelligence-Projects-for-Beginners",
    "machinelearning",
    "notes",
    "aima-python",
    "muzic"
]

In [2]:
search_query = "artificial intelligence"
repository_type = "repositories"
repositories = search_github_repositories(search_query, repository_type, per_page=100)


repo_data = []

for repo in repositories:
    repo_info = {
        "Name": repo["name"],
        "URL": repo["html_url"],
        "Description": repo["description"],
        "Readme": "",
    }

    if repo["has_wiki"]:
        readme_url = f"https://api.github.com/repos/{repo['full_name']}/readme"
        headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}
        response = requests.get(readme_url, headers=headers)

        if response.status_code == 200:
            readme_data = response.json()
            encoded_readme = readme_data.get("content", "")
            decoded_readme = base64.b64decode(encoded_readme).decode('utf-8')
            repo_info["Readme"] = decoded_readme

    repo_data.append(repo_info)

df = pd.DataFrame(repo_data)

In [5]:
df.head()

Unnamed: 0,Name,URL,Description,Readme
0,awesome-artificial-intelligence,https://github.com/owainlewis/awesome-artifici...,A curated list of Artificial Intelligence (AI)...,# Awesome Artificial Intelligence (AI) [![Awes...
1,Artificial-Intelligence,https://github.com/Niraj-Lunavat/Artificial-In...,"Awesome AI Learning with +100 AI Cheat-Sheets,...",All about AI with Cheat-Sheets(+100 Cheat-shee...
2,opencog,https://github.com/opencog/opencog,A framework for integrated Artificial Intellig...,
3,Artificial-Intelligence-Deep-Learning-Machine-...,https://github.com/TarrySingh/Artificial-Intel...,A comprehensive list of Deep Learning / Artifi...,# NEW LIST 2023 - 2024: Machine-Learning / Dee...
4,artificial-intelligence,https://github.com/udacity/artificial-intellig...,,# Artificial Intelligence Nanodegree Program R...


In [11]:
#convert all column names to lowercase
df.columns = [column.lower() for column in df.columns]

In [14]:
#apply the basic_clean function to the readme column
df['readme'] = df['readme'].apply(basic_clean)

In [15]:
#apply the tokenize function to the readme column
df['readme'] = df['readme'].apply(tokenize)

In [16]:
#apply remove_stopwords function to the readme column
df['readme'] = df['readme'].apply(remove_stopwords)

In [17]:
df.head()

Unnamed: 0,name,url,description,readme
0,awesome-artificial-intelligence,https://github.com/owainlewis/awesome-artifici...,A curated list of Artificial Intelligence (AI)...,awesome artificial intelligence ai awesomehttp...
1,Artificial-Intelligence,https://github.com/Niraj-Lunavat/Artificial-In...,"Awesome AI Learning with +100 AI Cheat-Sheets,...",ai cheatsheets100 cheatsheets free online book...
2,opencog,https://github.com/opencog/opencog,A framework for integrated Artificial Intellig...,
3,Artificial-Intelligence-Deep-Learning-Machine-...,https://github.com/TarrySingh/Artificial-Intel...,A comprehensive list of Deep Learning / Artifi...,new list 2023 2024 machinelearning deeplearnin...
4,artificial-intelligence,https://github.com/udacity/artificial-intellig...,,artificial intelligence nanodegree program res...


In [20]:
df.to_csv('gihub_repos.csv', index=False)