In [1]:
import base64
import os
import pandas as pd
import re

from collections import defaultdict
from github import Github, Organization

from src.model import text_preprocess

# using username and password
token = os.environ.get("GITUBCTOKEN2")
# Github Enterprise with custom hostname
g = Github(base_url="https://github.ubc.ca/api/v3", login_or_token=token)
student_repos = g.search_repositories(query="students in:repo:owner/name+pushed>2020-02-09+user:MDS-2019-20")

def refresh_data():

    results = defaultdict(list)
    file_extensions = r".*(\.md|\.py|\.Rmd|.ipynb)$"
    limit = 2

    # get all content
    count = 1
    for repo in student_repos:
        print(repo)
        if "MDS-2019-20" not in repo.full_name:
            print("Not MDS 2019-20")
            continue
        contents = repo.get_contents("")
        while contents:
            file_content = contents.pop(0)
            if file_content.type == "dir":
                contents.extend(repo.get_contents(file_content.path))
            else:
    #             print(file_content)
    #             print(file_content.name)
                results["repo_name"].append(repo.name)
                results["repo_full_name"].append(repo.full_name)
                results["file_name"].append(file_content.name)
                results["size"].append(file_content.size)
                results["path"].append(file_content.path)
                results["url"].append(file_content.url)
                
                if file_content.size > 1000000 or not re.match(file_extensions, file_content.name):
                    results["encoding"].append("not read")
                    results["content"].append("not read")
                    continue
                    
                results["encoding"].append(file_content.encoding)
                results["content"].append(base64.b64decode(file_content.content))
                
        if count > limit:
            break
        else:
            count += 1

    return results

In [2]:
results = refresh_data()

Repository(full_name="yinghuag/DSCI_542_comm-arg_students")
Not MDS 2019-20
Repository(full_name="shaunsun/DSCI_511_prog-dsci_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_511_prog-dsci_students")
Not MDS 2019-20
Repository(full_name="yujul/home")
Not MDS 2019-20
Repository(full_name="tarini24/DSCI_531_viz-1_students")
Not MDS 2019-20
Repository(full_name="cpsc221/site")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_521_platforms-dsci_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_542_comm-arg_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_531_viz-1_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_551_eda-dsci_students")
Not MDS 2019-20
Repository(full_name="alim1990/DSCI_532_viz-2_students")
Not MDS 2019-20
Repository(full_name="cemsinan/DSCI_552_stat-inf-1_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DSCI_561_regr-1_students")
Not MDS 2019-20
Repository(full_name="ubc-mds-2016/DS

In [3]:
df = pd.DataFrame(data=results)
df

Unnamed: 0,repo_name,repo_full_name,file_name,size,path,url,encoding,content
0,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,README.md,11495,README.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,b'# DSCI 542: Communication and Argumentation\...
1,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,better_data_science_pipeline.png,26624,img/better_data_science_pipeline.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
2,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,data_science_pipeline.png,31993,img/data_science_pipeline.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
3,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,2019-09-11 Week 1 Lab.pdf,8104420,lab presentations and solutions/2019-09-11 Wee...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
4,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,2019-09-18 Week 2 Lab.pdf,8075310,lab presentations and solutions/2019-09-18 Wee...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
...,...,...,...,...,...,...,...,...
383,DSCI_523_data-wrangling_students,MDS-2019-20/DSCI_523_data-wrangling_students,nycflights13.csv,402497,labs/solutions/lab4/data/nycflights13.csv,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
384,DSCI_523_data-wrangling_students,MDS-2019-20/DSCI_523_data-wrangling_students,Exercise 4.10-1.png,58737,labs/solutions/lab1/lab1_files/figure-gfm/Exer...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
385,DSCI_523_data-wrangling_students,MDS-2019-20/DSCI_523_data-wrangling_students,Exercise 4.10-2.png,60197,labs/solutions/lab1/lab1_files/figure-gfm/Exer...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read
386,DSCI_523_data-wrangling_students,MDS-2019-20/DSCI_523_data-wrangling_students,Exercise 4.10-3.png,59912,labs/solutions/lab1/lab1_files/figure-gfm/Exer...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,not read


In [5]:
df.loc[:,"content"]

0      b'# DSCI 542: Communication and Argumentation\...
1                                               not read
2                                               not read
3                                               not read
4                                               not read
                             ...                        
383                                             not read
384                                             not read
385                                             not read
386                                             not read
387                                             not read
Name: content, Length: 388, dtype: object

In [25]:
def test(x):
    if type(x) == str:
        return "it is a string"
    else:
        return x.decode("utf-8").lower()

In [26]:
# df["content_clean"] = 
df.loc[:,"content"].apply(test)

0      # dsci 542: communication and argumentation\n\...
1                                         it is a string
2                                         it is a string
3                                         it is a string
4                                         it is a string
                             ...                        
383                                       it is a string
384                                       it is a string
385                                       it is a string
386                                       it is a string
387                                       it is a string
Name: content, Length: 388, dtype: object

In [19]:
df.iloc[0, 8].decode("utf-8")

TypeError: descriptor 'decode' requires a 'bytes' object but received a 'str'

In [28]:
pd.read_csv("data/2020-03-01_student-repos.csv")

Unnamed: 0,repo_name,repo_full_name,file_name,size,path,url,encoding,content,content_clean
0,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,README.md,11495,README.md,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,base64,# dsci 542: communication and argumentation\n\...,dsci communication and argumentation todo da...
1,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,better_data_science_pipeline.png,26624,img/better_data_science_pipeline.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,better_data_science_pipeline.png,betterdatasciencepipelinepng
2,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,data_science_pipeline.png,31993,img/data_science_pipeline.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,data_science_pipeline.png,datasciencepipelinepng
3,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,2019-09-11 Week 1 Lab.pdf,8104420,lab presentations and solutions/2019-09-11 Wee...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,2019-09-11 Week 1 Lab.pdf,week labpdf
4,DSCI_542_comm-arg_students,MDS-2019-20/DSCI_542_comm-arg_students,2019-09-18 Week 2 Lab.pdf,8075310,lab presentations and solutions/2019-09-18 Wee...,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,2019-09-18 Week 2 Lab.pdf,week labpdf
...,...,...,...,...,...,...,...,...,...
2015,DSCI_524_collab-sw-dev_students,MDS-2019-20/DSCI_524_collab-sw-dev_students,delete_branches.png,327746,lectures/img/delete_branches.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,delete_branches.png,deletebranchespng
2016,DSCI_524_collab-sw-dev_students,MDS-2019-20/DSCI_524_collab-sw-dev_students,github_kanban.png,429312,lectures/img/github_kanban.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,github_kanban.png,githubkanbanpng
2017,DSCI_524_collab-sw-dev_students,MDS-2019-20/DSCI_524_collab-sw-dev_students,readr-milestones.png,252847,lectures/img/readr-milestones.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,readr-milestones.png,readrmilestonespng
2018,DSCI_524_collab-sw-dev_students,MDS-2019-20/DSCI_524_collab-sw-dev_students,view_branches.png,251613,lectures/img/view_branches.png,https://github.ubc.ca/api/v3/repos/MDS-2019-20...,not read,view_branches.png,viewbranchespng
