### For this project, we will be scraping data from GitHub repository README files. The goal will be to build a model that can predict what programming language a repository is, given the text of the README file.

#### Deliverables

1. A well-documented jupyter notebook that contains your analysis
2. Three or four google slides suitable for a general audience that summarize your findings. Include a well-labelled visualization in your slides.

In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
import re
from env import github_token, github_username
import scraper2
import model
import pandas as pd
import json
import seaborn as sns
import time
import prepare_r
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from wordcloud import WordCloud
import nltk
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from scipy import stats
import acquire2

plt.rc('figure', figsize=(10, 6))
plt.rc('font', size=14)
plt.style.use('seaborn-darkgrid')

### Data Acquisition:
1. Acquire list of repos we want to analyze using the scraper.py
2. Use acquire.py (Zach's code) to scrape readme files from Github

In [15]:
# List of urls for github
g_url = ['https://github.com/search?q=stars%3A%3E0&s=stars&type=Repositories', ' https://github.com/search?p=2&q=stars%3A%3E0&s=stars&type=Repositories'\
         'https://github.com/search?p=3&q=stars%3A%3E0&s=stars&type=Repositories', 'https://github.com/search?p=4&q=stars%3A%3E0&s=stars&type=Repositories',\
         'https://github.com/search?p=5&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=6&q=stars%3A%3E0&s=stars&type=Repositories',\
        'https://github.com/search?p=7&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=8&q=stars%3A%3E0&s=stars&type=Repositories',\
        'https://github.com/search?p=9&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=10&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=11&q=stars%3A%3E0&s=stars&type=Repositories', 'https://github.com/search?p=12&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=13&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=14&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=15&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=16&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=17&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=18&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=19&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=20&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=21&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=22&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=23&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=24&q=stars%3A%3E0&s=stars&type=Repositories',
        'https://github.com/search?p=25&q=stars%3A%3E0&s=stars&type=Repositories','https://github.com/search?p=26&q=stars%3A%3E0&s=stars&type=Repositories']

In [3]:
# g_url = ['https://github.com/search?o=desc&q=stars:%3E1&s=forks&type=Repositories']

In [16]:
# Scrape the name of github repos
repo_list = scraper2.get_repos(g_url)

In [18]:
#total number of repos scraped
len(repo_list)

250

In [19]:
# first five repos
repo_list [0:5]

['/freeCodeCamp/freeCodeCamp',
 '/996icu/996.ICU',
 '/vuejs/vue',
 '/EbookFoundation/free-programming-books',
 '/facebook/react']

In [20]:
# use regex to remove first slash from repo names
list1 = []
for repo in repo_list:
    repo1 = re.search(r'[a-z0-9A-Z].+', repo)[0]
    list1.append(repo1)
    

In [21]:
# Look at first five repo names
list1[0:5]

['freeCodeCamp/freeCodeCamp',
 '996icu/996.ICU',
 'vuejs/vue',
 'EbookFoundation/free-programming-books',
 'facebook/react']

In [22]:
list1

['freeCodeCamp/freeCodeCamp',
 '996icu/996.ICU',
 'vuejs/vue',
 'EbookFoundation/free-programming-books',
 'facebook/react',
 'tensorflow/tensorflow',
 'sindresorhus/awesome',
 'twbs/bootstrap',
 'jwasham/coding-interview-university',
 'kamranahmedse/developer-roadmap',
 'getify/You-Dont-Know-JS',
 'ohmyzsh/ohmyzsh',
 'CyC2018/CS-Notes',
 'donnemartin/system-design-primer',
 'github/gitignore',
 'flutter/flutter',
 'microsoft/vscode',
 'airbnb/javascript',
 'public-apis/public-apis',
 'torvalds/linux',
 'jlevy/the-art-of-command-line',
 'ytdl-org/youtube-dl',
 'axios/axios',
 'golang/go',
 'nodejs/node',
 'kubernetes/kubernetes',
 'justjavac/free-programming-books-zh_CN',
 'labuladong/fucking-algorithm',
 'microsoft/terminal',
 'denoland/deno',
 'ossu/computer-science',
 'animate-css/animate.css',
 'angular/angular',
 'tensorflow/models',
 'puppeteer/puppeteer',
 'microsoft/TypeScript',
 '30-seconds/30-seconds-of-code',
 'mrdoob/three.js',
 'ant-design/ant-design',
 'FortAwesome/Font-A

In [23]:
# The acquired data is read from data2.json and store in a dataframe 'df'
f = open('data3.json')
json_file = json.load(f)

df = pd.DataFrame(json_file)

In [24]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,freeCodeCamp/freeCodeCamp,JavaScript,![freeCodeCamp.org Social Banner](https://s3.a...
1,996icu/996.ICU,Rust,[996.ICU](https://996.icu/#/en_US)\n=======\n*...
2,vuejs/vue,JavaScript,"<p align=""center""><a href=""https://vuejs.org"" ..."
3,EbookFoundation/free-programming-books,,This page is available as an easy-to-read webs...
4,facebook/react,JavaScript,# [React](https://reactjs.org/) &middot; [![Gi...


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             250 non-null    object
 1   language         212 non-null    object
 2   readme_contents  250 non-null    object
dtypes: object(3)
memory usage: 6.0+ KB


In [26]:
# look at repos where language is 'None'
df[df.language.isnull()].head(5)

Unnamed: 0,repo,language,readme_contents
3,EbookFoundation/free-programming-books,,This page is available as an easy-to-read webs...
6,sindresorhus/awesome,,"<div align=""center"">\n\t<img width=""500"" heigh..."
8,jwasham/coding-interview-university,,# Coding Interview University\n\n> I originall...
9,kamranahmedse/developer-roadmap,,\n![Web Developer Roadmap - 2020](https://i.im...
10,getify/You-Dont-Know-JS,,# You Don't Know JS Yet (book series) - 2nd Ed...


In [27]:
df.language.value_counts()

JavaScript          74
Python              28
Java                17
TypeScript          16
Go                  16
C++                 12
C                    7
CSS                  5
Ruby                 5
Shell                4
HTML                 4
Swift                3
Vue                  3
Dockerfile           2
Dart                 2
Rust                 2
Kotlin               2
PHP                  2
Jupyter Notebook     1
SCSS                 1
Assembly             1
TeX                  1
C#                   1
Objective-C          1
Clojure              1
Vim script           1
Name: language, dtype: int64