## Packages

In [1]:
from requests import get
import numpy as np
from bs4 import BeautifulSoup
import bs4
import pandas as pd
import re
import requests
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import prepare
import acquire

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

from env import github_token, github_username

# Request headers built from the credentials imported from env: a token-style Authorization
# value plus the username as the User-Agent.
# NOTE(review): nothing in the visible notebook cells reads `headers` - confirm it is still needed.
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username,
}

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samkeeler/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samkeeler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Acquire

In [2]:
# Load the repository data. Per the original note for this step, a saved csv is used when it is
# present and the scrape_github_data function is run otherwise.
df = acquire.get_repo_data(cached=True)

## Prepare

In [3]:
# Step 1 - clean the readme text with prepare.prep_repos. Per the original notes for this step, it
# applies make_prepped_columns(), keeps only the original and the lemmatized readme columns, drops
# rows with a null readme or a null language, drops rows whose language appears fewer than twice
# (such a class cannot be split), and removes stopwords.
df = prepare.prep_repos(df)

# Step 2 - add engineered features. Per the original notes, this searches each readme for mentions
# of a coding language and stores what it finds in "languages_in_readme", and it also adds a
# "readme_length" feature.
df = prepare.add_language_dummies_and_length_feature(df)

## Split

In [4]:
# Partition the prepared frame into train / validate / test, stratifying on the target column
train, validate, test = prepare.split(df, stratify_by="language")

## Explore

In [5]:
# Number of repos per language across the whole (unsplit) dataset
df["language"].value_counts()

Python              21
Java                20
C++                 16
PHP                 15
JavaScript          13
Shell               13
C#                  11
HTML                10
C                   10
Ruby                 9
Jupyter Notebook     4
TypeScript           3
Go                   3
Name: language, dtype: int64

In [6]:
# Most common words across all readmes: join every cleaned readme into one string, split it back
# into single-space-delimited tokens, and count how often each token occurs
the_words = " ".join(df["readme_contents_clean"])
all_freq = pd.Series(the_words.split(" ")).value_counts()

# Show the 20 most frequent tokens
all_freq.head(20)

default     405
install     207
code        194
new         192
package     188
function    163
run         148
see         143
data        138
release     138
need        137
project     137
node        136
public      135
user        134
newrelic    133
set         130
name        128
source      127
example     126
dtype: int64

In [7]:
# Rows where at least one coding language was detected inside the readme text.
# Caveat from the original analysis: a " go " match may just be the English word rather than the
# Go language.
df[df["languages_in_readme"].notna()]

Unnamed: 0,repo,language,readme_contents_clean,readme_length,languages_in_readme,has_python,has_php,has_html,has_typescript,has_ruby,has_shell,has_c++,has_java,has_javascript,has_go
5,WPO-Foundation/webpagetest,PHP,webpagetesttravishttpsimgshieldsiotraviswpofou...,1100,"' python ', ' php '",1,1,0,0,0,0,0,0,0,0
7,angular/material-start,JavaScript,angularjs materialstart es6gitterhttpsbadgesgi...,4240,"' html ', ' typescript'",0,0,1,1,0,0,0,0,0,0
8,transmission/transmission,C,abouttransmission fast easy free bittorrent cl...,1818,' shell ',0,0,0,0,0,1,0,0,0,0
10,tegon/clone-org-repos,JavaScript,clone github organization reposnpmhttpsnodeico...,2562,' ruby ',0,0,0,0,1,0,0,0,0,0
13,geotools/geotools,Java,geotools logogeotoolslogopnggeotoolshttpgeotoo...,1252,' java ',0,0,0,0,0,0,0,1,0,0
17,DSpace/xoai,Java,xoaiwhat xoaixoai powerful flexible oaipmh jav...,784,' java ',0,0,0,0,0,0,0,1,0,0
19,cpina/github-action-push-to-another-repository,Shell,githubactionpushtoanotherrepositorywhen github...,2916,' go ',0,0,0,0,0,0,0,0,0,1
28,offensive-security/exploitdb,C,exploit database git repositorythis official e...,5755,' python ',1,0,0,0,0,0,0,0,0,0
29,Kitware/CMake,C,cmakeintroductioncmake crossplatform opensourc...,2554,' c++ ',0,0,0,0,0,0,1,0,0,0
30,helm/chart-releaser,Go,chart releaserlicensehttpsimgshieldsiobadgelic...,5170,"' java ', ' go '",0,0,0,0,0,0,0,1,0,1


In [8]:
# Viewing the top words of the three most frequent coding languages in the dataset

# Cleaned readme text of every train row for each language, joined into one string per language
python_words = ' '.join(train[train.language == 'Python'].readme_contents_clean)
java_words = ' '.join(train[train.language == 'Java'].readme_contents_clean)
cplusplus_words = ' '.join(train[train.language == 'C++'].readme_contents_clean)

# Token counts per language (value_counts orders them from most to least frequent)
python_freq = pd.Series(python_words.split(' ')).value_counts()
java_freq = pd.Series(java_words.split(' ')).value_counts()
cplusplus_freq = pd.Series(cplusplus_words.split(' ')).value_counts()

# One print() per statement. Chaining print() calls with commas builds a tuple of their None
# return values, which the notebook then echoes as "(None, None, None, None)" below the real output.
top_words_by_language = [('Python', python_freq), ('Java', java_freq), ('C++', cplusplus_freq)]
for position, (language_name, word_freq) in enumerate(top_words_by_language):
    if position > 0:
        print("\n")  # blank-line separator between languages, as in the original output
    print(f'Popular Words in {language_name}')
    print(word_freq.head())

Popular Words in Python
module    67
code      33
depth     31
python    30
new       30
dtype: int64


Popular Words in Java
server     40
nexus      28
project    28
public     26
update     23
dtype: int64


Popular Words in C++
cmake      29
release    24
library    22
project    22
install    21
dtype: int64


(None, None, None, None)

In [9]:
# Mean readme length for each language in the train split, shortest first
train.groupby("language")["readme_length"].mean().sort_values()

language
Shell                717.857143
Jupyter Notebook    1311.000000
TypeScript          1549.000000
HTML                2071.333333
PHP                 2165.500000
JavaScript          2471.714286
C#                  2754.500000
C                   2806.833333
Python              2824.833333
C++                 3206.222222
Ruby                3480.000000
Java                3935.181818
Go                  4902.000000
Name: readme_length, dtype: float64

## Prep for Modeling

In [10]:
# Drops the 'languages_in_readme' and 'repo' columns, then re-splits the data into train,
# validate, and test.
# drop() returns a new frame instead of mutating in place, and errors='ignore' lets this cell be
# re-run after the columns are already gone (the inplace version raised a KeyError on a second run).
df = df.drop(columns=['languages_in_readme', 'repo'], errors='ignore')
train, validate, test = prepare.split(df, stratify_by='language')

In [11]:
# Feature frames for modeling: each split with the target column ('language') removed
X_train, X_validate, X_test = (
    split_frame.drop(columns=['language']) for split_frame in (train, validate, test)
)

In [12]:
# Target series for modeling: the 'language' column of each split
y_train, y_validate, y_test = (
    split_frame['language'] for split_frame in (train, validate, test)
)

In [13]:
# Build the TF-IDF vectorizer and fit it on the train readmes only, so validate and test are
# transformed with what was learned from train
tfidf = TfidfVectorizer().fit(X_train['readme_contents_clean'])

# Apply the one fitted vectorizer to the cleaned readme text of every split
X_train_vectorized = tfidf.transform(X_train['readme_contents_clean'])
X_validate_vectorized = tfidf.transform(X_validate['readme_contents_clean'])
X_test_vectorized = tfidf.transform(X_test['readme_contents_clean'])

## Logistic Regression

In [14]:
# Instantiate a logistic regression with default settings and train it on the TF-IDF features
# of the train split; the fitted estimator is the cell's displayed result
lm = LogisticRegression()
lm.fit(X=X_train_vectorized, y=y_train)

LogisticRegression()

In [15]:
# One results frame per split, seeded with the true language in an "actual" column; the
# evaluation cells add one "predicted_*" column per model to these frames.
# NOTE: this rebinds train / validate / test, so the split feature frames are no longer reachable
# under those names once this cell has run.
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

In [16]:
# Applying and evaluating the logistic regression model

# Store the model's predictions next to the actual labels for the train and validate splits
train['predicted_logreg'] = lm.predict(X_train_vectorized)
validate['predicted_logreg'] = lm.predict(X_validate_vectorized)

# Accuracy = share of rows whose predicted language equals the actual language.
# Printed with one statement each: joining the print() calls with a comma builds a tuple of their
# None return values, which the notebook echoes as "(None, None)".
print('Train:', (train.actual == train.predicted_logreg).mean())
print('Validate:', (validate.actual == validate.predicted_logreg).mean())

Train: 0.9390243902439024
Validate: 0.3055555555555556


(None, None)

## Gaussian Naive Bayes

In [17]:
# Instantiate a Gaussian naive Bayes classifier with default settings and train it on the train
# split; the TF-IDF matrix is converted with .toarray() before being handed to fit
gnb = GaussianNB()
gnb.fit(X=X_train_vectorized.toarray(), y=y_train)

GaussianNB()

In [22]:
# Applying and evaluating the Gaussian naive Bayes model

# Store the model's predictions next to the actual labels for the train and validate splits.
# The TF-IDF matrices are converted with .toarray(), matching the array form the model was fit on.
train['predicted_gnb'] = gnb.predict(X_train_vectorized.toarray())
validate['predicted_gnb'] = gnb.predict(X_validate_vectorized.toarray())

# Accuracy = share of rows whose predicted language equals the actual language.
# Printed with one statement each: joining the print() calls with a comma builds a tuple of their
# None return values, which the notebook echoes as "(None, None)".
print('Train:', (train.actual == train.predicted_gnb).mean())
print('Validate:', (validate.actual == validate.predicted_gnb).mean())

Train: 1.0
Validate: 0.3333333333333333


(None, None)