# Imports

In [1]:
import scraper
import src.acquire
import prepare_r

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from requests import get
from bs4 import BeautifulSoup as BS

import re
import nltk

# Acquire

In [2]:
df = pd.read_json('data.json')

In [3]:
print(df.shape)
df.head()

(110, 3)


Unnamed: 0,repo,language,readme_contents
0,jenkins-docs/simple-java-maven-app,Java,# simple-java-maven-app\n\nThis repository is ...
1,Trinea/android-open-project,,Android 开源项目分类汇总，更全更新可见 [codekk.com](https://p...
2,trekhleb/javascript-algorithms,JavaScript,# Algoritmos y Estructuras de Datos en JavaScr...
3,udacity/fullstack-nanodegree-vm,Python,# Full Stack Web Developer Nanodegree program ...
4,scutan90/DeepLearning-500-questions,,# 禁止转载，禁止转载，禁止转载！\n# GitHub上非最新内容，最新内容请期待新书。\n...


# Prepare

In [4]:
df = prepare_r.prep_contents(df)

In [5]:
df.language.value_counts()

JavaScript          25
Python              14
Java                 9
TypeScript           6
C++                  5
HTML                 5
CSS                  5
Go                   4
C                    4
Jupyter Notebook     3
Shell                2
PHP                  2
C#                   2
Vue                  2
ApacheConf           1
R                    1
Groovy               1
Ruby                 1
Kotlin               1
Rust                 1
Dart                 1
Scala                1
Name: language, dtype: int64

In [6]:
df['language'] = df.language.replace(['PHP','Shell','Kotlin','Vue','ApacheConf','Jupyter Notebook','R','Groovy',\
                                      'Scala','Rust','C#','Dart','Ruby'], 'Other')

In [7]:
df = df[df.language != 'Other']

In [8]:
df.language.value_counts()

JavaScript    25
Python        14
Java           9
TypeScript     6
CSS            5
C++            5
HTML           5
C              4
Go             4
Name: language, dtype: int64

## Handling Null Values

In [9]:
df.shape

(91, 8)

In [10]:
df.dropna(inplace=True)

In [11]:
df.reset_index(inplace=True)

In [12]:
df.head()

Unnamed: 0,index,repo,language,readme_contents,original,normalized,stemmed,lemmatized,cleaned
0,0,jenkins-docs/simple-java-maven-app,Java,# simple-java-maven-app\n\nThis repository is ...,# simple-java-maven-app\n\nThis repository is ...,simple java maven app this repository for th...,simpl java maven app thi repositori for the bu...,simple java maven app this repository for the ...,simple java maven app repository build java ap...
1,2,trekhleb/javascript-algorithms,JavaScript,# Algoritmos y Estructuras de Datos en JavaScr...,# Algoritmos y Estructuras de Datos en JavaScr...,algoritmos estructuras datos javascript ...,algoritmo estructura dato javascript build sta...,algoritmos estructuras datos javascript build ...,algoritmos estructuras datos javascript build ...
2,3,udacity/fullstack-nanodegree-vm,Python,# Full Stack Web Developer Nanodegree program ...,# Full Stack Web Developer Nanodegree program ...,full stack web developer nanodegree program vi...,full stack web develop nanodegre program virtu...,full stack web developer nanodegree program vi...,full stack web developer nanodegree program vi...
3,6,ytdl-org/youtube-dl,Python,[![Build Status](https://travis-ci.org/ytdl-or...,[![Build Status](https://travis-ci.org/ytdl-or...,build status https travis org ytdl org you...,build statu http travi org ytdl org youtub svg...,build status http travis org ytdl org youtube ...,build status travis org ytdl org youtube svg b...
4,7,josephmisiti/awesome-machine-learning,Python,# Awesome Machine Learning [![Awesome](https:/...,# Awesome Machine Learning [![Awesome](https:/...,awesome machine learning awesome https c...,awesom machin learn awesom http cdn rawgit com...,awesome machine learning awesome http cdn rawg...,awesome machine learning awesome cdn rawgit co...


# Feature Engineering

## Word Count

In [13]:
df['readme_words'] = df['cleaned'].apply(lambda x: len(x.split()))

In [14]:
df.readme_words.isna().sum()

0

## Sentance Count

# Explore

Using the cleaned readme with the stop words removed, let's look at what the most re-occuring words are

# Modeling

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [16]:
tfidf = TfidfVectorizer()
sparse = tfidf.fit_transform(df.cleaned)
data = pd.DataFrame(sparse.todense(), columns=tfidf.get_feature_names())

In [17]:
data.head()

Unnamed: 0,aac,aafb,aan,aanzee,aardvark,aaugustin,ab,abacus,abajo,abbott,...,zuhao,zulip,zulipchat,zulko,zvcm,zybpzd,zygmuntz,zynga,zyszys,zzm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.001774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001774,0.0,0.0,0.0,0.0,0.0,0.019512,0.0,0.0,0.0


In [18]:
word_count = df[['readme_words']]

In [19]:
joined = pd.concat([data, word_count], axis=1)

In [20]:
joined.head()

Unnamed: 0,aac,aafb,aan,aanzee,aardvark,aaugustin,ab,abacus,abajo,abbott,...,zulip,zulipchat,zulko,zvcm,zybpzd,zygmuntz,zynga,zyszys,zzm,readme_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,815
3,0.006152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7716
4,0.0,0.001774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.019512,0.0,0.0,0.0,15003


In [21]:
X = joined
y = df.language

## Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [24]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lm = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lm.predict(X_train)
test['predicted'] = lm.predict(X_test)

In [25]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 31.25%
---
Confusion Matrix
actual      C  C++  CSS  Go  HTML  Java  JavaScript  Python  TypeScript
predicted                                                              
HTML        1    0    1   0     0     0           0       1           0
JavaScript  0    1    0   1     1     2           5       2           1
---
              precision    recall  f1-score   support

           C       0.00      0.00      0.00         1
         C++       0.00      0.00      0.00         1
         CSS       0.00      0.00      0.00         1
          Go       0.00      0.00      0.00         1
        HTML       0.00      0.00      0.00         1
        Java       0.00      0.00      0.00         2
  JavaScript       0.38      1.00      0.56         5
      Python       0.00      0.00      0.00         3
  TypeScript       0.00      0.00      0.00         1

    accuracy                           0.31        16
   macro avg       0.04      0.11      0.06        16
weighted avg       0

## Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [28]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

tree = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)

train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

In [29]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 31.25%
---
Confusion Matrix
actual      C  C++  CSS  Go  HTML  Java  JavaScript  Python  TypeScript
predicted                                                              
C++         0    1    0   0     0     0           0       1           0
CSS         1    0    1   1     0     1           4       1           1
Java        0    0    0   0     0     1           0       0           0
JavaScript  0    0    0   0     1     0           1       0           0
Python      0    0    0   0     0     0           0       1           0
---
              precision    recall  f1-score   support

           C       0.00      0.00      0.00         1
         C++       0.50      1.00      0.67         1
         CSS       0.10      1.00      0.18         1
          Go       0.00      0.00      0.00         1
        HTML       0.00      0.00      0.00         1
        Java       1.00      0.50      0.67         2
  JavaScript       0.50      0.20      0.29         5
      Python       1.

## KNN

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

In [32]:
train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

knn = KNeighborsClassifier(n_neighbors=y).fit(X_train, y_train)

train['predicted'] = knn.predict(X_train)
test['predicted'] = knn.predict(X_test)

In [33]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 25.00%
---
Confusion Matrix
actual      C  C++  CSS  Go  HTML  Java  JavaScript  Python  TypeScript
predicted                                                              
C           0    0    0   0     0     1           0       0           0
C++         0    0    0   1     0     0           0       0           0
HTML        0    0    1   0     1     0           1       0           0
JavaScript  1    1    0   0     0     1           2       2           1
Python      0    0    0   0     0     0           2       1           0
---
              precision    recall  f1-score   support

           C       0.00      0.00      0.00         1
         C++       0.00      0.00      0.00         1
         CSS       0.00      0.00      0.00         1
          Go       0.00      0.00      0.00         1
        HTML       0.33      1.00      0.50         1
        Java       0.00      0.00      0.00         2
  JavaScript       0.25      0.40      0.31         5
      Python       0.