In [53]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer

import acquire
import prepare

import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Fixed seed for reproducibility; pass it as `random_state` to any stochastic
# sklearn estimator so that re-running the notebook gives the same results.
random_state = 42

In [54]:
# Load the repo records from data.json and hand them straight to the
# project's prep_repos step; `df` ends up holding the prepared frame.
df = prepare.prep_repos(pd.read_json('data.json'))

In [55]:
# Keep the three languages modelled individually and collapse every other
# value (including missing ones) into a single 'Other' class.
df['language_reduced'] = df.language.where(
    df.language.isin(['JavaScript', 'HTML', 'Python']),
    'Other',
)

In [56]:
# Preview the first rows to sanity-check the prepared text columns and the
# newly added language_reduced label.
df.head()

Unnamed: 0,repo,language,original,clean,stemmed,lemmatized,language_reduced
0,google/googletest,C++,# GoogleTest\n\n### Announcements\n\n#### Live...,googletest announcements live head googletest ...,googletest announc live head googletest follow...,googletest announcement live head googletest f...,Other
1,projectdiscovery/nuclei-templates,Python,"\n\n<h1 align=""center"">\nNuclei Templates\n</h...",h1 aligncenter nuclei templates h1 h4 aligncen...,h1 aligncent nuclei templat h1 h4 aligncenterc...,h1 aligncenter nucleus template h1 h4 aligncen...,Python
2,digitalocean/nginxconfig.io,JavaScript,[![GitHub stars](https://img.shields.io/github...,github starshttpsimgshieldsiogithubstarsdigita...,github starshttpsimgshieldsiogithubstarsdigita...,github starshttpsimgshieldsiogithubstarsdigita...,JavaScript
3,flutter/flutter,Dart,# [![Flutter logo][]][flutter.dev]\n\n[![Build...,flutter logoflutterdev build status cirrusbuil...,flutter logoflutterdev build statu cirrusbuild...,flutter logoflutterdev build status cirrusbuil...,Other
4,PaddlePaddle/PaddleOCR,Python,"English | [简体中文](README_ch.md)\n\n<p align=""ce...",english readmechmd p aligncenter img srcdocpad...,english readmechmd p aligncent img srcdocpaddl...,english readmechmd p aligncenter img srcdocpad...,Python


In [57]:
# Column names used downstream: `corpus` is the text column fed to the
# vectorizer, `target` is the label column the classifiers predict.
corpus, target = 'lemmatized', 'language_reduced'

In [58]:
# Pull out the text feature and the label using the column names chosen in
# `corpus` and `target`.
x, y = df[corpus], df[target]

# Unpack the six objects returned by the project's split helper
# (features and labels for train, validate and test, in that order).
(x_train, y_train,
 x_validate, y_validate,
 x_test, y_test) = prepare.train_validate_test_split(x, y)

train	 n = 60
train	 n = 60
validate n = 27
train	 n = 27
test	 n = 22
train	 n = 22


In [59]:
def run_baseline(y_train, 
                 y_validate, 
                 model_number, 
                 model_results):
    """Record the accuracy of a most-frequent-class baseline on the train sample.

    The baseline always predicts the modal value of ``y_train``; its accuracy
    is therefore the share of train rows that carry that value.

    Parameters
    ----------
    y_train : pandas.Series
        Training labels. The first mode is used as the constant prediction.
    y_validate : pandas.Series
        Accepted for signature symmetry with the other ``run_*`` helpers;
        not used by the baseline.
    model_number : int
        Ignored on input; the counter is reset so that the first real model
        is numbered 1.
    model_results : pandas.DataFrame
        Results table to extend (may be empty).

    Returns
    -------
    tuple of (int, pandas.DataFrame)
        ``0`` and a new results table with one extra ``'baseline'`` row.
    """
    # the baseline "model" is a constant prediction of the most common class
    baseline_class = y_train.mode()[0]

    # accuracy of a constant prediction == fraction of labels equal to it
    accuracy = float((y_train == baseline_class).mean())

    row = {'model_number': 'baseline',
           'model_type': 'baseline',
           'sample_type': 'train',
           'accuracy': accuracy}

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows and also works on older pandas versions
    model_results = pd.concat([model_results, pd.DataFrame([row])],
                              ignore_index=True)

    # reset the model_number from 'baseline' to 0
    model_number = 0

    return model_number, model_results

In [60]:
# Seed the bookkeeping with a zero counter and an empty results table, then
# let run_baseline hand back the updated pair.
model_number, model_results = run_baseline(
    y_train=y_train,
    y_validate=y_validate,
    model_number=0,
    model_results=pd.DataFrame(),
)

In [61]:
# Display the counter returned by run_baseline.
model_number

0

In [62]:
# Display the results table accumulated so far.
model_results

Unnamed: 0,model_number,model_type,sample_type,accuracy
0,baseline,baseline,train,0.65


In [63]:
def run_decision_tree(x_train, y_train, 
                      x_validate, y_validate,
                      model_number, model_results,
                      min_max_depth=3, max_max_depth=10):
    """Fit decision trees over a range of depths on TF-IDF features.

    One ``DecisionTreeClassifier`` is fitted per ``max_depth`` in
    ``[min_max_depth, max_max_depth]`` (inclusive). For every tree, two rows
    are added to the results table: accuracy on the train sample and accuracy
    on the validate sample.

    Parameters
    ----------
    x_train, x_validate : iterable of str
        Raw text documents for the train / validate samples.
    y_train, y_validate : array-like
        Labels aligned with ``x_train`` / ``x_validate``.
    model_number : int
        Number of the last model already recorded; incremented once per tree.
    model_results : pandas.DataFrame
        Results table to extend.
    min_max_depth, max_max_depth : int, optional
        Inclusive bounds of the ``max_depth`` values to try (defaults 3 and 10).

    Returns
    -------
    tuple of (int, pandas.DataFrame)
        The updated model counter and a new results table containing the
        added rows (train row first, then validate row, for each depth).
    """
    model_type = 'decision_tree'

    #################
    #### TF-IDF #####
    #################
    feature_type = 'TF-IDF'

    # The vectorizer is learned from the train sample only and does not depend
    # on max_depth, so fit it and transform both samples once, outside the loop.
    tfidf = TfidfVectorizer().fit(x_train)
    x_train_tfidf = tfidf.transform(x_train)
    x_validate_tfidf = tfidf.transform(x_validate)

    samples = (('train', x_train_tfidf, y_train),
               ('validate', x_validate_tfidf, y_validate))

    rows = []
    for max_depth in range(min_max_depth, max_max_depth + 1):

        model_number += 1

        # Seed the tree with the notebook-level `random_state` so that ties
        # between equally good splits are broken the same way on every run.
        clf = DecisionTreeClassifier(max_depth=max_depth,
                                     random_state=random_state)
        clf.fit(x_train_tfidf, y_train)

        # one row per sample, recording the features and hyperparameters
        # used in this model instance
        for sample_type, features, labels in samples:
            rows.append({'model_number': model_number,
                         'model_type': model_type,
                         'sample_type': sample_type,
                         'feature_type': feature_type,
                         'max_depth': max_depth,
                         'accuracy': clf.score(features, labels)})

    # DataFrame.append was removed in pandas 2.0; add all new rows with a
    # single concat instead of growing the frame row by row.
    if rows:
        model_results = pd.concat([model_results, pd.DataFrame(rows)],
                                  ignore_index=True)

    return model_number, model_results

In [64]:
# Extend the results table with the decision-tree runs; the counter carries
# on from wherever the previous step left it.
model_number, model_results = run_decision_tree(
    x_train=x_train,
    y_train=y_train,
    x_validate=x_validate,
    y_validate=y_validate,
    model_number=model_number,
    model_results=model_results,
)

In [None]:
def 