In [21]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import re

import acquire
import prepare

import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB

# fixed seed made available to the rest of the notebook
random_state = 42

# raise pandas' display limits so wide / long frames are not truncated
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

In [2]:
# read data.json and pass it straight through prepare.prep_repos, so `df`
# is bound exactly once in this cell
df = prepare.prep_repos(pd.read_json('data.json'))

In [3]:
# preview the first rows of the prepared frame
df.head()

Unnamed: 0,repo,language,original,clean,stemmed,lemmatized,language_reduced
0,google/googletest,C++,# GoogleTest\n\n### Announcements\n\n#### Live...,googletest announcements live head googletest ...,googletest announc live head googletest follow...,googletest announcement live head googletest f...,Other
1,projectdiscovery/nuclei-templates,Python,"\n\n<h1 align=""center"">\nNuclei Templates\n</h...",h1 aligncenter nuclei templates h1 h4 aligncen...,h1 aligncent nuclei templat h1 h4 aligncenterc...,h1 aligncenter nucleus template h1 h4 aligncen...,Python
2,digitalocean/nginxconfig.io,JavaScript,[![GitHub stars](https://img.shields.io/github...,github stars https imgshieldsio github stars d...,github star http imgshieldsio github star digi...,github star http imgshieldsio github star digi...,JavaScript
3,flutter/flutter,Dart,# [![Flutter logo][]][flutter.dev]\n\n[![Build...,flutter logo flutterdev build status cirrus bu...,flutter logo flutterdev build statu cirru buil...,flutter logo flutterdev build status cirrus bu...,Other
4,PaddlePaddle/PaddleOCR,Python,"English | [简体中文](README_ch.md)\n\n<p align=""ce...",english readmechmd p aligncenter img src doc p...,english readmechmd p aligncent img src doc pad...,english readmechmd p aligncenter img src doc p...,Python


In [4]:
# name of the column the classifiers below are trained to predict
target = "language_reduced"

In [5]:
# split the prepared frame into train / validate / test samples using the
# project's prepare module; the sample sizes are printed in the cell output
train, validate, test = prepare.split_data(df, target)

train	 n = 64
validate n = 28
test	 n = 23


In [6]:
def run_baseline(train, 
                 model_number, 
                 model_results,
                 target_col=None):
    '''
    Score a most-frequent-class baseline on the train sample and record it.

    The baseline predicts the most common value of the target column for
    every row, so its accuracy is simply the share of train rows that carry
    that value.

    Parameters
    ----------
    train : pandas.DataFrame
        Train sample; must contain the target column.
    model_number : int
        Accepted for symmetry with the other run_* functions. The value
        passed in is ignored: 0 is always returned.
    model_results : pandas.DataFrame
        Results accumulated so far (may be empty). Not modified in place.
    target_col : str, optional
        Name of the target column. When omitted, the notebook-level
        `target` variable is used.

    Returns
    -------
    (int, pandas.DataFrame)
        0 and a results frame with one 'baseline' row added at the end.

    Raises
    ------
    ValueError
        If the target column has no non-null values to take a mode from.
    '''
    # fall back to the notebook-level target when no column is named
    if target_col is None:
        target_col = target

    actual = train[target_col]

    # the baseline prediction is the most frequent class in the train sample
    modes = actual.mode()
    if modes.empty:
        raise ValueError('cannot compute a baseline: train has no target values')
    baseline_class = modes.iloc[0]

    # accuracy of a constant prediction == share of rows equal to that class
    accuracy = float((actual == baseline_class).mean())

    row = pd.DataFrame([{'model_number': 'baseline',
                         'model_type': 'baseline',
                         'sample_type': 'train',
                         'accuracy': accuracy}])

    # DataFrame.append no longer exists in pandas 2.x, so use pd.concat;
    # an empty accumulator is replaced outright because recent pandas
    # versions warn when empty frames take part in a concat
    if model_results.empty:
        model_results = row
    else:
        model_results = pd.concat([model_results, row], ignore_index=True)

    # numbered models start counting from 0 after the baseline
    model_number = 0

    return model_number, model_results

In [7]:
# start from an empty results table and record the baseline as its first row
model_number = 0
model_results = pd.DataFrame()

model_number, model_results = run_baseline(train=train,
                                           model_number=model_number,
                                           model_results=model_results)

In [8]:
# run_baseline always hands back 0, so numbered models start from here
model_number

0

In [9]:
# results table after the baseline run
model_results

Unnamed: 0,model_number,model_type,sample_type,accuracy
0,baseline,baseline,train,0.65625


In [10]:
def run_decision_tree(train, validate, target,
                      model_number, model_results):
    '''
    Fit decision trees over a range of max_depth values on two text
    representations of the `lemmatized` column and record their accuracy.

    For every max_depth from 3 to 10 (inclusive) a tree is fit first on
    TF-IDF features and then on bag-of-words counts (CV/BOW). Each fit gets
    its own model number and contributes two rows to the results: one for
    the train sample and one for the validate sample.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Samples containing a `lemmatized` text column and the target column.
    target : str
        Name of the target column.
    model_number : int
        Number of the last model recorded; incremented once per fit.
    model_results : pandas.DataFrame
        Results accumulated so far (may be empty). Not modified in place.

    Returns
    -------
    (int, pandas.DataFrame)
        The last model number used and the results frame with the new rows
        added at the end.
    '''
    # split into x and y
    x_train, y_train = train.lemmatized, train[target]
    x_validate, y_validate = validate.lemmatized, validate[target]

    # the vectorizers depend only on the train text, so fit and transform
    # once here instead of once per hyperparameter combination
    tfidf = TfidfVectorizer().fit(x_train)
    cv = CountVectorizer().fit(x_train)
    feature_sets = [
        ('TF-IDF', tfidf.transform(x_train), tfidf.transform(x_validate)),
        ('CV/BOW', cv.transform(x_train), cv.transform(x_validate)),
    ]

    # set hyperparameters
    min_max_depth = 3
    max_max_depth = 10
    model_type = 'decision_tree'

    # collect one dict per (model, sample) and build the frame once at the end
    records = []
    for max_depth in range(min_max_depth, max_max_depth + 1):
        for feature_type, x_train_vec, x_validate_vec in feature_sets:

            model_number += 1

            # seed the tree with the notebook-level random_state so that
            # tie-breaking between equally good splits is reproducible
            tree = DecisionTreeClassifier(max_depth=max_depth,
                                          random_state=random_state)
            tree.fit(x_train_vec, y_train)

            samples = (('train', x_train_vec, y_train),
                       ('validate', x_validate_vec, y_validate))
            for sample_type, x_vec, y_true in samples:
                records.append({'model_number': model_number,
                                'model_type': model_type,
                                'sample_type': sample_type,
                                'feature_type': feature_type,
                                'max_depth': max_depth,
                                'accuracy': tree.score(x_vec, y_true)})

    new_rows = pd.DataFrame(records)

    # DataFrame.append no longer exists in pandas 2.x, so use pd.concat;
    # an empty accumulator is replaced outright because recent pandas
    # versions warn when empty frames take part in a concat
    if model_results.empty:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows], ignore_index=True)

    return model_number, model_results

In [11]:
# add the decision tree results and advance the model counter
model_number, model_results = run_decision_tree(train=train,
                                                validate=validate,
                                                target=target,
                                                model_number=model_number,
                                                model_results=model_results)

In [12]:
# best accuracy recorded so far on the validate sample
model_results.loc[model_results['sample_type'] == 'validate', 'accuracy'].max()

0.7142857142857143

In [13]:
def run_random_forest(train, validate, target,
                      model_number, model_results):
    '''
    Fit random forests over a grid of max_depth and min_samples_leaf values
    on two text representations of the `lemmatized` column and record their
    accuracy.

    For every combination of max_depth in 3..6 and min_samples_leaf in 3..6
    (both inclusive) a forest is fit first on TF-IDF features and then on
    bag-of-words counts (CV/BOW). Each fit gets its own model number and
    contributes two rows to the results: one for the train sample and one
    for the validate sample.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Samples containing a `lemmatized` text column and the target column.
    target : str
        Name of the target column.
    model_number : int
        Number of the last model recorded; incremented once per fit.
    model_results : pandas.DataFrame
        Results accumulated so far (may be empty). Not modified in place.

    Returns
    -------
    (int, pandas.DataFrame)
        The last model number used and the results frame with the new rows
        added at the end.
    '''
    # split into x and y
    x_train, y_train = train.lemmatized, train[target]
    x_validate, y_validate = validate.lemmatized, validate[target]

    # the vectorizers depend only on the train text, so fit and transform
    # once here instead of once per hyperparameter combination
    tfidf = TfidfVectorizer().fit(x_train)
    cv = CountVectorizer().fit(x_train)
    feature_sets = [
        ('TF-IDF', tfidf.transform(x_train), tfidf.transform(x_validate)),
        ('CV/BOW', cv.transform(x_train), cv.transform(x_validate)),
    ]

    # set hyperparameters
    min_max_depth = 3
    max_max_depth = 6
    min_min_samples_leaf = 3
    max_min_samples_leaf = 6
    model_type = 'random_forest'

    # collect one dict per (model, sample) and build the frame once at the end
    records = []
    for max_depth in range(min_max_depth, max_max_depth + 1):
        for min_samples_leaf in range(min_min_samples_leaf,
                                      max_min_samples_leaf + 1):
            for feature_type, x_train_vec, x_validate_vec in feature_sets:

                model_number += 1

                # a forest bootstraps rows and subsamples features, so seed
                # it with the notebook-level random_state; otherwise the
                # recorded accuracies change on every run
                clf = RandomForestClassifier(min_samples_leaf=min_samples_leaf,
                                             max_depth=max_depth,
                                             random_state=random_state)
                clf.fit(x_train_vec, y_train)

                samples = (('train', x_train_vec, y_train),
                           ('validate', x_validate_vec, y_validate))
                for sample_type, x_vec, y_true in samples:
                    records.append({'model_number': model_number,
                                    'model_type': model_type,
                                    'sample_type': sample_type,
                                    'feature_type': feature_type,
                                    'max_depth': max_depth,
                                    'min_samples_leaf': min_samples_leaf,
                                    'accuracy': clf.score(x_vec, y_true)})

    new_rows = pd.DataFrame(records)

    # DataFrame.append no longer exists in pandas 2.x, so use pd.concat;
    # an empty accumulator is replaced outright because recent pandas
    # versions warn when empty frames take part in a concat
    if model_results.empty:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows], ignore_index=True)

    return model_number, model_results

In [14]:
# add the random forest results and advance the model counter
model_number, model_results = run_random_forest(train=train,
                                                validate=validate,
                                                target=target,
                                                model_number=model_number,
                                                model_results=model_results)

In [44]:
def run_naive_bayes(train, validate, target,
                    model_number, model_results):
    '''
    Fit MultinomialNB and ComplementNB over several alpha values on two
    text representations of the `lemmatized` column and record their
    accuracy.

    For every alpha in [.1, .5, 1, 1.5, 2] and for each of the two
    classifiers, a model is fit first on TF-IDF features and then on
    bag-of-words counts (CV/BOW). Each fit gets its own model number and
    contributes two rows to the results: one for the train sample and one
    for the validate sample.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Samples containing a `lemmatized` text column and the target column.
    target : str
        Name of the target column.
    model_number : int
        Number of the last model recorded; incremented once per fit.
    model_results : pandas.DataFrame
        Results accumulated so far (may be empty). Not modified in place.

    Returns
    -------
    (int, pandas.DataFrame)
        The last model number used and the results frame with the new rows
        added at the end.
    '''
    # split into x and y
    x_train, y_train = train.lemmatized, train[target]
    x_validate, y_validate = validate.lemmatized, validate[target]

    # the vectorizers depend only on the train text, so fit and transform
    # once here instead of once per hyperparameter combination
    tfidf = TfidfVectorizer().fit(x_train)
    cv = CountVectorizer().fit(x_train)
    feature_sets = [
        ('TF-IDF', tfidf.transform(x_train), tfidf.transform(x_validate)),
        ('CV/BOW', cv.transform(x_train), cv.transform(x_validate)),
    ]

    # classifiers to try, paired with the label stored in model_type
    classifiers = ((MultinomialNB, 'MultinomialNB'),
                   (ComplementNB, 'ComplementNB'))

    # collect one dict per (model, sample) and build the frame once at the end
    records = []

    # set hyperparameters
    for alpha in [.1, .5, 1, 1.5, 2]:
        for nb_class, model_type in classifiers:
            for feature_type, x_train_vec, x_validate_vec in feature_sets:

                model_number += 1

                # fit a fresh classifier for this feature representation
                clf = nb_class(alpha=alpha)
                clf.fit(x_train_vec, y_train)

                samples = (('train', x_train_vec, y_train),
                           ('validate', x_validate_vec, y_validate))
                for sample_type, x_vec, y_true in samples:
                    records.append({'model_number': model_number,
                                    'model_type': model_type,
                                    'sample_type': sample_type,
                                    'feature_type': feature_type,
                                    'alpha': alpha,
                                    'accuracy': clf.score(x_vec, y_true)})

    new_rows = pd.DataFrame(records)

    # DataFrame.append no longer exists in pandas 2.x, so use pd.concat;
    # an empty accumulator is replaced outright because recent pandas
    # versions warn when empty frames take part in a concat
    if model_results.empty:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows], ignore_index=True)

    return model_number, model_results

In [45]:
# add the naive Bayes results and advance the model counter
model_number, model_results = run_naive_bayes(train=train,
                                              validate=validate,
                                              target=target,
                                              model_number=model_number,
                                              model_results=model_results)

In [18]:
def display_model_results(model_results):
    '''
    Pivot the tidy model_results dataframe into a wide comparison table.

    model_results holds one row per (model, sample) pair with, among
    others, these columns:
    - model_number
    - model_type
    - sample_type
    - accuracy (the accuracy score for the given model and sample type)

    Parameters
    ----------
    model_results : pandas.DataFrame
        Tidy results frame containing at least the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A pivot table with one row per sample_type, one column per
        (model_number, model_type) pair, and the accuracy as values.
    '''
    # each (model, sample) pair is expected to occur once, so 'first' just
    # passes the single score through; unlike an identity lambda it is a
    # real aggregation and still works if a pair is ever recorded twice
    return model_results.pivot_table(columns=['model_number', 'model_type'],
                                     index='sample_type',
                                     values='accuracy',
                                     aggfunc='first')


# Backward-compatible alias for the original name of this function.
# NOTE: this name hides IPython's own display() helper in the notebook
# namespace, so prefer display_model_results in new code.
display = display_model_results

In [43]:
# transpose so that each model is a row and each sample type a column
display_model_results(model_results).transpose()

Unnamed: 0_level_0,sample_type,train,validate
model_number,model_type,Unnamed: 2_level_1,Unnamed: 3_level_1
1,decision_tree,0.8125,0.357143
2,decision_tree,0.765625,0.607143
3,decision_tree,0.890625,0.464286
4,decision_tree,0.875,0.428571
5,decision_tree,0.9375,0.571429
6,decision_tree,0.90625,0.714286
7,decision_tree,0.96875,0.535714
8,decision_tree,0.9375,0.642857
9,decision_tree,0.984375,0.535714
10,decision_tree,0.984375,0.535714


In [46]:
# show a short preview rather than rendering the entire frame
df.head(10)

Unnamed: 0,repo,language,original,clean,stemmed,lemmatized,language_reduced
0,google/googletest,C++,# GoogleTest\n\n### Announcements\n\n#### Live...,googletest announcements live head googletest ...,googletest announc live head googletest follow...,googletest announcement live head googletest f...,Other
1,projectdiscovery/nuclei-templates,Python,"\n\n<h1 align=""center"">\nNuclei Templates\n</h...",h1 aligncenter nuclei templates h1 h4 aligncen...,h1 aligncent nuclei templat h1 h4 aligncenterc...,h1 aligncenter nucleus template h1 h4 aligncen...,Python
2,digitalocean/nginxconfig.io,JavaScript,[![GitHub stars](https://img.shields.io/github...,github stars https imgshieldsio github stars d...,github star http imgshieldsio github star digi...,github star http imgshieldsio github star digi...,JavaScript
3,flutter/flutter,Dart,# [![Flutter logo][]][flutter.dev]\n\n[![Build...,flutter logo flutterdev build status cirrus bu...,flutter logo flutterdev build statu cirru buil...,flutter logo flutterdev build status cirrus bu...,Other
4,PaddlePaddle/PaddleOCR,Python,"English | [简体中文](README_ch.md)\n\n<p align=""ce...",english readmechmd p aligncenter img src doc p...,english readmechmd p aligncent img src doc pad...,english readmechmd p aligncenter img src doc p...,Python
5,supabase/supabase,TypeScript,"<p align=""center"">\n<img width=""300"" src=""http...",p aligncenter img width300 srchttps rawgithubu...,p aligncent img width300 srchttp rawgithubuser...,p aligncenter img width300 srchttps rawgithubu...,Other
6,felipefialho/frontend-challenges,,"<p align=""center""><img src=""https://user-image...",p aligncenterimg srchttps userimagesgithubuser...,p aligncenterimg srchttp userimagesgithubuserc...,p aligncenterimg srchttps userimagesgithubuser...,Other
7,flutter/samples,Dart,# Flutter samples\n\n[![Build Status](https://...,flutter samples build status https githubcom f...,flutter sampl build statu http githubcom flutt...,flutter sample build status http githubcom flu...,Other
8,florinpop17/app-ideas,,## :ledger: App Ideas Collection\n\n![App Idea...,ledger app ideas collection app ideas image ap...,ledger app idea collect app idea imag appideas...,ledger app idea collection app idea image appi...,Other
9,charmbracelet/bubbletea,Go,"Bubble Tea\n==========\n\n<p>\n <img src=""h...",bubble tea p img srchttps stuffcharmsh bubblet...,bubbl tea p img srchttp stuffcharmsh bubbletea...,bubble tea p img srchttps stuffcharmsh bubblet...,Other
