In [106]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import re

import acquire
import prepare

import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# seed value meant for reproducible runs
# NOTE(review): nothing in the visible cells reads this name yet - confirm it
# is actually passed to the estimators / split
random_state = 42

# show up to 100 rows when a dataframe is displayed
pd.set_option('display.max_rows', 100)

In [91]:
# read the scraped repo data from disk and run it through the project's
# prep step in a single expression, so `df` is only ever bound to the
# prepared frame
df = prepare.prep_repos(pd.read_json('data.json'))

In [92]:
# preview the first rows of the prepared frame
df.head()

Unnamed: 0,repo,language,original,clean,stemmed,lemmatized,language_reduced
0,google/googletest,C++,# GoogleTest\n\n### Announcements\n\n#### Live...,googletest announcements live head googletest ...,googletest announc live head googletest follow...,googletest announcement live head googletest f...,Other
1,projectdiscovery/nuclei-templates,Python,"\n\n<h1 align=""center"">\nNuclei Templates\n</h...",h1 aligncenter nuclei templates h1 h4 aligncen...,h1 aligncent nuclei templat h1 h4 aligncenterc...,h1 aligncenter nucleus template h1 h4 aligncen...,Python
2,digitalocean/nginxconfig.io,JavaScript,[![GitHub stars](https://img.shields.io/github...,github stars https imgshieldsio github stars d...,github star http imgshieldsio github star digi...,github star http imgshieldsio github star digi...,JavaScript
3,flutter/flutter,Dart,# [![Flutter logo][]][flutter.dev]\n\n[![Build...,flutter logo flutterdev build status cirrus bu...,flutter logo flutterdev build statu cirru buil...,flutter logo flutterdev build status cirrus bu...,Other
4,PaddlePaddle/PaddleOCR,Python,"English | [简体中文](README_ch.md)\n\n<p align=""ce...",english readmechmd p aligncenter img src doc p...,english readmechmd p aligncent img src doc pad...,english readmechmd p aligncenter img src doc p...,Python


In [93]:
# name of the column the models predict
target = 'language_reduced'

In [94]:
# split the prepared frame into train / validate / test samples
# NOTE(review): presumably stratified on `target` - confirm in prepare.split_data
train, validate, test = prepare.split_data(df, target)

train	 n = 60
validate n = 27
test	 n = 22


In [95]:
def run_baseline(train, 
                 model_number, 
                 model_results,
                 target=None):
    """Record the accuracy of a most-frequent-class baseline on the train sample.

    The baseline predicts the most common value of the target column for
    every row, so its accuracy is the share of rows holding that value.

    Parameters
    ----------
    train : pandas.DataFrame
        Train sample; must contain the target column.
    model_number : any
        Accepted for signature compatibility only; the value is ignored and
        0 is always returned so that later models start counting from 1.
    model_results : pandas.DataFrame
        Results collected so far (may be an empty ``pd.DataFrame()``).
    target : str, optional
        Name of the target column. When omitted, the notebook-level
        ``target`` variable is used, which is what this function read
        before the parameter existed.

    Returns
    -------
    (int, pandas.DataFrame)
        ``0`` and a new results frame with one extra ``'baseline'`` row.

    Raises
    ------
    ValueError
        If ``train`` has no rows.
    """
    if target is None:
        # backward compatibility: fall back to the notebook-level variable
        target = globals()['target']

    if len(train) == 0:
        raise ValueError('cannot compute a baseline on an empty train sample')

    # establish baseline predictions for train sample: always the modal class
    actual = train[target]
    most_common = actual.mode()[0]

    # accuracy of a constant prediction is the fraction of rows equal to it
    accuracy = float((actual == most_common).mean())

    # one record for the train sample
    new_rows = pd.DataFrame([{'model_number': 'baseline',
                              'model_type': 'baseline',
                              'sample_type': 'train',
                              'accuracy': accuracy}])

    # DataFrame.append was removed in pandas 2.0, so use concat; skip the
    # concat entirely when there is nothing to concatenate onto
    if model_results is None or model_results.shape[1] == 0:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows],
                                  ignore_index=True, sort=False)

    # reset the model_number from 'baseline' to 0
    model_number = 0

    return model_number, model_results

In [96]:
# start from an empty results table and a zeroed model counter
model_results = pd.DataFrame()
model_number = 0

# add the baseline row; run_baseline hands back the counter and the
# updated results table
model_number, model_results = run_baseline(train, model_number, model_results)

In [97]:
# counter value returned by run_baseline
model_number

0

In [98]:
# results table after the baseline run
model_results

Unnamed: 0,model_number,model_type,sample_type,accuracy
0,baseline,baseline,train,0.666667


In [99]:
def run_decision_tree(train, validate, target,
                      model_number, model_results,
                      min_max_depth=3, max_max_depth=10,
                      random_state=42):
    """Fit decision trees over a range of depths and record their accuracy.

    For every ``max_depth`` in ``min_max_depth..max_max_depth`` (inclusive)
    two models are fit on the ``lemmatized`` text column of ``train``: one on
    TF-IDF features and one on count-vectorizer (bag-of-words) features, in
    that order. Each model gets the next ``model_number`` and contributes two
    rows to the results: its accuracy on train and on validate.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Samples containing a ``lemmatized`` column and the ``target`` column.
    target : str
        Name of the column to predict.
    model_number : int
        Number of the last model recorded so far.
    model_results : pandas.DataFrame
        Results collected so far; it is not modified in place.
    min_max_depth, max_max_depth : int, optional
        Inclusive bounds of the ``max_depth`` values to try.
    random_state : int, optional
        Seed passed to every ``DecisionTreeClassifier`` so repeated runs
        give the same trees.

    Returns
    -------
    (int, pandas.DataFrame)
        The number of the last model fit and a new results frame with the
        added rows.
    """
    model_type = 'decision_tree'

    # split into x and y
    x_train = train.lemmatized
    y_train = train[target]

    x_validate = validate.lemmatized
    y_validate = validate[target]

    # The vectorizers depend only on the train text, never on max_depth, so
    # fit and transform once here instead of on every loop iteration.
    feature_sets = []
    for feature_type, vectorizer in (('TF-IDF', TfidfVectorizer()),
                                     ('CV/BOW', CountVectorizer())):
        vectorizer.fit(x_train)
        feature_sets.append((feature_type,
                             vectorizer.transform(x_train),
                             vectorizer.transform(x_validate)))

    # collect plain dicts and build the dataframe once at the end
    records = []

    for max_depth in range(min_max_depth, max_max_depth + 1):
        for feature_type, x_train_vec, x_validate_vec in feature_sets:

            model_number += 1

            # fresh, seeded estimator per model so nothing leaks between fits
            clf = DecisionTreeClassifier(max_depth=max_depth,
                                         random_state=random_state)
            clf.fit(x_train_vec, y_train)

            # one record per sample: features/hyperparameters plus accuracy
            for sample_type, x_vec, y_true in (
                    ('train', x_train_vec, y_train),
                    ('validate', x_validate_vec, y_validate)):
                records.append({'model_number': model_number,
                                'model_type': model_type,
                                'sample_type': sample_type,
                                'feature_type': feature_type,
                                'max_depth': max_depth,
                                'accuracy': clf.score(x_vec, y_true)})

    # an empty depth range fits nothing; hand the inputs back untouched
    if not records:
        return model_number, model_results

    new_rows = pd.DataFrame(records)

    # DataFrame.append was removed in pandas 2.0, so use a single concat
    if model_results is None or model_results.shape[1] == 0:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows],
                                  ignore_index=True, sort=False)

    return model_number, model_results

In [100]:
# fit the decision trees and add their rows to the results table
# (re-running this cell adds the same models again with new numbers)
model_number, model_results = run_decision_tree(
    train, validate, target, model_number, model_results
)

In [101]:
# best accuracy reached on the validate sample so far
is_validate = model_results['sample_type'] == 'validate'
model_results.loc[is_validate, 'accuracy'].max()

0.7037037037037037

In [102]:
def run_random_forest(train, validate, target,
                      model_number, model_results,
                      min_max_depth=3, max_max_depth=6,
                      min_min_samples_leaf=3, max_min_samples_leaf=6,
                      random_state=42):
    """Fit random forests over a hyperparameter grid and record their accuracy.

    Every combination of ``max_depth`` and ``min_samples_leaf`` within the
    given inclusive bounds is fit on TF-IDF features built from the
    ``lemmatized`` text column of ``train``. Each model gets the next
    ``model_number`` and contributes two rows to the results: its accuracy
    on train and on validate.

    Parameters
    ----------
    train, validate : pandas.DataFrame
        Samples containing a ``lemmatized`` column and the ``target`` column.
    target : str
        Name of the column to predict.
    model_number : int
        Number of the last model recorded so far.
    model_results : pandas.DataFrame
        Results collected so far; it is not modified in place.
    min_max_depth, max_max_depth : int, optional
        Inclusive bounds of the ``max_depth`` values to try.
    min_min_samples_leaf, max_min_samples_leaf : int, optional
        Inclusive bounds of the ``min_samples_leaf`` values to try.
    random_state : int, optional
        Seed passed to every ``RandomForestClassifier``; without it the
        forests (and therefore the recorded accuracies) change between runs.

    Returns
    -------
    (int, pandas.DataFrame)
        The number of the last model fit and a new results frame with the
        added rows.
    """
    model_type = 'random_forest'
    feature_type = 'TF-IDF'

    # split into x and y
    x_train = train.lemmatized
    y_train = train[target]

    x_validate = validate.lemmatized
    y_validate = validate[target]

    # The TF-IDF features do not depend on the hyperparameters, so fit and
    # transform once here instead of inside the nested loops.
    tfidf = TfidfVectorizer().fit(x_train)
    samples = (('train', tfidf.transform(x_train), y_train),
               ('validate', tfidf.transform(x_validate), y_validate))
    x_train_vec = samples[0][1]

    # collect plain dicts and build the dataframe once at the end
    records = []

    for max_depth in range(min_max_depth, max_max_depth + 1):
        for min_samples_leaf in range(min_min_samples_leaf,
                                      max_min_samples_leaf + 1):

            model_number += 1

            clf = RandomForestClassifier(min_samples_leaf=min_samples_leaf,
                                         max_depth=max_depth,
                                         random_state=random_state)
            clf.fit(x_train_vec, y_train)

            # one record per sample: features/hyperparameters plus accuracy
            for sample_type, x_vec, y_true in samples:
                records.append({'model_number': model_number,
                                'model_type': model_type,
                                'sample_type': sample_type,
                                'feature_type': feature_type,
                                'max_depth': max_depth,
                                'min_samples_leaf': min_samples_leaf,
                                'accuracy': clf.score(x_vec, y_true)})

    # an empty grid fits nothing; hand the inputs back untouched
    if not records:
        return model_number, model_results

    new_rows = pd.DataFrame(records)

    # DataFrame.append was removed in pandas 2.0, so use a single concat
    if model_results is None or model_results.shape[1] == 0:
        model_results = new_rows
    else:
        model_results = pd.concat([model_results, new_rows],
                                  ignore_index=True, sort=False)

    return model_number, model_results

In [103]:
# fit the random forests and add their rows to the results table
# (re-running this cell adds the same models again with new numbers)
model_number, model_results = run_random_forest(
    train, validate, target, model_number, model_results
)

In [107]:
# full results table: baseline, decision tree and random forest rows
model_results

Unnamed: 0,model_number,model_type,sample_type,accuracy,feature_type,max_depth,min_samples_leaf
0,baseline,baseline,train,0.666667,,,
1,1,decision_tree,train,0.833333,TF-IDF,3.0,
2,1,decision_tree,validate,0.703704,TF-IDF,3.0,
3,2,decision_tree,train,0.783333,CV/BOW,3.0,
4,2,decision_tree,validate,0.62963,CV/BOW,3.0,
5,3,decision_tree,train,0.883333,TF-IDF,4.0,
6,3,decision_tree,validate,0.703704,TF-IDF,4.0,
7,4,decision_tree,train,0.8,CV/BOW,4.0,
8,4,decision_tree,validate,0.481481,CV/BOW,4.0,
9,5,decision_tree,train,0.883333,TF-IDF,5.0,
