In [None]:
# default_exp metrics_python

In [None]:
# export

import pandas as pd
from scipy.spatial import distance
from sklearn import preprocessing
import lizard
import radon
import pprint
from radon.raw import analyze
from radon.complexity import *
from radon.metrics import *

## metrics_python

> This module provides a tool for computing metrics (from static analysis) for Python source code using the <a href="https://pypi.org/project/radon/">radon package</a>.

Using the <a href="https://pypi.org/project/radon/">radon package</a>

In [None]:
# export

def compute_metrics(df_series):
    '''
    Computes static-analysis metrics for every source code record of a series

    Parameters:

    # df_series: Pandas series (dataframe column) containing source code records;
      the index label of each record is used to identify it in the results

    Returns:

    Tuple comprising:

    - Dataframe with computed metrics (one row per successfully analyzed record,
      the record's index label is stored in the 'sample' column)
    - List of records' indices for which metrics could not be computed

    '''

    # Fixed column order, so the result has the same layout even when no record succeeds
    columns = [
        'sample',
        'loc',
        'lloc',
        'sloc',
        'comments',
        'multi',
        'blank',
        'single_comments',
        'h1',
        'h2',
        'N1',
        'N2',
        'vocabulary',
        'length',
        'calculated_length',
        'volume',
        'difficulty',
        'effort',
        'time',
        'bugs',
        'complexity',
        'maint_idx',
        'maint_idx_rank',
    ]

    rows = []
    problem_records_indices = []

    # Series.items() is used because Series.iteritems() was removed in pandas 2.0
    for idx, code in df_series.items():
        try:
            # Computes available metrics if possible
            raw = analyze(code)
            halstead = h_visit(code).total
            blocks = cc_visit(code)
            maint_idx = mi_visit(code, False)  # False indicates not to consider multi-line strings as comments

            row = {
                'sample': idx,
                'loc': raw.loc,
                'lloc': raw.lloc,
                'sloc': raw.sloc,
                'comments': raw.comments,
                'multi': raw.multi,
                'blank': raw.blank,
                'single_comments': raw.single_comments,
                'h1': halstead.h1,
                'h2': halstead.h2,
                'N1': halstead.N1,
                'N2': halstead.N2,
                'vocabulary': halstead.vocabulary,
                'length': halstead.length,
                'calculated_length': halstead.calculated_length,
                'volume': halstead.volume,
                'difficulty': halstead.difficulty,
                'effort': halstead.effort,
                'time': halstead.time,
                'bugs': halstead.bugs,
                # Complexity of the record = sum of the complexity of every block found in it
                'complexity': sum(block.complexity for block in blocks),
                'maint_idx': maint_idx,
                'maint_idx_rank': mi_rank(maint_idx),
            }
        except Exception:
            # Best effort: a record that cannot be analyzed is reported by index instead of
            # aborting the whole run. Only Exception is caught so that KeyboardInterrupt
            # and SystemExit still stop the loop.
            problem_records_indices.append(idx)
        else:
            rows.append(row)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row copies the whole frame on every iteration
    metrics_df = pd.DataFrame(rows, columns=columns)

    if problem_records_indices:
        print(f'There was a problem computing metrics for {len(problem_records_indices)} records.')

    return metrics_df, problem_records_indices

In [None]:
# export

class PythonAnalyzer():
    """
    Object-oriented entry point for extracting metrics from a dataset of
    python source code records.
    The actual computation is delegated to compute_metrics, which relies on
    an open source library.

    """

    def compute_metrics_for_df_series(self, df_series):
        """
        Runs the static analysis over a collection of source code records

        Params:
        # df_series: Pandas DF column containing source code records

        Returns:
        Tuple comprising

        - Pandas DataFrame with computed metrics
        - List of records' indices for which metrics could not be computed

        """

        outcome = compute_metrics(df_series)
        return outcome

    def compute_and_save_metrics_for_df(self, df_series, destination_path):
        """
        Runs the static analysis over a pandas df column (series) and,
        additionally, writes the resulting metrics to a csv file at the given path

        Params:
        # df_series: Pandas DF column containing source code records
        # destination_path: string indicating full path (including filename) for the exported file

        Returns:
        Tuple comprising

        - Pandas DataFrame with computed metrics
        - List of records' indices for which metrics could not be computed
        """

        computed, failed = compute_metrics(df_series)

        # Persist the metrics before handing both results back to the caller
        computed.to_csv(destination_path)

        return (computed, failed)

Example

In [None]:
# Load the dataset of code records from a CSV file.
# NOTE(review): absolute path — presumably specific to the original environment; adjust before running elsewhere
python_df = pd.read_csv('/tf/data/clean_python.csv')

In [None]:
# Explore DataFrame

python_df.head()

Unnamed: 0.1,Unnamed: 0,code,code_len,code_tokens,cyclomatic_complexity,data_type,method_name,nloc,parameter_count,partition,token_count
0,0,"def get_vid_from_url(url):\n """"""Extract...",53.0,"['def', 'get_vid_from_url', '(', 'url', ')', '...",6.0,src,get_vid_from_url,7.0,1.0,test,62.0
1,1,"def sina_xml_to_url_list(xml_data):\n """"""st...",52.0,"['def', 'sina_xml_to_url_list', '(', 'xml_data...",2.0,src,sina_xml_to_url_list,7.0,1.0,test,52.0
2,2,"def makeMimi(upid):\n """"""From http://cdn37....",30.0,"['def', 'makeMimi', '(', 'upid', ')', ':', 'st...",1.0,src,makeMimi,4.0,1.0,test,30.0
3,3,"def fc2video_download(url, output_dir = '.', m...",66.0,"['def', 'fc2video_download', '(', 'url', ',', ...",3.0,src,fc2video_download,6.0,5.0,test,62.0
4,4,"def dailymotion_download(url, output_dir='.', ...",150.0,"['def', 'dailymotion_download', '(', 'url', ',...",6.0,src,dailymotion_download,17.0,5.0,test,153.0


In [None]:
# Draw 100 random records; the fixed seed makes the sample reproducible across runs
samples = python_df.sample(100, random_state=42)

In [None]:
# Compute metrics for the sampled 'code' column; records whose metrics could not be
# computed are reported by index label
metrics_df, error_indices = compute_metrics(samples['code'])
print(f'Error indices: {error_indices}')
# Look the failing records up in the sample (all columns) for inspection
error_records = samples.loc[error_indices, :]
print(f'Error records:\n{error_records}') 

There was a problem computing metrics for 5 records.
Error indices: [67892, 173843, 364457, 364612, 392728]
Error records:
        Unnamed: 0                                               code  \
67892        15716  def updateRouterStatus(self):\n        """forc...   
173843        1667  def connectionLost(self, reason):\n        """...   
364457       20103  def _validate_filters_ndb(cls, filters, model_...   
364612       20258  def _split_ns_by_scatter(cls,\n               ...   
392728       18374  def orientnii(imfile):\n    '''Get the orienta...   

        code_len                                        code_tokens  \
67892       60.0  ['def', 'updateRouterStatus', '(', 'self', ')'...   
173843      76.0  ['def', 'connectionLost', '(', 'self', ',', 'r...   
364457      80.0  ['def', '_validate_filters_ndb', '(', 'cls', '...   
364612     392.0  ['def', '_split_ns_by_scatter', '(', 'cls', ',...   
392728     108.0  ['def', 'orientnii', '(', 'imfile', ')', ':', ...   

        cyc

In [None]:
# Instantiate the analyzer and print its built-in help (class and method docstrings)
py_analyzer = PythonAnalyzer()
help(py_analyzer)

Help on PythonAnalyzer in module __main__ object:

class PythonAnalyzer(builtins.object)
 |  Class aimed to obtain metrics from a dataset of python source code records.
 |  Metrics computation is performed via an open source library.
 |  
 |  Methods defined here:
 |  
 |  compute_and_save_metrics_for_df(self, df_series, destination_path)
 |      Computes metrics (static analysis) for a pandas df column (series).
 |      Additionaly, exports metrics results to a csv file located at the specified path
 |      
 |      Params:
 |      # df_series: Pandas DF column containing source code records
 |      # destination_path: string indicating full path (including filename) for the exported file
 |      
 |      Returns:
 |      Tuple comprising
 |      
 |      - Pandas DataFrame with computed metrics
 |      - List of records' indices for which metrics could not be computed
 |  
 |  compute_metrics_for_df_series(self, df_series)
 |      Computes metrics (static analysis) for a collection o

In [None]:
# Same computation as before, this time through the PythonAnalyzer wrapper
metrics_df, error_indices = py_analyzer.compute_metrics_for_df_series(samples['code'])

# Keep only the source code of the records whose metrics could not be computed
error_records = samples.loc[error_indices, ['code']]

There was a problem computing metrics for 5 records.


In [None]:
error_records

Unnamed: 0,code
67892,"def updateRouterStatus(self):\n """"""forc..."
173843,"def connectionLost(self, reason):\n """"""..."
364457,"def _validate_filters_ndb(cls, filters, model_..."
364612,"def _split_ns_by_scatter(cls,\n ..."
392728,def orientnii(imfile):\n '''Get the orienta...


## Description of available metrics

In [None]:
metrics_df.columns

Index(['sample', 'loc', 'lloc', 'sloc', 'comments', 'multi', 'blank',
       'single_comments', 'h1', 'h2', 'N1', 'N2', 'vocabulary', 'length',
       'calculated_length', 'volume', 'difficulty', 'effort', 'time', 'bugs',
       'complexity', 'maint_idx', 'maint_idx_rank'],
      dtype='object')

Further information can be found at the <a href="https://radon.readthedocs.io/en/latest/"> documentation page </a>

Radon package groups the provided metrics into the following categories:

<ul>
    <li>Raw metrics</li> 
    <li>Cyclomatic Complexity (i.e. McCabe’s Complexity)</li>
    <li>Halstead metrics</li>
    <li>Maintainability Index</li>
</ul>    

#### Raw metrics

<ul>
    <li>
        loc: Total number of lines of code
    </li>
    <li>
        lloc: The number of logical lines of code. (Each logical line contains exactly one statement)
    </li>
    <li>
        sloc: Number of source lines of code. (Not necessarily corresponding to lloc)
    </li>
    <li>
        comments: Number of comment lines. (Single comment lines #, multi-line strings are not counted as comments but as strings)
    </li>
    <li>
        multi: Number of lines corresponding to multi-line strings
    </li>
    <li>
        blank: Number of blank lines
    </li>
</ul>

#### Cyclomatic complexity

<ul>
    <li>complexity: number of decisions a block of code contains plus 1. The value reported for a record is the sum over all the blocks found in it</li>
</ul>

#### Halstead metrics

More detailed info. available at: <a href="https://radon.readthedocs.io/en/latest/intro.html#halstead-metrics">Radon documentation - Halstead metrics</a>
<ul>
    <li>h1: Number of distinct operators</li>
    <li>h2: Number of distinct operands</li>
    <li>N1: Total number of operators</li>
    <li>N2: Total number of operands</li>
    <li>vocabulary</li>
    <li>length</li>
    <li>calculated_length</li>
    <li>volume</li>
    <li>difficulty </li>
    <li>effort </li>
    <li>time</li>
    <li>bugs</li>
    
</ul>    

#### Maintainability index

Detailed formulation of the calculation can be found at: <a href="https://radon.readthedocs.io/en/latest/intro.html#maintainability-index">Radon documentation about maintainability index</a>

<ul>
    <li>maint_idx: Maintainability index score</li>
    <li>maint_idx_rank: Ranking given the maintainability index score </li>
</ul>

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 0.0_mgmnt.prep.i.ipynb.
Converted 0.1_mgmnt.prep.conv.ipynb.
Converted 0.3_mgmnt.prep.bpe.ipynb.
Converted 0.6_mgmnt.prep.nltk.ipynb.
Converted 0.7_metrics_module_python.ipynb.
Converted 1.0_exp.i.ipynb.
Converted 1.1_exp.info-[inspect].ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.csnc.ipynb.
Converted 1.2_exp.gen.code.ipynb.
Converted 1.3_exp.csnc_python.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_mining.unsupervised.traceability.eda.ipynb.
Converted 3.2_mining.unsupervised.eda.traceability.d2v.ipynb.
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have