In [None]:
!pip install radon

Collecting radon
  Downloading radon-6.0.1-py2.py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mando<0.8,>=0.6 (from radon)
  Downloading mando-0.7.1-py2.py3-none-any.whl (28 kB)
Collecting colorama>=0.4.1 (from radon)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: mando, colorama, radon
Successfully installed colorama-0.4.6 mando-0.7.1 radon-6.0.1
Downloading...
From: https://drive.google.com/uc?id=1bs3NLKmO88X6rW54CJNX43Uyk0LIZqgU
To: /content/full_dataset_final.csv
100% 57.5M/57.5M [00:00<00:00, 132MB/s]


In [None]:
from radon.raw import analyze
from radon.complexity import cc_visit
from radon.metrics import h_visit
from radon.metrics import mi_parameters
from radon.metrics import mi_compute
from html.parser import HTMLParser
import re
import math
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the answers table. Later cells read its "AnswerId" and "Answer"
# columns (the latter is fed to an HTML parser).
ds = pd.read_csv("dataset.csv")

In [None]:
class SnippetsGetter(HTMLParser):
    """Collect the text found inside ``<pre>`` + ``<code>`` markup.

    Usage: call :meth:`feed` with an HTML string, read the captured text
    fragments with :meth:`get_status`, and call :meth:`refresh_status`
    before re-using the same instance for another document.
    """

    def __init__(self):
        super().__init__()
        self.s = []             # text fragments captured so far
        self.pre_flag = False   # True while a <pre> tag is open
        self.code_flag = False  # True while a <code> tag is open

    def refresh_status(self):
        """Forget everything about the previously fed document."""
        # HTMLParser.feed() keeps input it could not finish parsing (for
        # example an unterminated comment or tag) and prepends it to the
        # next feed() call. reset() discards that buffer so a malformed
        # document cannot affect the parsing of the next one.
        self.reset()
        self.s = []
        self.pre_flag = False
        self.code_flag = False

    def get_status(self):
        """Return the list of text fragments captured since the last refresh."""
        return self.s

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<pre>`` and ``<code>`` elements."""
        if tag == 'pre':
            self.pre_flag = True
        if tag == 'code':
            self.code_flag = True

    def handle_endtag(self, tag):
        """Track exit from ``<pre>`` and ``<code>`` elements."""
        if tag == 'pre':
            self.pre_flag = False
        if tag == 'code':
            self.code_flag = False

    def handle_data(self, data):
        """Keep text only while both a ``<pre>`` and a ``<code>`` are open."""
        if self.pre_flag and self.code_flag:
            self.s.append(data)

In [None]:
def get_code_statistics(code_snippet: str):
    """Compute size, Halstead and maintainability metrics for one snippet.

    The snippet is handed to radon's ``analyze``, ``cc_visit``, ``h_visit``
    and ``mi_parameters``; any exception raised by those calls propagates to
    the caller.

    Returns:
        dict mapping metric name to value: raw line counts, the number of
        blocks reported by ``cc_visit``, the Halstead counts and the values
        derived from them, the cyclomatic complexity and a maintainability
        index divided by 171.
    """

    def at_least(value, lower):
        # Clamp from below so the logarithms and the division further down
        # never receive a value smaller than ``lower``.
        return np.clip(value, lower, np.inf)

    raw = analyze(code_snippet)._asdict()
    n_blocks = len(cc_visit(code_snippet))
    halstead = h_visit(code_snippet).total._asdict()

    h1 = halstead["h1"]
    h2 = halstead["h2"]
    n2_total = halstead["N2"]
    vocabulary = halstead["vocabulary"]  # h1 + h2
    length = halstead["length"]          # N1 + N2

    calculated_length = h1 * np.log2(at_least(h1, 1)) + h2 * np.log2(at_least(h2, 1))
    volume = length * np.log2(at_least(vocabulary, 1))   # N * log2(h)
    difficulty = h1 * n2_total / (2 * at_least(h2, 1))   # h1 / 2 * N2 / h2
    effort = difficulty * volume

    stats = {
        "CodeLines": raw["loc"],
        "LogicalCodeLines": raw["lloc"],
        "SourceCodeLines": raw["sloc"],
        "Multilines": raw["multi"],
        "NumberOfFunctions": n_blocks,
        "DistinctOperators": h1,
        "DistinctOperands": h2,
        "TotalOperators": halstead["N1"],
        "TotalOperands": n2_total,
        "Vocabulary": vocabulary,
        "LogicalLength": length,
        "CalculatedLength": calculated_length,
        "Volume": volume,
        "Difficulty": difficulty,
        "Effort": effort,
    }

    # NOTE(review): radon documents this tuple as (Halstead volume,
    # cyclomatic complexity, logical lines, percentage of comment lines) --
    # confirm against the installed radon version.
    mi_params = mi_parameters(code_snippet)
    mi_volume = mi_params[0]
    mi_complexity = mi_params[1]
    mi_lines = mi_params[2]
    mi_comments = mi_params[3]

    stats["CyclomaticComplexity"] = mi_complexity

    # Un-normalised index first, then scaled by the constant term 171.
    # The result is not clamped to any range.
    unscaled_index = (
        171
        - 5.2 * np.log(at_least(mi_volume, 1))
        - 0.23 * mi_complexity
        - 16.2 * np.log(at_least(mi_lines, 1))
        + 50 * np.sin(np.sqrt(2.46 * at_least(mi_comments, 0)))
    )
    stats["MaintainabilityIndex"] = unscaled_index / 171

    return stats

In [None]:
# Metrics that are summed over all snippets of one answer, listed in the
# order in which they appear in the resulting record.
_SUMMED_FIELDS = (
    'LogicalCodeLines', 'SourceCodeLines', 'Multilines', 'NumberOfFunctions',
    'CyclomaticComplexity', 'DistinctOperators', 'DistinctOperands',
    'TotalOperators', 'TotalOperands', 'Vocabulary', 'LogicalLength',
    'CalculatedLength', 'Volume', 'Difficulty', 'Effort',
)

# Python 2 style "print <expr>": the keyword as a whole word, followed by
# whitespace and then anything but an opening parenthesis.
_PY2_PRINT_RE = re.compile(r"\bprint[ \t]+(?![ \t(])(.*?)[ \t\r]*$")

# Interactive-session prompt (">>> " or "... ") at the very start of a line;
# a bare ">>>" / "..." with nothing after it also counts.
_REPL_PROMPT_RE = re.compile(r"^(?:>>>|\.\.\.)(?: |$)")


def _py2_prints_to_calls(snippet):
    """Rewrite ``print <expr>`` statements as ``print(<expr>)`` calls, line by line."""
    # A callable replacement is used on purpose: a replacement *string* is
    # scanned for backslash escapes by re.sub, which breaks on printed text
    # that itself contains backslashes.
    return "\n".join(
        _PY2_PRINT_RE.sub(lambda m: "print(" + m.group(1) + ")", line, count=1)
        for line in snippet.split("\n")
    )


def _strip_repl_prompts(snippet):
    """Keep only the lines that start with a REPL prompt, with the prompt removed."""
    kept = []
    for line in snippet.splitlines():
        prompt = _REPL_PROMPT_RE.match(line)
        if prompt:
            kept.append(line[prompt.end():])
    return "\n".join(kept)


def _analyse_snippet(snippet):
    """Return the statistics of one snippet, or None if it cannot be analysed.

    The snippet is tried as-is first, so code that is already valid is never
    rewritten. The print rewrite and the prompt stripping are only fallbacks.
    """
    if not snippet.strip():
        return None

    candidates = [snippet, _py2_prints_to_calls(snippet)]
    without_prompts = _strip_repl_prompts(snippet)
    # Nothing left after prompt stripping means this was not an interactive
    # session; do not feed an empty string to the metrics.
    if without_prompts.strip():
        candidates.append(without_prompts)
        candidates.append(_py2_prints_to_calls(without_prompts))

    tried = set()
    for candidate in candidates:
        if candidate in tried:
            continue
        tried.add(candidate)
        try:
            return get_code_statistics(candidate)
        except Exception:
            # Best effort: code that cannot be analysed in this form is
            # simply tried in the next form. KeyboardInterrupt is not an
            # Exception subclass, so the loop can still be interrupted.
            continue
    return None


def _extract_snippets(parser, html):
    """Return the code fragments found in ``html``, or [] if it cannot be parsed."""
    try:
        # reset() drops any input HTMLParser buffered from a previous
        # document; refresh_status() clears the collected fragments.
        parser.reset()
        parser.refresh_status()
        parser.feed(html)
    except Exception:
        # e.g. a missing (non-string) answer body
        return []
    return parser.get_status()


def _summarise_answer(parser, html):
    """Aggregate snippet statistics for one answer; None if no code lines were measured."""
    per_snippet = []
    for snippet in _extract_snippets(parser, html):
        stats = _analyse_snippet(snippet)
        if stats is not None:
            per_snippet.append(stats)

    code_lines = [stats['CodeLines'] for stats in per_snippet]
    total_code_lines = np.sum(code_lines)
    if total_code_lines == 0:
        return None

    maintainability = [stats['MaintainabilityIndex'] for stats in per_snippet]
    record = {
        field: sum(stats[field] for stats in per_snippet)
        for field in _SUMMED_FIELDS
    }
    record['NumberOfSnippets'] = len(per_snippet)
    record['TotalCodeLines'] = total_code_lines
    record['AvgSnippetCodeLines'] = np.mean(code_lines)
    # Maintainability index averaged with each snippet weighted by its line count.
    record['MaintainabilityIndexCodeLinesAvg'] = (
        np.multiply(maintainability, code_lines).sum() / total_code_lines
    )
    record['MaintainabilityIndexUniformAvg'] = np.mean(maintainability)
    return record


# One [AnswerId, metrics] pair per answer that contains analysable code.
sg = SnippetsGetter()
code_statistics = []
for answer_id, answer_html in tqdm(zip(ds["AnswerId"], ds["Answer"]), total=len(ds)):
    answer_metrics = _summarise_answer(sg, answer_html)
    if answer_metrics is not None:
        code_statistics.append([answer_id, answer_metrics])

100%|██████████| 27705/27705 [02:13<00:00, 207.52it/s]


In [None]:
# Attach the per-answer code metrics to the original rows. The metrics frame
# carries its key in a temporary "AnswerId_" column that is dropped again
# after the join. pd.merge defaults to an inner join, so only answers that
# have an entry in code_statistics are kept.
ds_new = pd.merge(
    ds,
    pd.DataFrame([
        {"AnswerId_": answer_id, **metrics}
        for answer_id, metrics in code_statistics
    ]),
    left_on="AnswerId",
    right_on="AnswerId_",
).drop(columns=["AnswerId_"])

In [None]:
# Sanity check after the merge: the frame's dimensions together with the
# number of missing values in each column.
ds_new.shape, ds_new.isna().sum()

((27705, 44),
 QuestionId                              0
 AcceptedAnswerId                        0
 QuestionScore                           0
 Question                                0
 Tags                                    0
 AnswerId                                0
 AnswerScore                             0
 Answer                                  0
 AvgCosineSimilarity                   126
 AvgL1NormCosineSimilarity             126
 AvgL2NormCosineSimilarity             126
 SphericalAvgL1Dist                    126
 SphericalAvgL2Dist                    126
 SphericalAvgCosineSimilarity          126
 L2NormDirichletParamsL1Dist           126
 L2NormDirichletParamsL2Dist           126
 L2NormDirichletQAKLDiveregence        126
 L2NormDirichletAQKLDiveregence        126
 ArctanNormDirichletParamsL1Dist       126
 ArctanNormDirichletParamsL2Dist       126
 ArctanNormDirichletQAKLDiveregence    126
 ArctanNormDirichletAQKLDiveregence    126
 DiagNormQAKLDiveregence               1

In [None]:
# Persist the merged frame without its index.
# NOTE(review): this is the same path the notebook reads at the top, so the
# input file is overwritten; a second run from the top would load a file that
# already contains the metric columns -- confirm this is intended.
ds_new.to_csv("dataset.csv", index=False)