In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pingouin as pg
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

### Data loading

In [2]:
origin = pd.read_csv("/Users/rmgaliullin/cam_data_600k/data/all.csv")

  origin = pd.read_csv("/Users/rmgaliullin/cam_data_600k/data/all.csv")


In [3]:
df = origin[["repo", "java_file"]]
columns = ['cc', 'nobl', 'loc', 'hse', 'hsd', 'hsv', 'coco', 'midx', 'nocl']
# Perform inner join on 'repo' and 'java_file'
merged_df = df.copy()
for i in columns:
    temp = pd.read_csv(f"/Users/rmgaliullin/cam_data_600k/data/{i}.csv")
    temp = temp.drop_duplicates(['repo', 'java_file'])
    merged_df = pd.merge(merged_df, temp, on=['repo', 'java_file'], how='inner')

In [4]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606000 entries, 0 to 605999
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   repo       606000 non-null  object 
 1   java_file  606000 non-null  object 
 2   cc         606000 non-null  int64  
 3   nobl       606000 non-null  int64  
 4   loc        606000 non-null  int64  
 5   hse        606000 non-null  float64
 6   hsd        606000 non-null  float64
 7   hsv        606000 non-null  float64
 8   coco       606000 non-null  int64  
 9   midx       606000 non-null  float64
 10  nocl       606000 non-null  int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 50.9+ MB


#### Check that all.csv == concatenation of loc.csv, cc.csv etc

In [5]:
test = pd.merge(origin, merged_df, on=['repo', 'java_file'], suffixes=('_df1', '_df2'), how='right')
test= test[test['repo'] != 'yegor256/jaxec']

assert (test['cc_df1'].astype(int) == test['cc_df2']).all()
assert (test['coco_df1'].astype(int) == test['coco_df2']).all()
assert (test['hsv_df1'].astype(float) == test['hsv_df2']).all()
assert (test['hse_df1'].astype(float) == test['hse_df2']).all()
assert (test['hsd_df1'].astype(float) == test['hsd_df2']).all()
assert (test['loc_df1'].astype(int) == test['loc_df2']).all()
assert (test['nobl_df1'].astype(int) == test['nobl_df2']).all()
assert (test['midx_df1'].astype(float) == test['midx_df2']).all()
assert (test['nocl_df1'].astype(int) == test['nocl_df2']).all()
# pd.merge(origin[origin['loc']=='-'], merged_df, on=['repo', 'java_file'], suffixes=('_df1', '_df2'), how='inner')
merged_df = merged_df[merged_df['repo'] != 'yegor256/jaxec']

In [6]:
result_df = merged_df.copy()
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 605966 entries, 17 to 605999
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   repo       605966 non-null  object 
 1   java_file  605966 non-null  object 
 2   cc         605966 non-null  int64  
 3   nobl       605966 non-null  int64  
 4   loc        605966 non-null  int64  
 5   hse        605966 non-null  float64
 6   hsd        605966 non-null  float64
 7   hsv        605966 non-null  float64
 8   coco       605966 non-null  int64  
 9   midx       605966 non-null  float64
 10  nocl       605966 non-null  int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 55.5+ MB


### Add column bl_ratio = nobl / loc

In [7]:
result_df.loc[:, "bl_ratio"] = merged_df["nobl"] / merged_df["loc"]
result_df.loc[:, "cl_ratio"] = merged_df["nocl"] / merged_df["loc"]

In [8]:
result_df[(result_df['loc'] > 3) & (result_df['loc'] < 40)].head()

Unnamed: 0,repo,java_file,cc,nobl,loc,hse,hsd,hsv,coco,midx,nocl,bl_ratio,cl_ratio
18,databricks/learning-spark,/src/main/java/com/oreilly/learningsparkexampl...,2,0,8,3621.689,11.5,314.929,1,100.0,0,0.0,0.0
19,databricks/learning-spark,/src/main/java/com/oreilly/learningsparkexampl...,1,0,13,9038.089,12.833,704.267,0,100.0,4,0.0,0.307692
20,databricks/learning-spark,/src/main/java/com/oreilly/learningsparkexampl...,1,0,14,9428.815,13.707,687.888,0,100.0,7,0.0,0.5
23,databricks/learning-spark,/src/main/java/com/oreilly/learningsparkexampl...,2,1,12,4849.342,11.974,405.0,1,100.0,0,0.083333,0.0
24,databricks/learning-spark,/src/main/java/com/oreilly/learningsparkexampl...,2,1,32,32819.043,20.798,1578.0,2,99.414,0,0.03125,0.0


#### Shape of datasets: all methods, methods with 3 < loc <= 40 and methods with 3 < loc <= 40 and nobl > 0

In [9]:
# Extract the relevant columns
selected_columns = ['cc', 'nobl', 'loc', 'hse', 'hsd', 'hsv', 'coco', 'midx', 'nocl', 'bl_ratio', 'cl_ratio']
subset_df = result_df[selected_columns]
subset_df_3_40 = subset_df[(subset_df['loc'] <= 40) & (subset_df['loc'] > 3)]
subset_df_3_40_with_non_zero_bl = subset_df[(subset_df['loc'] <= 40) & (subset_df['loc'] > 3) & (subset_df['nobl'] > 0)]
subset_df.shape, subset_df_3_40.shape, subset_df_3_40_with_non_zero_bl.shape

((605966, 11), (256559, 11), (78229, 11))

#### Look at total correlation

In [10]:
corr_df1 = subset_df.corr()[['nobl', 'bl_ratio']]
# corr_df1

In [11]:
corr_df2 = subset_df_3_40.corr()[['nobl', 'bl_ratio']]
# corr_df2

In [12]:
corr_df3 = subset_df_3_40_with_non_zero_bl.corr()[['nobl', 'bl_ratio']]
# corr_df3

In [13]:
# Concatenate the DataFrames with a multi-index
keys = ['All methods', 'Methods 3 < loc < 41', 'Methods 3 < loc < 41, nobl > 0']
result_df = pd.concat([corr_df1, corr_df2, corr_df3], axis=1, keys=keys)
result_df

Unnamed: 0_level_0,All methods,All methods,Methods 3 < loc < 41,Methods 3 < loc < 41,"Methods 3 < loc < 41, nobl > 0","Methods 3 < loc < 41, nobl > 0"
Unnamed: 0_level_1,nobl,bl_ratio,nobl,bl_ratio,nobl,bl_ratio
cc,0.356849,0.142551,0.274359,0.101978,0.228543,-0.210173
nobl,1.0,0.531144,1.0,0.8238,1.0,0.599016
loc,0.72639,0.227269,0.551058,0.275265,0.50945,-0.237877
hse,0.3254,0.035472,0.446565,0.220579,0.391224,-0.095871
hsd,0.523015,0.276023,0.342076,0.182889,0.311213,-0.122074
hsv,0.584066,0.156707,0.535535,0.292651,0.481655,-0.130333
coco,0.301803,0.100712,0.287621,0.101423,0.212888,-0.227866
midx,-0.583841,-0.127105,-0.208192,-0.073215,-0.213674,0.021376
nocl,0.56864,0.251919,0.410333,0.29337,0.400286,0.221354
bl_ratio,0.531144,1.0,0.8238,1.0,0.599016,1.0


### Results

- nobl явно коррелирует с loc, даже в методах, где требуется обязательно хотя бы 1 пустая строка (коэфф > 0.5)
- nobl сильно коррелирует с hs* метриками за счет того, что метода становятся не витиеватыми, а перенасыщенными вызывами и внутренней работой (коэфф около 0.3)
- нужно проверить с чем коррелирует nobl, **невелировав** влияние loc, потому что это известная зависимость

#### Look at partial correlation, controlling loc parametr

In [14]:
control = 'loc'
features = ['nobl', 'bl_ratio', 'cl_ratio']
metrics = ['cc', 'coco', 'hse', 'hsv', 'hsd', 'midx']

partial_correlations = []
# Iterate through feature and metric combinations
for feature in features:
    for metric in metrics:
        # Calculate partial correlation
        partial_corr = pg.partial_corr(data=subset_df, x=metric, y=feature, x_covar=control)
        partial_corr['feature'] = feature
        partial_corr['metric'] = metric
        partial_correlations.append(partial_corr)

print("Correlation between method's property and metric of the code complexity, controlling loc value. All methods")
result_partial_corr = pd.concat(partial_correlations)
result_partial_corr.set_index(['feature', 'metric'], inplace=True)
result_partial_corr

Correlation between method's property and metric of the code complexity, controlling loc value. All methods


Unnamed: 0_level_0,Unnamed: 1_level_0,n,r,CI95%,p-val
feature,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nobl,cc,605966,-0.217338,"[-0.22, -0.21]",0.0
nobl,coco,605966,-0.099415,"[-0.1, -0.1]",0.0
nobl,hse,605966,-0.073582,"[-0.08, -0.07]",0.0
nobl,hsv,605966,-0.063508,"[-0.07, -0.06]",0.0
nobl,hsd,605966,0.05339,"[0.05, 0.06]",0.0
nobl,midx,605966,-0.006933,"[-0.01, -0.0]",6.791614e-08
bl_ratio,cc,605966,-0.0245,"[-0.03, -0.02]",4.132602e-81
bl_ratio,coco,605966,-0.023685,"[-0.03, -0.02]",6.285571e-76
bl_ratio,hse,605966,-0.101462,"[-0.1, -0.1]",0.0
bl_ratio,hsv,605966,-0.069305,"[-0.07, -0.07]",0.0


In [15]:
partial_correlations_3_40 = []
# Iterate through feature and metric combinations
for feature in features:
    for metric in metrics:
        # Calculate partial correlation
        partial_corr = pg.partial_corr(data=subset_df_3_40, x=metric, y=feature, x_covar=control)
        partial_corr['feature'] = feature
        partial_corr['metric'] = metric
        partial_correlations_3_40.append(partial_corr)

print("Correlation between method's property and metric of the code complexity, controlling loc value. Methods 3 < loc < 41")
result_partial_corr_3_40 = pd.concat(partial_correlations_3_40)
result_partial_corr_3_40.set_index(['feature', 'metric'], inplace=True)
result_partial_corr_3_40

Correlation between method's property and metric of the code complexity, controlling loc value. Methods 3 < loc < 41


Unnamed: 0_level_0,Unnamed: 1_level_0,n,r,CI95%,p-val
feature,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nobl,cc,256559,-0.146367,"[-0.15, -0.14]",0.0
nobl,coco,256559,-0.125138,"[-0.13, -0.12]",0.0
nobl,hse,256559,0.07074,"[0.07, 0.07]",7.251133e-282
nobl,hsv,256559,0.122308,"[0.12, 0.13]",0.0
nobl,hsd,256559,0.007627,"[0.0, 0.01]",0.000111898
nobl,midx,256559,-0.031842,"[-0.04, -0.03]",1.50794e-58
bl_ratio,cc,256559,-0.121572,"[-0.13, -0.12]",0.0
bl_ratio,coco,256559,-0.120649,"[-0.12, -0.12]",0.0
bl_ratio,hse,256559,0.031741,"[0.03, 0.04]",3.462307e-58
bl_ratio,hsv,256559,0.109996,"[0.11, 0.11]",0.0


In [16]:
partial_correlations_3_40_with_non_zero_bl = []
# Iterate through feature and metric combinations
for feature in features:
    for metric in metrics:
        # Calculate partial correlation
        partial_corr = pg.partial_corr(data=subset_df_3_40_with_non_zero_bl, x=metric, y=feature, x_covar=control)
        partial_corr['feature'] = feature
        partial_corr['metric'] = metric
        partial_correlations_3_40_with_non_zero_bl.append(partial_corr)

print("Correlation between method's property and metric of the code complexity, controlling loc value. Methods 3 < loc < 41, nobl > 0")
result_partial_corr_3_40_with_non_zero_bl = pd.concat(partial_correlations_3_40_with_non_zero_bl)
result_partial_corr_3_40_with_non_zero_bl.set_index(['feature', 'metric'], inplace=True)
result_partial_corr_3_40_with_non_zero_bl

Correlation between method's property and metric of the code complexity, controlling loc value. Methods 3 < loc < 41, nobl > 0


Unnamed: 0_level_0,Unnamed: 1_level_0,n,r,CI95%,p-val
feature,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nobl,cc,78229,-0.098314,"[-0.11, -0.09]",3.007608e-167
nobl,coco,78229,-0.133004,"[-0.14, -0.13]",1.406153e-305
nobl,hse,78229,0.072108,"[0.07, 0.08]",1.105498e-90
nobl,hsv,78229,0.112081,"[0.11, 0.12]",4.639923e-217
nobl,hsd,78229,0.022647,"[0.02, 0.03]",2.375113e-10
nobl,midx,78229,-0.044439,"[-0.05, -0.04]",1.680458e-35
bl_ratio,cc,78229,-0.083738,"[-0.09, -0.08]",1.005777e-121
bl_ratio,coco,78229,-0.101999,"[-0.11, -0.1]",6.233998e-180
bl_ratio,hse,78229,0.082122,"[0.08, 0.09]",3.939361e-117
bl_ratio,hsv,78229,0.112598,"[0.11, 0.12]",4.6255050000000006e-219


In [17]:
# Concatenate the DataFrames with a multi-index
keys = ['All methods', 'Methods 3 < loc < 41', 'Methods 3 < loc < 41, nobl > 0']
result_partial_corr_df = pd.concat([result_partial_corr.loc[:, 'r'], result_partial_corr_3_40.loc[:, 'r'], result_partial_corr_3_40_with_non_zero_bl.loc[:, 'r']], axis=1, keys=keys)
result_partial_corr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,All methods,Methods 3 < loc < 41,"Methods 3 < loc < 41, nobl > 0"
feature,metric,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
nobl,cc,-0.217338,-0.146367,-0.098314
nobl,coco,-0.099415,-0.125138,-0.133004
nobl,hse,-0.073582,0.07074,0.072108
nobl,hsv,-0.063508,0.122308,0.112081
nobl,hsd,0.05339,0.007627,0.022647
nobl,midx,-0.006933,-0.031842,-0.044439
bl_ratio,cc,-0.0245,-0.121572,-0.083738
bl_ratio,coco,-0.023685,-0.120649,-0.101999
bl_ratio,hse,-0.101462,0.031741,0.082122
bl_ratio,hsv,-0.069305,0.109996,0.112598


### Results

- коэфф корелляции nobl и hsv = 0.12 в Methods 3 < loc < 41	и Methods 3 < loc < 41, nobl > 0
- коэфф корелляции bl_ratio и hsv = 0.12 в Methods 3 < loc < 41	и Methods 3 < loc < 41, nobl > 0
- отрицательная корреляция nobl/bl_ratio с cc обясняется тем, что условные операторы/switch не разбиваются пробелами
- коэфф корелляции bl_ratio и hsd = 0.16 в All methods

In [26]:
correlation_results = []

for column in ['cc', 'loc', 'hse', 'hsd', 'hsv', 'coco', 'midx']:
    correlation_coefficient_nobl, _ = spearmanr(subset_df['bl_ratio'], subset_df[column])
    correlation_coefficient_bl_ratio, _ = spearmanr(subset_df['nobl'], subset_df[column])
    
    correlation_results.append({
            'Variable': column,
            # 'nobl_Coefficient': correlation_coefficient_nobl,
            'bl_ratio_coefficient': correlation_coefficient_bl_ratio,
        })

correlation_df = pd.DataFrame(correlation_results)

correlation_df

Unnamed: 0,Variable,bl_ratio_coefficient
0,cc,0.447345
1,loc,0.553372
2,hse,0.504927
3,hsd,0.423918
4,hsv,0.513503
5,coco,0.443867
6,midx,-0.385191


In [27]:
correlation_results = []

for column in ['cc', 'loc', 'hse', 'hsd', 'hsv', 'coco', 'midx']:
    correlation_coefficient_nobl, _ = spearmanr(subset_df_3_40['bl_ratio'], subset_df_3_40[column])
    correlation_coefficient_bl_ratio, _ = spearmanr(subset_df_3_40['nobl'], subset_df_3_40[column])
    
    correlation_results.append({
            'Variable': column,
            'bl_ratio_coefficient': correlation_coefficient_bl_ratio,
        })

correlation_df = pd.DataFrame(correlation_results)

correlation_df

Unnamed: 0,Variable,bl_ratio_coefficient
0,cc,0.25842
1,loc,0.508142
2,hse,0.485622
3,hsd,0.329522
4,hsv,0.510507
5,coco,0.253127
6,midx,-0.249628


In [28]:
correlation_results = []

for column in ['cc', 'loc', 'hse', 'hsd', 'hsv', 'coco', 'midx']:
    correlation_coefficient_nobl, _ = spearmanr(subset_df_3_40_with_non_zero_bl['bl_ratio'], subset_df_3_40_with_non_zero_bl[column])
    correlation_coefficient_bl_ratio, _ = spearmanr(subset_df_3_40_with_non_zero_bl['nobl'], subset_df_3_40_with_non_zero_bl[column])
    
    correlation_results.append({
            'Variable': column,
            'bl_ratio_coefficient': correlation_coefficient_bl_ratio,
        })

correlation_df = pd.DataFrame(correlation_results)

correlation_df

Unnamed: 0,Variable,bl_ratio_coefficient
0,cc,0.220029
1,loc,0.496382
2,hse,0.483027
3,hsd,0.359947
4,hsv,0.496105
5,coco,0.205281
6,midx,-0.292001
