In [22]:
import pandas as pd
import numpy as np
import os
import pathlib
from collections import defaultdict
from tqdm import tqdm
import seaborn as sns
from matplotlib import pyplot as plt
import random

In [2]:
codes_dir = pathlib.Path('./code')
embedding_dir = pathlib.Path('./embeddings')
negative_samples_dir = pathlib.Path('./embeddings/negative_samples')

In [3]:
antipatterns = list(embedding_dir.glob('*'))
antipatterns_dict = {}
for antipattern in antipatterns:
    if antipattern != negative_samples_dir:
        antipatterns_dict[antipattern.name] = list(antipattern.glob('*'))

In [4]:
antipatterns_dict.keys()

dict_keys(['parallel_inheritance_hierarchies', 'god_classes', 'data_class', 'feature_envy'])

In [5]:
labels = defaultdict(lambda : [False, False, False, False, ''])
name2label = {name:i for name, i in zip(antipatterns_dict, range(len(antipatterns_dict)))}
label2name = {value:key for key, value in name2label.items()}
for name, paths in antipatterns_dict.items():
    for path in paths:
        labels[os.path.basename(path)][name2label[name]] = True
        labels[os.path.basename(path)][-1] = path

In [6]:
def get_embedding(filename):
    return np.array([float(x) for x in filename.open().read().split()])

for name, values in labels.items():
    labels[name].append(get_embedding(values[-1]).reshape(384, -1))

In [15]:
negative_samples = random.sample(list(negative_samples_dir.glob('*')), 3000)

In [16]:
for negative_sample in negative_samples:
    embedding = get_embedding(negative_sample).reshape(384, -1)
    labels[negative_sample.name] = [False, False, False, False, negative_sample, embedding]

In [17]:
name2label

{'parallel_inheritance_hierarchies': 0,
 'god_classes': 1,
 'data_class': 2,
 'feature_envy': 3}

In [18]:
def get_train_labels(name):
    return (
        [np.mean(values[-1], axis=1) for values in labels.values()], 
        [values[name2label[name]] for values in labels.values()]
    )

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def fit(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
    svm = SVC()
    svm.fit(X_train, y_train)
    preds = svm.predict(X_test)
    print(classification_report(y_test, preds))

## Data Class

In [None]:
X, y = get_train_labels('data_class')
fit(X, y)

## God Classes

In [None]:
X, y = get_train_labels('god_classes')
fit(X, y)

## Feature envy

In [None]:
X, y = get_train_labels('feature_envy')
fit(X, y)

## Parallel inheritance

In [None]:
X, y = get_train_labels('parallel_inheritance_hierarchies')
fit(X, y)

## Skmultilearn

In [20]:
X, y = (
    np.array([np.mean(values[-1], axis=1) for values in labels.values()]), 
    np.array([values[:4] for values in labels.values()])
)

In [21]:
# Import BinaryRelevance from skmultilearn
from skmultilearn.problem_transform import BinaryRelevance

# Import SVC classifier from sklearn
from sklearn.svm import SVC

# Setup the classifier
classifier = BinaryRelevance(classifier=SVC(verbose=1), require_dense=[False,True])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Train
classifier.fit(X_train, y_train)

[LibSVM][LibSVM][LibSVM][LibSVM]

BinaryRelevance(classifier=SVC(verbose=1), require_dense=[False, True])

In [None]:
y_test[:10]

In [None]:
y_pred[:10].todense()

In [26]:
import pickle

with open("bin_rel_svc_3000_neg_samples.pckl", "wb") as f:
    pickle.dump(classifier, f)

In [23]:
# Predict
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.17      0.29       925
           1       0.74      0.65      0.70       680
           2       0.68      0.67      0.67      1268
           3       0.86      1.00      0.92      4900

   micro avg       0.82      0.81      0.82      7773
   macro avg       0.81      0.62      0.65      7773
weighted avg       0.83      0.81      0.79      7773
 samples avg       0.82      0.77      0.77      7773



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
total = 0
correct = 0
for y, y_hat in zip(y_test, np.asarray(y_pred.todense())):
    if (y == y_hat).all():
        correct += 1
    total += 1

correct / total

0.6289393425957303

## Experiments

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from skmultilearn.problem_transform import ClassifierChain, LabelPowerset
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.cluster.networkx import NetworkXLabelGraphClusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
from skmultilearn.ensemble import LabelSpacePartitioningClassifier

from sklearn.svm import SVC

parameters = {
    'classifier': [BinaryRelevance(), ClassifierChain()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [10, 20, 50],
    
    'clusterer' : [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}

clf = GridSearchCV(LabelSpacePartitioningClassifier(), parameters, scoring = 'f1_macro')
clf.fit(X_train, y_train)

print (clf.best_params_, clf.best_score_)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

parameters = {'k': range(1,6), 's': [0.0, 0.5, 0.7, 1.0]}
score = 'f1_micro'

clf = GridSearchCV(MLkNN(), parameters, scoring=score)
clf.fit(X_train, y_train)

print (clf.best_params_, clf.best_score_)

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
edge_map = graph_builder.transform(y_train)
print("{} labels, {} edges".format(4, len(edge_map)))
print(edge_map)

In [None]:
{(label2name[key[0]],label2name[key[1]]):value for key, value in edge_map.items()}

In [None]:
import networkx as nx
# we define a helper function for visualization purposes
def to_membership_vector(partition):
    return {
        member :  partition_id
        for partition_id, members in enumerate(partition)
        for member in members
    }
clusterer = NetworkXLabelGraphClusterer(graph_builder, method='louvain')
partition = clusterer.fit_predict(X_train,y_train)

membership_vector = to_membership_vector(partition)

names_dict = dict(enumerate(x for x in ['data', 'god', 'envy', 'inheritance']))

In [None]:
nx.draw(
    clusterer.graph_,
    pos=nx.circular_layout(clusterer.graph_),
    labels=names_dict,
    with_labels = True,
    width = [10*x/y_train.shape[0] for x in clusterer.weights_['weight']],
    node_color = [membership_vector[i] for i in range(y_train.shape[1])],
    cmap=plt.cm.Spectral,
    node_size=100,
    font_size=16
)

Reference: [Scikit Multilearn doc](http://scikit.ml/index.html)