In [None]:
import operator
import os
from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split
import multiset

from utils import load_embeddings, load_projects, load_stopwords

In [None]:
# Load the project table together with `imapping` (inverted in a later cell so
# that label ids can be turned back into category names).
projects, imapping = load_projects('.', 'java-projects - java-projects (1).csv')
# Lookup table: project name -> label id.
labels = dict(zip(projects['names'], projects['labels_id']))

In [None]:
# Start from the stopwords stored under resources/java, then merge the ones
# stored under resources/en into the same collection.
stopwords = load_stopwords("resources/java/stopwords.txt")
path = "resources/en/stopwords.txt"
stopwords.update(load_stopwords(path))

In [None]:

# Folder holding the per-project `<project>.vec` term-count files.
terms_path = '../data/embeddings/terms-count/'
# Inverse of `imapping`: every value becomes a key pointing back at its key.
mapping = dict(zip(imapping.values(), imapping.keys()))

In [None]:
from textblob import  Word

# Per-category counters (only initialised in this cell).
category_terms_count = defaultdict(Counter)
category_terms_occ = defaultdict(Counter)

# Parallel lists: one space-joined "document" of lemmas per project and the
# category name that project belongs to.
text, lab = [], []

for project, label_id in labels.items():
    category = mapping[label_id]
    try:
        terms_count = load_embeddings(os.path.join(terms_path, f"{project}.vec"))
        terms = []
        for raw_term, values in terms_count.items():
            lemma = Word(raw_term).lemmatize()
            # Skip one-character terms and anything that is a stopword either
            # in its raw or in its lemmatized form.
            if len(raw_term) <= 1 or raw_term in stopwords or lemma in stopwords:
                continue
            # Repeat the lemma as many times as its stored count so that term
            # frequency survives the flattening into plain text.
            terms += [lemma] * int(values[0])
        text.append(" ".join(terms))
        lab.append(category)
    except Exception as e:
        # Best effort: a project whose term file cannot be processed is
        # reported and left out of the corpus.
        print(e)

In [None]:
import pandas

# One row per project: its space-joined term string and its category name.
df = pandas.DataFrame({'text': text, 'label': lab})
# Projects labelled 'NA' or 'Miscellaneous' are excluded from the experiment.
df = df[~df['label'].isin(['NA', 'Miscellaneous'])]

# `y` stays a single-column DataFrame because later cells index it as y['label'].
y = df['label'].to_frame()
X = df['text']

# Stratified 70/30 split. The fixed random_state makes the split -- and every
# metric computed from it in the following cells -- reproducible across kernel
# restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42)

In [None]:
# For every category, pool the terms of all its training documents into one
# multiset (term -> total number of occurrences across those documents).
category_terms_occ = defaultdict(multiset.Multiset)

for label in set(y['label'].tolist()):
    belongs_to_label = y_train['label'] == label
    for document in X_train[belongs_to_label].tolist():
        category_terms_occ[label].update(document.split(" "))

In [None]:
# Evaluate the term-overlap classifier on the test split: each test document is
# assigned the category whose pooled training multiset has the highest
# |intersection| / |union| ratio with the document's own term multiset.
all = 0     # evaluated test samples (shadows the builtin; name kept because later cells read it)
true = 0    # correctly classified test samples
tl = []     # ground-truth label of every evaluated sample
predl = []  # predicted label of every evaluated sample

# Unique labels in order of first appearance. Scoring each label once (instead
# of once per row of `y`) yields the same scores and the same tie-breaking,
# without recomputing identical set operations for every project.
candidate_labels = list(dict.fromkeys(y['label'].tolist()))

for label in candidate_labels:
    corpus = X_test[y_test['label'] == label].tolist()

    for sample in corpus:
        terms = multiset.Multiset(sample.split(" "))
        scores = {}
        for l in candidate_labels:
            cat_terms = category_terms_occ[l]
            scores[l] = len(terms.intersection(cat_terms)) / len(terms.union(cat_terms))

        all += 1
        # Every document in `corpus` was selected because its label is `label`.
        # Indexing the full y_test list with the position inside this per-label
        # subset (as was done before) returned the label of an unrelated sample.
        gt = label
        # max() returns the first best-scoring label, i.e. ties go to the label
        # that appears first in `candidate_labels`.
        pred = max(scores, key=scores.get)
        tl.append(gt)
        predl.append(pred)
        if gt == pred:
            true += 1



In [None]:
# Accuracy of the term-overlap classifier: correct predictions / evaluated test samples.
print(true/all)

In [None]:
# Display the raw count of test samples whose prediction matched the ground truth.
true

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the term-overlap classifier; `tl` is passed as the
# ground truth and `predl` as the predictions.
confusion_matrix(tl, predl)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

# Learn the TF-IDF vocabulary and weights from the training documents only,
# then project both splits into that same feature space.
vectorizer = TfidfVectorizer().fit(X_train)
X_t_mat, X_s_mat = (vectorizer.transform(split) for split in (X_train, X_test))

In [None]:
# k-nearest-neighbours classifier configured to use 7 neighbours.
neigh = KNeighborsClassifier(n_neighbors=7)


In [None]:
# Train the k-NN classifier on the TF-IDF vectors of the training documents.
# `y_train` is a single-column DataFrame; passing its 'label' column as a 1-D
# Series gives the estimator the (n_samples,) target it expects and avoids
# scikit-learn's column-vector DataConversionWarning.
neigh.fit(X_t_mat, y_train['label'])

In [None]:

# Predict a category for every test document from its TF-IDF vector.
predictions = neigh.predict(X_s_mat)

In [None]:
# Confusion matrix of the k-NN predictions against the test labels.
confusion_matrix(y_test, predictions)

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the k-NN predictions on the test split.
print(classification_report(y_test, predictions))

In [None]:
import autosklearn.classification

# Let auto-sklearn search for a classifier on the TF-IDF features.
# NOTE(review): memory_limit=140072 looks unusually large if the unit is MB --
# TODO confirm the intended value.
# NOTE(review): tmp_folder/output_folder are fixed paths under /tmp; confirm
# that re-running this cell with the folders already present is supported.
automl = autosklearn.classification.AutoSklearnClassifier(
    n_jobs=1,
    memory_limit=140072,
    per_run_time_limit=30,
    tmp_folder='/tmp/autosklearn_parallel_1_example_tmp',
    output_folder='/tmp/autosklearn_parallel_1_example_out',
    ensemble_size=1,
    include_preprocessors=["no_preprocessing"],
    exclude_estimators=['liblinear_svc', 'libsvm_svc', 'mlp']
)
automl.fit(X_t_mat, y_train)
y_hat = automl.predict(X_s_mat)
# The value printed is a full classification report, not an accuracy score, so
# it is labelled as such. The label goes on its own line because prefixing the
# report's first line shifts the header row out of alignment with its columns.
print("Classification report (auto-sklearn):")
print(classification_report(y_test, y_hat))

In [None]:
from tpot import TPOTClassifier
# Evolve a scikit-learn pipeline with TPOT on the TF-IDF features
# (5 generations of 50 candidate pipelines, fixed seed).
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2, random_state=42)
# Densify with .toarray(), which returns a plain numpy.ndarray. .todense()
# returns a numpy.matrix, which recent scikit-learn versions refuse as input.
tpot.fit(X_t_mat.toarray(), y_train)
print(tpot.score(X_s_mat.toarray(), y_test))
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_terms_pipeline.py')