# Libraries

In [2]:
import pandas as pd
import numpy as np
import math
import pickle

from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
from scipy.io import loadmat

import matlab.engine as engi
import matlab as mat

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src import SMOTE
from src import CFS
from src import metrices

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

import matplotlib.pyplot as plt

# Start matlab service

In [3]:
# Launch one MATLAB engine session and put the project's MATLAB sources on
# its search path so they can be called through `eng`.
eng = engi.start_matlab()
for _matlab_dir in (r'src/matlab_CTKCCA/', r'src/matlab_KS/'):
    eng.addpath(_matlab_dir, nargout=0)

# variables

In [3]:
# Relative path of a CSV file under result/.
result_path = 'result/result.csv'
# NOTE(review): none of the settings in this cell is referenced by the code
# shown here; the meanings below are inferred from the names only — confirm
# against the code that consumes them.
repeats = 20  # presumably the number of experiment repetitions
ratio = 0.1  # presumably a sampling/split ratio
lrank = 70  # presumably a low-rank approximation size (CTKCCA?)
reg = 1E-5  # presumably a regularization constant (CTKCCA?)

# Data loading and Normalizing Data

In [13]:
def load_data(project):
    """Load, merge and clean the metric data of one project.

    Reads ``data/understand_files_all/<project>_understand.csv`` and
    ``data/commit_guru/<project>.csv``, joins them on ``commit_hash``, removes
    identifier columns and incomplete rows, then runs feature selection
    (``apply_cfs``) and min-max scaling on the result.

    Parameters
    ----------
    project : str
        Project name used to build both CSV file names.

    Returns
    -------
    df : pandas.DataFrame
        Merged, cleaned, *unscaled* data with ``Bugs`` as the last column.
    s_df : pandas.DataFrame
        The columns chosen by ``apply_cfs``, scaled with ``MinMaxScaler``,
        followed by the unscaled ``Bugs`` column.
    s_cols : list
        Names of the selected columns, ending with ``'Bugs'``.

    Raises
    ------
    KeyError
        If the commit-guru file lacks one of the columns that are dropped, or
        (from the merge) if either file lacks ``commit_hash``.
    """
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    commit_guru_path = 'data/commit_guru/' + project + '.csv'
    id_cols = ['Kind', 'Name', 'commit_hash']

    understand_df = pd.read_csv(understand_path)
    # Columns that are entirely empty carry no information.
    understand_df = understand_df.dropna(axis=1, how='all')

    # Move the identifier/label columns that exist to the front. Every insert
    # goes to position 0, so the final order is Bugs, commit_hash, Name, Kind.
    cols = understand_df.columns.values.tolist()
    n_leading = 0
    for item in id_cols + ['Bugs']:
        if item in cols:
            cols.remove(item)
            cols.insert(0, item)
            n_leading += 1
    understand_df = understand_df[cols]

    commit_guru_df = pd.read_csv(commit_guru_path)
    commit_guru_df = commit_guru_df.drop(
        labels=['parent_hashes', 'author_name', 'author_email', 'fileschanged',
                'author_date', 'author_date_unix_timestamp', 'commit_message',
                'classification', 'fix', 'contains_bug', 'fixes'],
        axis=1)

    # Rows are duplicates when all metric columns match. The metric columns
    # start after however many leading columns were actually found, instead
    # of assuming that all four are always present.
    understand_df = understand_df.drop_duplicates(subset=cols[n_leading:])
    df = understand_df.merge(commit_guru_df, on='commit_hash')

    # Rotate the first column (Bugs) to the end, then drop the identifiers.
    merged_cols = df.columns.tolist()
    df = df[merged_cols[1:] + [merged_cols[0]]]
    df = df.drop(labels=[c for c in id_cols if c in merged_cols], axis=1)
    # The fresh 0..n-1 index keeps the later column-wise concat row-aligned.
    df = df.dropna().reset_index(drop=True)

    s_df, s_cols = apply_cfs(df)
    y = s_df.Bugs
    X = s_df.drop('Bugs', axis=1)
    feature_names = X.columns
    X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=feature_names)
    s_df = pd.concat([X, y], axis=1)
    return df, s_df, s_cols

def apply_smote(df):
    """Resample ``df`` with the project's SMOTE helper.

    Runs ``SMOTE.smote(df).run()`` and gives the resulting frame the column
    labels of the input (presumably because the helper does not keep them —
    verify against ``src/SMOTE``).

    Returns the resampled frame.
    """
    original_names = df.columns
    resampled = SMOTE.smote(df).run()
    resampled.columns = original_names
    return resampled

def apply_cfs(df):
    """Select feature columns of ``df`` with the project's CFS helper.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Bugs`` column; every other column is treated as a
        candidate feature.

    Returns
    -------
    selected_df : pandas.DataFrame
        ``df`` restricted to the selected features followed by ``Bugs``.
    cols : list
        The selected column names, ending with ``'Bugs'``.
    """
    y = df.Bugs.values
    features = df.drop(labels=['Bugs'], axis=1)
    selected_cols = CFS.cfs(features.values, y)

    # Assumes CFS.cfs returns integer column positions into the feature
    # matrix — TODO confirm against src/CFS. Flatten them so the lookup is a
    # plain 1-D positional index (indexing with a list-wrapped array is
    # multi-dimensional indexing, which current pandas rejects), and resolve
    # them against the feature-only columns so the result does not depend on
    # where 'Bugs' sits in ``df``.
    positions = np.asarray(selected_cols, dtype=int).ravel()
    cols = features.columns[positions].tolist()
    cols.append('Bugs')
    return df[cols], cols

# Matlab integration
## Matlab integration - CTKCCA

In [8]:
def CTKCCA(source_df,target_df):
    """Run the MATLAB ``CTKCCA`` routine on a source and a target frame.

    Each frame's values are transposed before being handed to MATLAB, so one
    MATLAB row corresponds to one DataFrame column. The routine is called
    through the module-level engine ``eng`` and asked for four outputs.

    Returns ``(train_X, train_y, test_X, test_y)``: outputs 0 and 2 as NumPy
    arrays, and the first row of outputs 1 and 3 as Python lists.
    """
    source_matrix = mat.double(source_df.values.T.tolist())
    target_matrix = mat.double(target_df.values.T.tolist())
    outputs = eng.CTKCCA(source_matrix, target_matrix, nargout=4)

    train_X = np.array(outputs[0])
    train_y = np.array(outputs[1]).tolist()[0]
    test_X = np.array(outputs[2])
    test_y = np.array(outputs[3]).tolist()[0]
    return train_X, train_y, test_X, test_y

# Testing using original Data

## get train test data

In [None]:
# Cross-project experiment: for a source project, train one classifier on its
# own (selected, scaled, SMOTE-resampled) metrics and one on CTKCCA-transformed
# metrics, then report both on every project in the list.
# Project names come from the repo_name column of projects.csv.
proj_df = pd.read_csv('projects.csv')
projects = proj_df.repo_name.tolist()
for s_project in projects:
    # Only the scaled, feature-selected frame and its column list are used
    # for the source project.
    _,source_df,s_cols = load_data(s_project)
    source_df = apply_smote(source_df)
    train_y = source_df.Bugs
    train_X = source_df.drop('Bugs',axis = 1)
    # Baseline classifier trained directly on the source project's features.
    clf = LogisticRegression()
    clf.fit(train_X,train_y)
    # NOTE(review): this loop also visits d_project == s_project, so the
    # source project is evaluated against itself — confirm this is intended.
    for d_project in projects:
        original_df,target_df,d_cols = load_data(d_project)
        # With transformed metrics: CTKCCA returns train/test features and
        # labels for the source/target pair.
        trasformed_train_X,trasformed_train_y,trasformed_test_X,trasformed_test_y = CTKCCA(source_df,target_df)
        # Round-trip through a DataFrame so the features become a DataFrame
        # and the labels a Series named 'Buggy'.
        train_df = pd.DataFrame(trasformed_train_X)
        train_df['Buggy'] = trasformed_train_y
        trasformed_train_y = train_df.Buggy
        trasformed_train_X = train_df.drop('Buggy',axis = 1)
        t_clf = LogisticRegression()
        t_clf.fit(trasformed_train_X,trasformed_train_y)
        t_predicted = t_clf.predict(trasformed_test_X)
        print(d_project,"+++++++++++++++++++++++++++++++++++++++++")
        print(classification_report(trasformed_test_y, t_predicted))
        
        # Without transformed metrics: evaluate the baseline classifier on the
        # target project restricted to the source project's selected columns.
        # NOTE(review): original_df is the unscaled frame returned by
        # load_data, while clf was fitted on min-max scaled features — confirm
        # whether the test features should be scaled as well.
        # NOTE(review): this selection presumably fails with KeyError when the
        # target project lacks one of the source's selected columns.
        original_df = original_df[s_cols]
        test_y = original_df.Bugs
        test_X = original_df.drop('Bugs',axis = 1)
        predicted = clf.predict(test_X)
        print(classification_report(test_y, predicted))
    # Stops after the first source project.
    # NOTE(review): looks like a debugging shortcut — confirm before a full run.
    break    