In [1]:
from math import ceil
from sklearn.utils import shuffle
import pandas as pd
import numpy as np 
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import model_selection 
import seaborn as sns 
from sklearn.model_selection import StratifiedKFold 
from sklearn import metrics
from numpy import mean
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import os

In [2]:
def getDataframe(p):
    """Load a defect-prediction CSV and turn its label column into booleans.

    The label column is ``label`` when the file is named ``JM1.csv`` and
    ``Defective`` for any other file name.  The strings ``"b'N'"`` and
    ``"b'Y'"`` are mapped to ``False`` and ``True``; because ``Series.map`` is
    used with a dict, any other value in that column becomes NaN.

    Args:
        p: Path of the CSV file to read.

    Returns:
        The loaded ``pandas.DataFrame`` with the label column remapped.
    """
    file_name = os.path.split(p)[1]
    print(f"file name : {file_name}")
    label_column = 'label' if file_name == 'JM1.csv' else 'Defective'
    frame = pd.read_csv(p)
    frame[label_column] = frame[label_column].map({"b'N'": False, "b'Y'": True})
    return frame


# preprocessing

def normalize(df):
    """Min-max scale every column except the last one into the range [0, 1].

    The last column (treated as the label by the rest of this notebook) is
    copied through unchanged.  The input frame is not modified.

    Args:
        df: DataFrame whose columns, apart from the last, are numeric.

    Returns:
        A new DataFrame of the same shape with the scaled columns.
    """
    result = df.copy()
    for col in df.columns[:-1]:
        min_value = df[col].min()
        value_range = df[col].max() - min_value
        if value_range == 0:
            # Constant column: (x - min) / (max - min) would be 0/0 and fill
            # the whole column with NaN, so map it to 0.0 instead.
            result[col] = 0.0
        else:
            result[col] = (df[col] - min_value) / value_range
    return result

def preprocessing(df, k=10, drop_selected=True):
    """Rank features with a chi-squared test, filter columns, then min-max scale.

    The last column of ``df`` is used as the target; all other columns are
    scored against it with ``SelectKBest(score_func=chi2)``.

    NOTE(review): with the default ``drop_selected=True`` the ``k``
    highest-scoring features are *removed*, which is what this function has
    always done even though it was commented as "select k best columns".
    The default is kept because ``trainModels`` relies on the columns that
    survive; pass ``drop_selected=False`` to keep only the top ``k`` features
    (plus the target) instead.  Confirm which behaviour is intended.

    Args:
        df: DataFrame with feature columns followed by the target column.
        k: Number of top-scoring features to act on (default 10).
        drop_selected: If True (default) drop the top ``k`` features; if False
            keep only those features and the target column.

    Returns:
        A new, normalized DataFrame (see ``normalize``).
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    selector = SelectKBest(score_func=chi2, k=k).fit(features, target)
    # Index the scores by column name so the top-k names can be read directly.
    scores = pd.Series(selector.scores_, index=features.columns)
    selected_features = scores.nlargest(k).index

    if drop_selected:
        reduced = df.drop(columns=selected_features)
    else:
        reduced = df[list(selected_features) + [df.columns[-1]]]

    # normalize the data
    return normalize(reduced)




# making bags
def sampling(df, random_state=None):
    """Build class-balanced "bags" from an imbalanced DataFrame.

    The last column of ``df`` is the boolean label.  Each bag contains every
    row of the smaller class plus an equally sized random sample (without
    replacement) of the larger class, in shuffled row order.  The number of
    bags is ``ceil(majority / minority) + 2``.

    Args:
        df: DataFrame whose last column holds ``True`` / ``False`` labels.
        random_state: Optional seed.  When given, the bags are reproducible;
            when ``None`` (default) NumPy's global random state is used.

    Returns:
        A list of DataFrames, one per bag.

    Raises:
        ValueError: If one of the two classes has no rows.
    """
    labels = df.iloc[:, -1]
    # Element-wise comparison on purpose: rows whose label is neither True nor
    # False (e.g. NaN) end up in neither group.
    negatives = df[labels == False]
    positives = df[labels == True]

    # Whichever class has more rows is the majority.  Both branches used to
    # assign the same pair, so a True-majority frame crashed in `.sample`.
    if negatives.shape[0] >= positives.shape[0]:
        majority, minority = negatives, positives
    else:
        majority, minority = positives, negatives

    if minority.shape[0] == 0:
        raise ValueError("sampling() needs at least one row of each class")

    # One shared generator so that successive draws differ from each other
    # while the whole sequence stays reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # no of bags
    bags = ceil(majority.shape[0] / minority.shape[0]) + 2
    print(f"majority cnt :{majority.shape[0]} , minority cnt :{minority.shape[0]} and number of bags: {bags}")

    subsets = []
    for _ in range(bags):
        maj_sample = majority.sample(n=minority.shape[0], random_state=rng)
        balanced_subset = pd.concat([minority, maj_sample])
        # sample(frac=1) is a row permutation, i.e. a shuffle of the bag.
        subsets.append(balanced_subset.sample(frac=1, random_state=rng))

    return subsets


In [3]:
# Load the raw JM1 data, then filter and scale its features.  The raw frame
# keeps its own name so that re-running this cell always starts from the file.
raw_df = getDataframe("Dataset/JM1.csv")
df = preprocessing(raw_df)
# Split the data into train and test sets.  `stratify` keeps the ratio of the
# two label values the same in both parts (the classes are imbalanced), and the
# fixed `random_state` makes the split identical on every re-run.
trainSet, testSet = train_test_split(
    df,
    test_size=0.1,
    stratify=df.iloc[:, -1],
    random_state=42,
)


file name : JM1.csv


In [4]:
# Build the balanced bags from the training split only.
balanced_subsets = sampling(trainSet)

majority cnt :7060 , minority cnt :1571 and number of bags: 7


In [5]:
from fastai.tabular.all import *

In [6]:
def trainModels(balanced_subsets, layers=(400, 200, 100, 50), epochs=10, lr=1e-3):
    """Train one fastai tabular learner per balanced subset.

    Every subset is expected to have its feature columns first and the label
    column last (the same layout ``normalize`` and ``sampling`` assume).  All
    feature columns are passed to fastai as continuous variables.

    Previously every learner was fitted on ``balanced_subsets[0]`` and the
    loop variable was ignored, so the ensemble never saw the other bags; each
    learner now trains on its own subset.  Column names are read from the
    subset instead of being hard-coded for one dataset.

    Args:
        balanced_subsets: Iterable of DataFrames, one per bag.
        layers: Hidden layer sizes handed to ``tabular_learner``.
        epochs: Number of epochs for ``fit_one_cycle``.
        lr: Maximum learning rate for ``fit_one_cycle``.

    Returns:
        A list with one trained learner per subset, in input order.
    """
    trained_models = []
    for subset in balanced_subsets:
        feature_names = list(subset.columns[:-1])
        target_name = subset.columns[-1]
        dls = TabularDataLoaders.from_df(
            subset,
            path='.',
            procs=None,
            cont_names=feature_names,
            cat_names=None,
            y_names=target_name,
        )
        learn = tabular_learner(dls, list(layers), metrics=[accuracy, RocAucBinary()])
        learn.fit_one_cycle(epochs, lr)
        trained_models.append(learn)
    return trained_models


In [7]:
# Train the ensemble; `ans` is the list returned by trainModels.
ans = trainModels(balanced_subsets)

In [37]:
def majorityVoteWieghtage(trained_models, testSet):
    # finalpreds = []
    fp = []
    for model in trained_models:
        dl_test = model.dls.test_dl(testSet, with_labels=true)
        preds = model.get_preds(dl=dl_test)
        # print(preds)
        fp.append((preds[1].T.squeeze())
    t1 = torch.stack(fp)
    t1 = t1.T
    print(len(t1))
    finalpreds = []
    cnt = 0
    for i in range(t1.shape[0]):
        
        # if((t1[i] == 1).sum(dim=0) > 0):
        #     cnt+=1
        #     print(f"sum {t1[i]} {(t1[i] == 1).sum(dim=0) > 0}")
        tr = 0
        fl = 0
        for x in t1[i]:
            if x == 1:
                tr+=1
            else:
                fl+=1
        
        if tr > fl:
            cnt+=1
            print(f"tr: {tr} fl: {fl} total {t1.shape[0]}")
            # print()
        finalpreds.append([1 if tr > fl else 0,fl/t1.shape[1],tr/t1.shape[1]])
    
    print(cnt)
    return finalpreds
        


SyntaxError: invalid syntax (<ipython-input-37-b50c6215ffcb>, line 9)

In [46]:
def majorityVoteWieghtage(trained_models, testSet):
    # finalpreds = []
    fp = []
    probs = []
    for model in trained_models:
        dl_test = model.dls.test_dl(testSet, with_labels=True)
        preds = model.get_preds(dl=dl_test)
        # print(preds)
        fp.append(preds[1].T.squeeze())
        probs.append(preds[0].T.squeeze())
    print(probs[0].shape)
    t1 = torch.stack(fp)
    t1 = t1.T
    p1 = torch.stack(probs)
    p1 = p1.T
    print(t1.shape)
    print(t1)
    print(p1.shape)
    print(p1)
    finalpreds = []
    cnt = 0
    for i in range(t1.shape[0]):
        
        # if((t1[i] == 1).sum(dim=0) > 0):
        #     cnt+=1
        #     print(f"sum {t1[i]} {(t1[i] == 1).sum(dim=0) > 0}")
        tr = 0
        fl = 0
        for x in t1[i]:
            if x == 1:
                tr+=1
            else:
                fl+=1
        
        if tr > fl:
            cnt+=1
            print(f"tr: {tr} fl: {fl} total {t1.shape[0]}")
            # print()
        finalpreds.append([1 if tr > fl else 0,fl/t1.shape[1],tr/t1.shape[1]])
    
    print(cnt)
    return finalpreds
        

In [47]:
# Run the ensemble vote on the held-out test set; each entry of `fp` is
# [decision, share of 0-votes, share of 1-votes].
fp = majorityVoteWieghtage(ans,testSet)

torch.Size([2, 960])
torch.Size([960, 7])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)
torch.Size([960, 2, 7])
tensor([[[0.6004, 0.5767, 0.6532,  ..., 0.6154, 0.5954, 0.5434],
         [0.3996, 0.4233, 0.3468,  ..., 0.3846, 0.4046, 0.4566]],

        [[0.5135, 0.5404, 0.5487,  ..., 0.5419, 0.5174, 0.5740],
         [0.4865, 0.4596, 0.4513,  ..., 0.4581, 0.4826, 0.4260]],

        [[0.6652, 0.6511, 0.6070,  ..., 0.6800, 0.6791, 0.5829],
         [0.3348, 0.3489, 0.3930,  ..., 0.3200, 0.3209, 0.4171]],

        ...,

        [[0.4505, 0.3733, 0.3519,  ..., 0.3940, 0.3945, 0.3639],
         [0.5495, 0.6267, 0.6481,  ..., 0.6060, 0.6055, 0.6361]],

        [[0.2328, 0.2391, 0.2467,  ..., 0.1990, 0.2513, 0.2438],
         [0.7672, 0.7609, 0.7533,  ..., 0.8010, 0.7487, 0.7562]],

        [[0.4865, 0.5435

In [121]:
# Ask every trained learner for its prediction on a single test row (all
# columns except the last one are passed in).
# NOTE(review): `iloc[398]` is positional — it is the 399th row of testSet, not
# the row whose index label is 398; confirm which one was intended.
for m in ans:
    print(m.predict(testSet.iloc[398,:-1])[1])

tensor(1)


tensor(1)


tensor(1)


tensor(1)


tensor(1)


tensor(1)


In [118]:
# Show the test rows whose `label` equals 1 (boolean True compares equal to 1).
testSet[testSet.label == 1]

Unnamed: 0,LOC_BLANK,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_ERROR_EST,HALSTEAD_LEVEL,NUM_UNIQUE_OPERATORS,label
6623,0.051454,0.101852,0.020349,0.098081,0.062344,0.073171,0.253677,0.129603,0.096846,0.02,0.085158,True
836,0.006711,0.000000,0.000000,0.002132,0.000000,0.000000,0.027309,0.036896,0.002968,0.06,0.021898,True
708,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.011496,0.007174,0.000371,0.33,0.014599,True
1438,0.004474,0.000000,0.014535,0.008529,0.002494,0.024390,0.026028,0.023027,0.001855,0.10,0.034063,True
398,0.006711,0.000000,0.000000,0.000000,0.000000,0.000000,0.023588,0.019919,0.001484,0.12,0.024331,True
...,...,...,...,...,...,...,...,...,...,...,...,...
165,0.002237,0.000000,0.000000,0.002132,0.002494,0.000000,0.069307,0.017097,0.003340,0.14,0.026764,True
5107,0.006711,0.018519,0.005814,0.021322,0.014963,0.012195,0.077398,0.049330,0.011132,0.05,0.055961,True
5029,0.013423,0.027778,0.005814,0.008529,0.004988,0.000000,0.053003,0.045433,0.007050,0.05,0.043796,True
6540,0.020134,0.000000,0.002907,0.004264,0.000000,0.000000,0.044737,0.054830,0.007050,0.04,0.038929,True


In [107]:
# Tally the ensemble decisions: `a` counts entries of `fp` whose first element
# is 1, `b` counts all the others.
a, b = 0, 0
for x in fp:
    if x[0] != 1:
        b += 1
        continue
    a += 1
print(a, b)

0 960


In [105]:
# Display the transpose of `t`.
# NOTE(review): `t` is not assigned by any earlier cell visible in this
# notebook, so this cell depends on leftover kernel state.
t.T

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)

In [70]:
# Inspect the first element of `fp`.
# NOTE(review): the saved output is a 1-D tensor, whereas majorityVoteWieghtage
# returns a list of 3-item lists — `fp` presumably came from an earlier,
# since-changed cell; confirm.
fp[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,

In [73]:
# NOTE(review): rebinding `t` to its own transpose is not idempotent — running
# this cell a second time flips the orientation back.
t = t.T

In [79]:
# Peek at rows 100-119 of `t`.
t[100:120]

tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]], dtype=torch.int8)

In [85]:
# Count how many entries of the first row of `t` equal 1 and how many equal 0.
tr = (t[0] == 1).sum(dim=0)
fl = (t[0] == 0).sum(dim=0)


In [86]:
# Display both counts.
tr,fl

(tensor(0), tensor(7))

In [None]:
# Draft of the per-sample majority vote.
# NOTE(review): this cell has no execution count, so it appears never to have
# been run; the notes below flag problems visible in the code as written.
finalpreds = []
# NOTE(review): `range()` needs an int; if fp[0] is a tensor, `.shape` is a
# size tuple and this line raises TypeError — confirm which dimension is meant.
for i in range(fp[0].shape):
    # NOTE(review): `t` is also used as a tensor name in other cells; this
    # assignment rebinds it to an int.
    t = 0
    f = 0
    p = None
    # Count, across all entries of fp, how many have a 1 at position i.
    for j in range(len(fp)):
        if fp[j][0][i] == 1:
            t+=1
        else:
            f+=1
    if t > f:
        p = True
    else:
        p = False

    # Share of 0-votes and share of 1-votes among the entries of fp.
    Wmj = f/len(fp) 
    Wmn = t/len(fp)
# NOTE(review): this append sits outside the loop, so only the values from the
# last iteration are stored — it was presumably meant to be indented.
finalpreds.append((p,Wmj,Wmn))   
    
        


In [30]:
# Build a test loader for each of the first two learners and run get_preds.
# `with_labels` now gets the real boolean True; the lowercase `true` used
# before is not a Python keyword (it presumably resolved to a name pulled in
# by the fastai star import — confirm).
dl_test = ans[0].dls.test_dl(testSet, with_labels=True)
preds = ans[0].get_preds(dl=dl_test)
dl_test1 = ans[1].dls.test_dl(testSet, with_labels=True)
# Use the second learner's own loader; this call previously reused `dl_test`.
preds1 = ans[1].get_preds(dl=dl_test1)

# Element [1] of each get_preds result, transposed and squeezed to 1-D.
g = preds[1].T.squeeze()
h = preds1[1].T.squeeze()

In [32]:
# Check the shapes of the two squeezed tensors.
g.shape,h.shape

(torch.Size([960]), torch.Size([960]))

In [43]:
# Check the shape produced by stacking three equal-length tensors.
torch.stack([g,h,g]).shape

torch.Size([3, 960])

In [35]:
# Display the list of trained learners.
ans

[<fastai.tabular.learner.TabularLearner at 0x7fc46beaae20>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46beeb7c0>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46be2f6d0>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46bdf4a60>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46be551c0>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46be12a30>,
 <fastai.tabular.learner.TabularLearner at 0x7fc46bda0f70>]