In [1]:
import pandas as pd
import numpy as np
import pprint
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer

#for market basket analysis (using apriori)
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

#for preprocessing
from mlxtend.preprocessing import TransactionEncoder

In [2]:
def readCSVtoDF(csvData):
    return pd.read_excel(csvData)

rawData = 'data/Data_Cortex_Nuclear.xls'

DF = readCSVtoDF(rawData)

In [3]:
DF['MouseNumber'] = DF.MouseID.apply(lambda x: x.split('_')[0]) 
DF['MeasurementNumber'] = DF.MouseID.apply(lambda x: x.split('_')[1])
DF.drop(['MouseID'], axis=1, inplace=True)  

In [4]:
#replacing missing values with mean values of each column
meanValues = DF.groupby(by=['class']).mean().reset_index()  #mean values of numerical attributes

classValue = 'class'
def handleNullValues(row, column):
    value = row[column]
    if pd.isnull(value):
        return meanValues.loc[meanValues[classValue] == row[classValue], column], True
    return value, False

for i, row in DF.iterrows():  #series for each row
    for column in DF.columns:
        row[column], handled = handleNullValues(row, column)
    DF.iloc[i] = row

In [5]:
def minMaxNormalization(df):
    # copy the dataframe
    df_norm = df.copy()
    # apply min-max scaling
    for column in df_norm.columns[:77]:  #for numeric values
        df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())
        
    return df_norm
    
# call the min_max_scaling function
normalizedDF = minMaxNormalization(DF)

normalizedDF

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,MouseNumber,MeasurementNumber
0,0.151122,0.212885,0.824638,0.612119,0.630482,0.327006,0.448666,0.168257,0.617322,0.232553,...,0.087715,0.102890,0.084580,0.705738,Control,Memantine,C/S,c-CS-m,309,1
1,0.155750,0.188226,0.776455,0.601070,0.585247,0.311887,0.429899,0.154925,0.590173,0.205362,...,0.080692,0.115874,0.093977,0.749771,Control,Memantine,C/S,c-CS-m,309,2
2,0.153459,0.205696,0.793572,0.558911,0.575910,0.306369,0.441381,0.153485,0.607102,0.199194,...,0.080465,0.109050,0.082162,0.868229,Control,Memantine,C/S,c-CS-m,309,3
3,0.125169,0.157688,0.637326,0.468152,0.480646,0.335530,0.444307,0.132074,0.486945,0.205135,...,0.126763,0.164241,0.144543,0.721879,Control,Memantine,C/S,c-CS-m,309,4
4,0.122146,0.157838,0.637787,0.426467,0.441977,0.314976,0.433100,0.129086,0.410194,0.189152,...,0.096959,0.136298,0.149281,0.812053,Control,Memantine,C/S,c-CS-m,309,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0.046197,0.092577,0.365672,0.313684,0.127872,0.312096,0.423753,0.186230,0.488562,0.110645,...,0.597619,0.388482,0.725820,0.504334,Ts65Dn,Saline,S/C,t-SC-s,J3295,11
1076,0.053509,0.097062,0.357235,0.342243,0.157739,0.395191,0.468154,0.190968,0.535204,0.114722,...,0.614760,0.371843,0.614028,0.504110,Ts65Dn,Saline,S/C,t-SC-s,J3295,12
1077,0.035163,0.063556,0.311370,0.165795,0.071655,0.330863,0.384915,0.105509,0.374040,0.080099,...,0.603927,0.484631,0.811962,0.547100,Ts65Dn,Saline,S/C,t-SC-s,J3295,13
1078,0.032018,0.071071,0.337173,0.224787,0.095856,0.306129,0.433086,0.121245,0.409605,0.086829,...,0.640497,0.571336,0.844450,0.529739,Ts65Dn,Saline,S/C,t-SC-s,J3295,14


In [6]:
#converting into format of transaction by TransactionEncoder()
observations = [] 
for i in range(len(normalizedDF)):
    observations.append([str(normalizedDF.values[i,j]) for j in range(83)])
    
observations

[['0.15112239695245439',
  '0.21288505423836743',
  '0.8246378617106147',
  '0.6121193933605023',
  '0.6304823165775357',
  '0.3270058695146853',
  '0.4486663368600233',
  '0.16825719486969132',
  '0.6173220532907517',
  '0.232552785602037',
  '0.1576433840059076',
  '0.5761668226865776',
  '0.7476881196185753',
  '0.598503711911627',
  '0.5747745934891252',
  '0.28669707326349114',
  '0.6506370545005071',
  '0.6981641292727225',
  '0.6227841463852611',
  '0.711198438908822',
  '0.136915065347983',
  '0.41964853740606606',
  '0.3171488772759522',
  '0.5937148262220949',
  '0.6279069014285815',
  '0.5962687482403333',
  '0.6401673629489033',
  '0.6430700625960541',
  '0.7661461416817851',
  '0.29559591981830274',
  '0.5498170737194192',
  '0.7474408486274219',
  '0.09203488607996972',
  '0.5411956554070975',
  '0.1523378406563274',
  '0.6872692643314262',
  '0.5539216745681356',
  '0.46727791533377205',
  '0.5099961024356087',
  '0.4694612591030045',
  '0.4361703027175652',
  '0.5060175

In [7]:
### Transaction dataframe

te=TransactionEncoder()
data=te.fit(observations).transform(observations)
data=pd.DataFrame(data,columns=te.columns_,dtype=int)
data

Unnamed: 0,0.0,0.0006492970970075654,0.0009422525890650537,0.0010551509421525074,0.0012029124645747352,0.0014619331648573423,0.0015326347329770447,0.0015803605987174136,0.001685447540913324,0.002244072664366218,...,Saline,Ts65Dn,c-CS-m,c-CS-s,c-SC-m,c-SC-s,t-CS-m,t-CS-s,t-SC-m,t-SC-s
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1076,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1077,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
1078,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1


In [8]:
#let us return items and ietmsets with atleast 5% support:
freq_items_apriori=apriori(data,min_support=0.01,use_colnames=True)
freq_items_apriori

Unnamed: 0,support,itemsets
0,0.039815,(0.0)
1,0.027778,(0.15581240290360923)
2,0.027778,(0.16459743836166765)
3,0.013889,(0.17458805769938718)
4,0.013889,(0.18193913783512503)
...,...,...
7528,0.013889,"(Saline, S/C, c-SC-s, 0.2898856092881807, Cont..."
7529,0.013889,"(Memantine, S/C, 3418, Ts65Dn, 0.3661996654945..."
7530,0.013889,"(Memantine, S/C, Ts65Dn, 0.3661996654945221, 3..."
7531,0.013889,"(Memantine, S/C, 0.38139057285412975, 3420, 0...."


In [26]:
res=association_rules(freq_items_apriori,metric="lift",min_threshold=0.5)
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(13),(0.0),0.066667,0.039815,0.010185,0.152778,3.837209,0.007531,1.133333
1,(0.0),(13),0.039815,0.066667,0.010185,0.255814,3.837209,0.007531,1.254167
2,(C/S),(0.0),0.486111,0.039815,0.024074,0.049524,1.243854,0.004720,1.010215
3,(0.0),(C/S),0.039815,0.486111,0.024074,0.604651,1.243854,0.004720,1.299837
4,(0.0),(Control),0.039815,0.527778,0.023148,0.581395,1.101591,0.002135,1.128086
...,...,...,...,...,...,...,...,...,...
242919,(50810F),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",0.013889,0.013889,0.013889,1.000000,72.000000,0.013696,inf
242920,(0.2909731140722411),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",0.041667,0.013889,0.013889,0.333333,24.000000,0.013310,1.479167
242921,(Control),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",0.527778,0.013889,0.013889,0.026316,1.894737,0.006559,1.012763
242922,(0.16459743836166765),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",0.027778,0.013889,0.013889,0.500000,36.000000,0.013503,1.972222


In [10]:
#ECLAT 
#Unlike the a priori method, the ECLAT method is not based on the calculation of confidence and lift,
#therefore the ECLAT method is based on the calculation of the support conjunctions of the variables.
#searches in DFS manner 
#It is a more efficient and scalable version of the Apriori algorithm because it uses less memory than Apriori algorithm,  typically faster than the Apriori algorithm.

In [11]:
freq_items_eclat=apriori(data,min_support=0.005,use_colnames=True)
result_eclat=association_rules(freq_items_eclat,metric='support',min_threshold= 0.005,support_only=True) 
result_eclat

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(0.23905673086468002),(0.0),,,0.005556,,,,
1,(0.0),(0.23905673086468002),,,0.005556,,,,
2,(0.2539845113729494),(0.0),,,0.005556,,,,
3,(0.0),(0.2539845113729494),,,0.005556,,,,
4,(0.2909731140722411),(0.0),,,0.005556,,,,
...,...,...,...,...,...,...,...,...,...
260639,(50810F),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",,,0.013889,,,,
260640,(0.2909731140722411),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",,,0.013889,,,,
260641,(Control),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",,,0.013889,,,,
260642,(0.16459743836166765),"(Saline, 0.2539845113729494, C/S, 0.2390567308...",,,0.013889,,,,


In [12]:
from mlxtend.frequent_patterns import fpgrowth
fpgrowth=fpgrowth(data,min_support=0.05,use_colnames=True)
res_fpgrowth=association_rules(fpgrowth,metric="lift",min_threshold=1.3)
res_fpgrowth

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(C/S),(c-CS-m),0.486111,0.138889,0.138889,0.285714,2.057143,0.071373,1.205556
1,(c-CS-m),(C/S),0.138889,0.486111,0.138889,1.000000,2.057143,0.071373,inf
2,(c-CS-m),(Control),0.138889,0.527778,0.138889,1.000000,1.894737,0.065586,inf
3,(Control),(c-CS-m),0.527778,0.138889,0.138889,0.263158,1.894737,0.065586,1.168651
4,(c-CS-m),(Memantine),0.138889,0.527778,0.138889,1.000000,1.894737,0.065586,inf
...,...,...,...,...,...,...,...,...,...
1111,"(t-SC-s, S/C)","(Saline, Ts65Dn)",0.125000,0.222222,0.125000,1.000000,4.500000,0.097222,inf
1112,(Saline),"(Ts65Dn, S/C, t-SC-s)",0.472222,0.125000,0.125000,0.264706,2.117647,0.065972,1.190000
1113,(Ts65Dn),"(Saline, S/C, t-SC-s)",0.472222,0.125000,0.125000,0.264706,2.117647,0.065972,1.190000
1114,(S/C),"(Saline, Ts65Dn, t-SC-s)",0.513889,0.125000,0.125000,0.243243,1.945946,0.060764,1.156250


In [13]:
#######Comparing##########


# **** Apriori ****
import time
l=[0.01,0.02,0.03,0.04,0.05]
time_needed_apriori=[]
for i in l:
    t1=time.time()
    L = apriori(data,min_support=i,use_colnames=True)
    t2=time.time()
    time_needed_apriori.append((t2-t1)*1000)
    
time_needed_eclat=[]
for i in l:
    t1=time.time()
    K = apriori(data,min_support=i,use_colnames=True)
    t2=time.time()
    time_needed_eclat.append((t2-t1)*1000)



In [14]:
time_needed_fpgrowth=[]
k=[0.01,0.02,0.03,0.04,0.05]
for i in k:
    t1=time.time()
    y= fpgrowth(data,min_support=i,use_colnames=True)
    t2=time.time()
    time_needed_fpgrowth.append((t2-t1)*1000)




TypeError: 'DataFrame' object is not callable

In [None]:
sns.lineplot(x=l,y=time_needed_fpgrowth,label="fpgrowth")
sns.lineplot(x=l,y=time_needed_eclat,label="eclat")
sns.lineplot(x=l,y=time_needed_apriori,label="apriori")
plt.xlabel("Min_support")
plt.ylabel("Run Time in ms")