In [1]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")

In [2]:
# load dataset
df = pd.read_csv('mice_protein.csv')

In [3]:
#Visualize dataset
df.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


In [4]:
# No. of rows and columns
df.shape

(1080, 82)

In [5]:
# No.of null valuesin each column
df_null = df.isna().sum() 
df_null.sort_values(ascending=False)

BCL2_N             285
H3MeK4_N           270
BAD_N              213
EGR1_N             210
H3AcK18_N          180
pCFOS_N             75
ELK_N               18
Bcatenin_N          18
MEK_N                7
pNUMB_N              3
TRKA_N               3
JNK_N                3
ERK_N                3
GSK3B_N              3
pP70S6_N             3
CREB_N               3
RSK_N                3
APP_N                3
TIAM1_N              3
RAPTOR_N             3
BRAF_N               3
NR2B_N               3
AMPKA_N              3
DSCR1_N              3
pMTOR_N              3
P38_N                3
MTOR_N               3
CAMKII_N             3
pNR2B_N              3
AKT_N                3
                  ... 
Treatment            0
Genotype             0
CaNA_N               0
SYP_N                0
pS6_N                0
SHH_N                0
pGSK3B_Tyr216_N      0
Ubiquitin_N          0
SNCA_N               0
PSD95_N              0
pCASP9_N             0
P3525_N              0
IL1B_N     

In [6]:
# Delete unused column 
df = df.drop('MouseID',axis=1)

In [7]:
# Drop rows which more null values
df.drop(df[df['MEK_N'].isnull()].index, inplace=True)

In [8]:
# filling the remaining rows with before values
df=df.fillna(method ='bfill')

In [9]:
#dropping null values
df = df.dropna(how='any',axis=0) 

In [10]:
# Verifying all null values are cleared
df_null = df.isna().sum() 
df_null.sort_values(ascending=False)

class           0
pNUMB_N         0
CAMKII_N        0
CREB_N          0
ELK_N           0
ERK_N           0
GSK3B_N         0
JNK_N           0
MEK_N           0
TRKA_N          0
RSK_N           0
APP_N           0
Bcatenin_N      0
SOD1_N          0
MTOR_N          0
P38_N           0
pMTOR_N         0
DSCR1_N         0
AMPKA_N         0
BRAF_N          0
AKT_N           0
pRSK_N          0
pCREB_N         0
ITSN1_N         0
BDNF_N          0
NR1_N           0
NR2A_N          0
pAKT_N          0
pBRAF_N         0
pCAMKII_N       0
               ..
pS6_N           0
pCFOS_N         0
SYP_N           0
H3AcK18_N       0
EGR1_N          0
H3MeK4_N        0
CaNA_N          0
Genotype        0
Treatment       0
P3525_N         0
IL1B_N          0
GluR4_N         0
ADARB1_N        0
pP70S6_N        0
NUMB_N          0
P70S6_N         0
pGSK3B_N        0
pPKCG_N         0
CDK5_N          0
S6_N            0
AcetylH3K9_N    0
GluR3_N         0
RRP1_N          0
BAX_N           0
ARC_N     

In [11]:
#rows and columns after pre-processing
df.shape

(1043, 81)

In [12]:
# Data type of all columns
print(df.dtypes)

DYRK1A_N           float64
ITSN1_N            float64
BDNF_N             float64
NR1_N              float64
NR2A_N             float64
pAKT_N             float64
pBRAF_N            float64
pCAMKII_N          float64
pCREB_N            float64
pELK_N             float64
pERK_N             float64
pJNK_N             float64
PKCA_N             float64
pMEK_N             float64
pNR1_N             float64
pNR2A_N            float64
pNR2B_N            float64
pPKCAB_N           float64
pRSK_N             float64
AKT_N              float64
BRAF_N             float64
CAMKII_N           float64
CREB_N             float64
ELK_N              float64
ERK_N              float64
GSK3B_N            float64
JNK_N              float64
MEK_N              float64
TRKA_N             float64
RSK_N              float64
                    ...   
RRP1_N             float64
BAX_N              float64
ARC_N              float64
ERBB4_N            float64
nNOS_N             float64
Tau_N              float64
G

In [13]:
# Mapping categorical values to numerical values
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder() 
df['Genotype']= label_encoder.fit_transform(df['Genotype']) 
df['Treatment']= label_encoder.fit_transform(df['Treatment']) 
df['Behavior']= label_encoder.fit_transform(df['Behavior']) 
df['class']= label_encoder.fit_transform(df['class'])

In [14]:
# float to int conversion
df= df.astype(int)

In [15]:
#Verifying the data-type
print(df.dtypes)

DYRK1A_N           int32
ITSN1_N            int32
BDNF_N             int32
NR1_N              int32
NR2A_N             int32
pAKT_N             int32
pBRAF_N            int32
pCAMKII_N          int32
pCREB_N            int32
pELK_N             int32
pERK_N             int32
pJNK_N             int32
PKCA_N             int32
pMEK_N             int32
pNR1_N             int32
pNR2A_N            int32
pNR2B_N            int32
pPKCAB_N           int32
pRSK_N             int32
AKT_N              int32
BRAF_N             int32
CAMKII_N           int32
CREB_N             int32
ELK_N              int32
ERK_N              int32
GSK3B_N            int32
JNK_N              int32
MEK_N              int32
TRKA_N             int32
RSK_N              int32
                   ...  
RRP1_N             int32
BAX_N              int32
ARC_N              int32
ERBB4_N            int32
nNOS_N             int32
Tau_N              int32
GFAP_N             int32
GluR3_N            int32
GluR4_N            int32


In [16]:
#Visualization of data
df.head()

Unnamed: 0,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,pELK_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,0,0,0,2,5,0,0,2,0,1,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,2,5,0,0,2,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,2,5,0,0,2,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,2,4,0,0,2,0,1,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,2,4,0,0,2,0,1,...,0,0,0,0,0,1,0,0,0,0


In [17]:
df.shape

(1043, 81)

In [18]:
#Counting number of class from class column
distribution = pd.crosstab(index=df["class"],columns="count")  
distribution

col_0,count
class,Unnamed: 1_level_1
0,150
1,131
2,150
3,135
4,135
5,105
6,135
7,102


In [19]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(df.loc[:,"DYRK1A_N":"Behavior"], df["class"], test_size=0.2,random_state=1)

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
xtrain= scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

## One vs Rest Classifier

In [21]:
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(SVC()).fit(xtrain, ytrain)
yprep=clf.predict(xtest)
print('Test accuracy is {}'.format(accuracy_score(yprep,ytest)))

Test accuracy is 0.9952153110047847


## One vs One Classifier 

In [22]:
from sklearn.multiclass import OneVsOneClassifier
clf=OneVsOneClassifier(SVC()).fit(xtrain, ytrain)
yprep=clf.predict(xtest)
print('Test accuracy is {}'.format(accuracy_score(yprep,ytest)))

Test accuracy is 0.9952153110047847
