In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import svm

#### Read the dataset

In [2]:
from sklearn.utils import shuffle
# Seed NumPy's global RNG first: shuffle() is called without a random_state,
# so the row order it produces is governed by this seed.
np.random.seed(255)
df = shuffle(pd.read_csv('APIC_ESTEE_LDA_SVM.csv'))

In [3]:
# Select the features and outcomes for further analyses (inspect df.columns
# to see everything that is available), normalising three column spellings.
module1_df = df.loc[:, ['PM2.5_group', 'BC_group', 'mcp_1', 'leptin', 'PAI_1', 'TNFa']].rename(
    columns={'PM2.5_group': 'PM2.5_Group', 'mcp_1': 'MCP_1', 'TNFa': 'TNF_a'}
)

# Features: the four marker columns. Target: the PM2.5 group, kept as a
# single-column frame.
module1_data = module1_df[['MCP_1', 'leptin', 'PAI_1', 'TNF_a']]
module1_target = module1_df[['PM2.5_Group']]

In [4]:
# standardize (z-score) each column of module1_data
def standarlize(arr):
    """Return the z-scores of ``arr``.

    The input is centred on ``np.mean(arr)`` and divided by ``np.std(arr)``.
    No guard is applied for a zero standard deviation.
    """
    centred = arr - np.mean(arr)
    return centred / np.std(arr)

# Sanity check used while developing: standarlize(np.array([1, 2, 3, 4, 5]))

# Z-score every feature column on its own (apply() works column-wise by
# default), then preview the first rows.
module1_data = module1_data.apply(standarlize)
module1_data.head()

Unnamed: 0,MCP_1,leptin,PAI_1,TNF_a
93,-0.388082,1.515559,-0.718016,0.410662
88,-0.60497,0.188102,-0.93021,-0.509453
92,-0.948941,0.169406,-0.12487,-0.291531
39,0.859365,-0.560117,-0.026916,-0.243104
75,1.042365,0.350673,0.976587,-0.044553


#### Try SVM on module1_df: hold-out split and 5-fold cross-validation

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows; random_state pins the split.
(
    module1_data_train,
    module1_data_validate,
    module1_target_train,
    module1_target_test,
) = train_test_split(
    module1_data,
    module1_target,
    test_size=0.4,
    random_state=0,
)

In [6]:
# Show (features shape, labels shape) for the training part, then the held-out part.
for features_part, labels_part in (
    (module1_data_train, module1_target_train),
    (module1_data_validate, module1_target_test),
):
    print(features_part.shape, "  ", labels_part.shape)

(78, 4)    (78, 1)
(52, 4)    (52, 1)


In [7]:
# Hold-out evaluation of an SVM classifier (help(svm.SVC) lists the options).
# Earlier attempt, kept for reference: kernel='linear', C=1 scored 0.61 on the
# held-out rows.
# Current attempt: sigmoid kernel. np.ravel flattens the single-column target
# frames into 1-D label arrays.
train_labels = np.ravel(module1_target_train)
holdout_labels = np.ravel(module1_target_test)

apic_svm = svm.SVC(kernel='sigmoid', C=1)
apic_svm.fit(module1_data_train, train_labels)
apic_svm.score(module1_data_validate, holdout_labels)

0.6730769230769231

In [8]:
# Quick look at ravel() vs flatten(): both print the 2-D array as one flat row.
test_ravel = np.arange(1, 7).reshape(2, 3)
for flattened in (test_ravel.ravel(), test_ravel.flatten()):
    print(flattened)

[1 2 3 4 5 6]
[1 2 3 4 5 6]


In [9]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a polynomial-kernel SVM on all rows.
# NOTE(review): module1_data was standardised on the full table before this
# call, so each fold's scaling has already seen its own test rows.
apic_svm = svm.SVC(C=1, kernel='poly')
scores = cross_val_score(
    estimator=apic_svm,
    X=module1_data,
    y=np.ravel(module1_target),
    scoring='accuracy',
    cv=5,
)
print("Scores: ", scores)

Scores:  [0.73076923 0.69230769 0.73076923 0.69230769 0.73076923]


In [10]:
# Summarise the folds: mean accuracy with a band of two standard deviations.
mean_accuracy = scores.mean()
two_sd = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, two_sd))

Accuracy: 0.72 (+/- 0.04)


In [11]:
# Another option is to pass, as the `cv` argument, an iterable yielding (train, test) splits as arrays of indices.
# Iterator refresher: design an iterator that outputs 0 1 1 2 3 5 8 ...

class fib_iterator:
    """Iterator over the Fibonacci numbers 0, 1, 1, 2, 3, 5, ... below ``limit``.

    Parameters
    ----------
    limit : int, default 500
        Exclusive upper bound: iteration stops as soon as the next value to
        yield is ``>= limit``.

    Notes
    -----
    The state is initialised in ``__init__`` so that ``next()`` works on a
    fresh instance; previously it was only created inside ``__iter__`` and
    calling ``next()`` first raised ``AttributeError``.
    Every ``iter()`` call restarts the sequence from 0.
    """

    def __init__(self, limit=500):
        self.limit = limit
        self._reset()

    def _reset(self):
        """Rewind to the start of the sequence."""
        self.curr_num = 0
        self.next_num = 1

    def __iter__(self):
        # Restart on each iter() call so one instance can be looped over repeatedly.
        self._reset()
        return self

    def __next__(self):
        if self.curr_num >= self.limit:
            raise StopIteration
        x = self.curr_num
        # Advance the (current, next) pair one step along the sequence.
        self.curr_num, self.next_num = self.next_num, self.next_num + x
        return x
    
# iter() calls fib_iterator.__iter__, which sets the starting state and returns the instance.
myiter = iter(fib_iterator())

# The loop below is disabled by wrapping it in a string literal; as the cell's
# last expression, the string itself is what gets echoed as the cell output.
'''
for x in myiter:
    print(x)
'''

'\nfor x in myiter:\n    print(x)\n'

#### Apply LDA to the APIC dataset

In [12]:
# Recap of what goes into LDA: the feature-table shape, then the labels
# flattened to a 1-D array (module1_target is rebound to that array here).
print(module1_data.shape)
module1_target = np.asarray(module1_target).ravel()
print(module1_target)

(130, 4)
[0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1
 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1
 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]


In [13]:
# apply 10-fold cross-validation on the apic dataset
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import random
# NOTE(review): this seeds Python's stdlib `random` module; nothing in the
# visible cells draws from it, so it may be a leftover — confirm before removing.
random.seed(111)

# 10-fold cross-validated accuracy of a one-component LDA (SVD solver).
apic_lda = LDA(solver="svd", n_components=1)
scores = cross_val_score(
    apic_lda,
    module1_data,
    np.ravel(module1_target),
    scoring='accuracy',
    cv=10,
)
print(scores)

# The band printed here is one standard deviation of the fold scores.
fold_mean, fold_sd = np.mean(scores), np.std(scores)
print("Mean of the score is %0.2f (+/- %0.2f)" % (fold_mean, fold_sd))

[0.76923077 0.69230769 0.69230769 0.61538462 0.76923077 0.76923077
 0.76923077 0.69230769 0.61538462 0.69230769]
Mean of the score is 0.71 (+/- 0.06)


In [14]:
# Fit LDA once on every row (no cross-validation) to read off its coefficients.
all_labels = np.ravel(module1_target)
apic_lda_no_cv = LDA(n_components=1).fit(module1_data, all_labels)
# In-sample accuracy: computed, but not displayed because it is not the
# cell's last expression.
apic_lda_no_cv.score(module1_data, all_labels)

# Column names first, then the coefficient row.
for item in (module1_data.columns, apic_lda_no_cv.coef_):
    print(item)

Index(['MCP_1', 'leptin', 'PAI_1', 'TNF_a'], dtype='object')
[[ 0.2094876   0.36170297  0.04058519 -0.47559714]]


In [15]:
# Function to analyze the APIC-ESTEE dataset with LDA and SVM
# Obesity-inflammation to classify PM2.5

from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

def apic_lda_vs_svm(training_set=module1_data, validation_set=module1_target, scoring='accuracy', cv=5):
    """Compare a polynomial-kernel SVM with LDA by cross-validation and print the results.

    Parameters
    ----------
    training_set : pandas.DataFrame
        Feature table. Its ``columns`` attribute is printed next to the LDA
        coefficients. The default is whatever ``module1_data`` referred to
        when this function was defined, not its current value.
    validation_set : array-like
        Class labels for the rows of ``training_set``. Despite the name this
        is the target, not a held-out set; it is flattened with ``np.ravel``.
        The default is bound at definition time, like ``training_set``.
    scoring : str, default 'accuracy'
        Forwarded to ``cross_val_score`` for both models.
    cv : default 5
        Forwarded to ``cross_val_score`` for both models. This was previously
        hard-coded to 5.

    Returns
    -------
    None
        Everything is reported through ``print``: per-fold scores, mean and
        standard deviation for each model, then the column names and
        coefficients of an LDA fitted on all rows.
    """
    # Flatten once so every call below sees the same 1-D label array.
    labels = np.ravel(validation_set)

    temp_svm_model = SVC(kernel='poly', C=1)
    temp_lda_model = LDA(solver='svd', n_components=1)

    scores_svm = cross_val_score(temp_svm_model, training_set, labels, cv=cv, scoring=scoring)
    print("Scores of SVM: ", scores_svm)
    scores_lda = cross_val_score(temp_lda_model, training_set, labels, cv=cv, scoring=scoring)
    print("Scores of LDA: ", scores_lda)
    # print the results of the 2 models
    print("Results of SVM is : {:.3f}+/-{:.3f}".format(np.mean(scores_svm), np.std(scores_svm)))
    print("Results of LDA is : {:.3f}+/-{:.3f}".format(np.mean(scores_lda), np.std(scores_lda)))

    # Refit LDA on all rows (no cross-validation) only to expose its
    # coefficients; a distinct local name avoids shadowing the notebook-level
    # `apic_lda_no_cv`.
    full_fit_lda = LDA(n_components=1)
    full_fit_lda.fit(training_set, labels)
    print("Colnames : ", training_set.columns)
    print("Coeff : ", full_fit_lda.coef_)

# Run the comparison on the obesity-inflammation features against the PM2.5 labels.
apic_lda_vs_svm(training_set=module1_data, validation_set=module1_target)

Scores of SVM:  [0.73076923 0.69230769 0.73076923 0.69230769 0.73076923]
Scores of LDA:  [0.73076923 0.65384615 0.76923077 0.73076923 0.73076923]
Results of SVM is : 0.715+/-0.019
Results of LDA is : 0.723+/-0.038
Colnames :  Index(['MCP_1', 'leptin', 'PAI_1', 'TNF_a'], dtype='object')
Coeff :  [[ 0.2094876   0.36170297  0.04058519 -0.47559714]]


In [16]:
# Platelet activation to classify PM2.5/BC
# Reload and reshuffle the source table (inspect df.columns to see what is available).
df = shuffle(pd.read_csv('APIC_ESTEE_LDA_SVM.csv'))

# The selected columns already carry their final names except for the PM2.5 label.
module1_df = df[['PM2.5_group', 'BC_group', 'PLT', 'MPV', 'CD40L', 'p_selectin']].rename(
    columns={'PM2.5_group': 'PM2.5_Group'}
)

module1_target_pm = module1_df[['PM2.5_Group']]
module1_target_bc = module1_df[['BC_group']]
# Features are z-scored column by column before modelling.
module1_data = module1_df[['PLT', 'MPV', 'CD40L', 'p_selectin']].apply(standarlize, axis=0)

# Same features, two outcomes: PM2.5 group first, then BC group.
for module1_labels in (module1_target_pm, module1_target_bc):
    apic_lda_vs_svm(module1_data, module1_labels)

Scores of SVM:  [0.76923077 0.76923077 0.73076923 0.73076923 0.73076923]
Scores of LDA:  [0.76923077 0.76923077 0.73076923 0.73076923 0.73076923]
Results of SVM is : 0.746+/-0.019
Results of LDA is : 0.746+/-0.019
Colnames :  Index(['PLT', 'MPV', 'CD40L', 'p_selectin'], dtype='object')
Coeff :  [[-0.08975294  0.04359842 -0.2168749  -0.34279018]]
Scores of SVM:  [0.76923077 0.76923077 0.76923077 0.73076923 0.73076923]
Scores of LDA:  [0.76923077 0.76923077 0.76923077 0.73076923 0.73076923]
Results of SVM is : 0.754+/-0.019
Results of LDA is : 0.754+/-0.019
Colnames :  Index(['PLT', 'MPV', 'CD40L', 'p_selectin'], dtype='object')
Coeff :  [[-0.08669557  0.13870304 -0.26207041 -0.41704979]]


In [17]:
# Insulin resistance to classify PM2.5/BC
# Reload and reshuffle the source table.
df = shuffle(pd.read_csv('APIC_ESTEE_LDA_SVM.csv'))

# The selected columns already carry their final names except for the PM2.5 label.
module1_df = df[['PM2.5_group', 'BC_group', 'c_peptide', 'HOMA_IR']].rename(
    columns={'PM2.5_group': 'PM2.5_Group'}
)

module1_target_pm = module1_df[['PM2.5_Group']]
module1_target_bc = module1_df[['BC_group']]
# Features are z-scored column by column before modelling.
module1_data = module1_df[['c_peptide', 'HOMA_IR']].apply(standarlize, axis=0)

# Same features, two outcomes: PM2.5 group first, then BC group.
for module1_labels in (module1_target_pm, module1_target_bc):
    apic_lda_vs_svm(module1_data, module1_labels)

Scores of SVM:  [0.76923077 0.76923077 0.73076923 0.80769231 0.73076923]
Scores of LDA:  [0.76923077 0.76923077 0.76923077 0.76923077 0.65384615]
Results of SVM is : 0.762+/-0.029
Results of LDA is : 0.746+/-0.046
Colnames :  Index(['c_peptide', 'HOMA_IR'], dtype='object')
Coeff :  [[ 0.95544237 -0.67772451]]
Scores of SVM:  [0.76923077 0.76923077 0.76923077 0.73076923 0.73076923]
Scores of LDA:  [0.76923077 0.76923077 0.73076923 0.76923077 0.65384615]
Results of SVM is : 0.754+/-0.019
Results of LDA is : 0.738+/-0.045
Colnames :  Index(['c_peptide', 'HOMA_IR'], dtype='object')
Coeff :  [[ 0.76872365 -0.5378133 ]]
