In [205]:
# Splice Junction Classification
import pandas as pd
import numpy as np

In [206]:
# Load the splice-junction records. Because `names=` supplies the column
# labels, pandas reads every line of the file (including the first) as data.
df = pd.read_csv(
    "splice.data",
    names=['class', 'names', 'sequence'],
)

In [207]:
# Plain-text preview of the first five rows of the loaded frame.
print(df.head(n=5))

  class                   names  \
0    EI        ATRINS-DONOR-521   
1    EI        ATRINS-DONOR-905   
2    EI        BABAPOE-DONOR-30   
3    EI       BABAPOE-DONOR-867   
4    EI      BABAPOE-DONOR-2817   

                                            sequence  
0                 CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...  
1                 AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...  
2                 GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...  
3                GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...  
4               GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...  


In [208]:
# Pull the label column out of the frame and preview its first five entries.
y = df.loc[:, 'class']
print(y.head(n=5))

0    EI
1    EI
2    EI
3    EI
4    EI
Name: class, dtype: object


In [209]:
# implement one-hot encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [210]:
# Fit a label encoder on the class strings and keep the resulting integer
# codes in a new 'class_num' column of df.
labelencoder = LabelEncoder()
class_codes = labelencoder.fit_transform(df['class'])
df['class_num'] = class_codes
# Show positional rows 760-769 to inspect the assigned codes.
print(df.iloc[760:770])

    class                     names  \
760    EI        ORAIGECA-DONOR-378   
761    EI        ORAIGECA-DONOR-904   
762    EI       ORAIGECA-DONOR-1314   
763    EI         TARHBB-DONOR-1560   
764    EI         TARHBB-DONOR-1909   
765    EI          TARHBD-DONOR-468   
766    EI          TARHBD-DONOR-817   
767    IE       ATRINS-ACCEPTOR-701   
768    IE      ATRINS-ACCEPTOR-1678   
769    IE      BABAPOE-ACCEPTOR-801   

                                              sequence  class_num  
760               CAGACTGGGTCTACAACAAAACTTTCGGCGGTA...          0  
761               CCTTTGAGGACAGCACCAAGAAGTGTGCAGGTA...          0  
762              CCCTCGTGCGCTCCACGACCAAGACCAGCGGTGA...          0  
763                GGAAGATGTTGGTGGTGAGGCCCTGGGCAGGT...          0  
764                AAATTGCACGTGGATCCTGAGAATTTCAGGGT...          0  
765                 GGAAGATGTTGGTGGTGAGGCCCTGGGCAGG...          0  
766                 AAGCTGCATGTGGATCCTGAGAACTTCAGGG...          0  
767              TTCAGCGGC

In [211]:
# One-hot encode the integer class codes. With handle_unknown='ignore', a
# category not seen during fit does not raise at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
class_indicator = enc.fit_transform(df[['class_num']]).toarray()
# Wrap the dense indicator matrix in a frame with three named columns.
enc_df = pd.DataFrame(class_indicator, columns=['A', 'B', 'C'])
enc_df

Unnamed: 0,A,B,C
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
3185,0.0,0.0,1.0
3186,0.0,0.0,1.0
3187,0.0,0.0,1.0
3188,0.0,0.0,1.0


In [212]:
# Copy the indicator columns into df under hot_* names, pairing the source
# and destination columns by position.
for hot_col, enc_col in zip(['hot_1', 'hot_2', 'hot_3'], ['A', 'B', 'C']):
    df[hot_col] = enc_df[enc_col]

# eliminate the space in sequence
def delete_space(x):
    """Return ``x`` with leading and trailing whitespace removed if it is a string.

    Parameters
    ----------
    x : object
        A single cell value. Strings are stripped; anything else (for example
        a float NaN or None) is passed through untouched.

    Returns
    -------
    object
        The stripped string, or ``x`` itself when it is not a string.
    """
    # isinstance, unlike an exact ``type(x) is str`` comparison, also accepts
    # str subclasses (e.g. numpy.str_), so those values get stripped as well.
    if isinstance(x, str):
        return x.strip()
    return x
    
# Store the whitespace-trimmed sequences in a separate column; the original
# 'sequence' column is left as loaded.
df['sequence_1'] = df['sequence'].apply(delete_space)
print(df.head(n=5))

  class                   names  \
0    EI        ATRINS-DONOR-521   
1    EI        ATRINS-DONOR-905   
2    EI        BABAPOE-DONOR-30   
3    EI       BABAPOE-DONOR-867   
4    EI      BABAPOE-DONOR-2817   

                                            sequence  class_num  hot_1  hot_2  \
0                 CCAGCTGCATCACAGGAGGCCAGCGAGCAGG...          0    1.0    0.0   
1                 AGACCCGCCGGGAGGCGGAGGACCTGCAGGG...          0    1.0    0.0   
2                 GAGGTGAAGGACGTCCTTCCCCAGGAGCCGG...          0    1.0    0.0   
3                GGGCTGCGTTGCTGGTCACATTCCTGGCAGGT...          0    1.0    0.0   
4               GCTCAGCCCCCAGGTCACCCAGGAACTGACGTG...          0    1.0    0.0   

   hot_3                                         sequence_1  
0    0.0  CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCC...  
1    0.0  AGACCCGCCGGGAGGCGGAGGACCTGCAGGGTGAGCCCCACCGCCC...  
2    0.0  GAGGTGAAGGACGTCCTTCCCCAGGAGCCGGTGAGAAGCGCAGTCG...  
3    0.0  GGGCTGCGTTGCTGGTCACATTCCTGGCAGGTATGGGGCGGGGC

In [213]:
# Turn each sequence into one flat 0/1 feature vector, four positions per
# base. Any character other than A/T/C/G contributes four zeros.
BASE_BITS = {
    'A': [0, 0, 0, 1],
    'T': [0, 0, 1, 0],
    'C': [0, 1, 0, 0],
    'G': [1, 0, 0, 0],
}
NO_BITS = [0, 0, 0, 0]

x_raw = df['sequence_1']
X = []
for sequence in x_raw:
    # Concatenate the 4-bit codes of all bases in this sequence.
    X.append([bit for base in sequence for bit in BASE_BITS.get(base, NO_BITS)])
print(X[1900])

[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0]


In [214]:
# Use the original class strings as the target; the hot_1..hot_3 indicator
# columns are not used for y here.
y = df.loc[:, 'class']

In [215]:
# Hold out 15% of the samples as a test split; random_state is fixed so the
# same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [216]:
# implement cross validation
from sklearn.model_selection import cross_val_score

In [217]:
from sklearn import linear_model

# Sweep the L2 strength lamda over 0.01 * 2**k for k = 0..20. LogisticRegression
# takes the inverse strength, so each model gets C = 1 / lamda.
lamda = np.logspace(start=0, stop=20, num=21, endpoint=True, base=2) / 100
c_set = 1 / lamda
for c in c_set:
    logistic = linear_model.LogisticRegression(
        penalty='l2',
        dual=False,
        C=c,
        fit_intercept=True,
        tol=0.0001,
        max_iter=1000,
    )
    # 5-fold cross-validation on the training split only.
    scores = cross_val_score(logistic, X_train, y_train, cv=5)
    mean_score = scores.mean()
    two_std = scores.std() * 2
    print(scores)
    print('lamda =', 1/c)
    print('Accuracy by l2 penalty: %0.3f (+/- %0.3f)' % (mean_score, two_std))
    print("   ")

[0.93001842 0.92250923 0.94095941 0.90590406 0.91143911]
lamda = 0.01
Accuracy by l2 penalty: 0.922 (+/- 0.025)
   
[0.93554328 0.92250923 0.94095941 0.90590406 0.91143911]
lamda = 0.02
Accuracy by l2 penalty: 0.923 (+/- 0.027)
   
[0.9373849  0.92435424 0.93911439 0.9095941  0.91328413]
lamda = 0.04
Accuracy by l2 penalty: 0.925 (+/- 0.024)
   
[0.93922652 0.92435424 0.94280443 0.9095941  0.91328413]
lamda = 0.08
Accuracy by l2 penalty: 0.926 (+/- 0.027)
   
[0.94290976 0.92619926 0.94280443 0.9095941  0.92066421]
lamda = 0.16
Accuracy by l2 penalty: 0.928 (+/- 0.026)
   
[0.946593   0.9298893  0.94649446 0.91697417 0.92435424]
lamda = 0.32
Accuracy by l2 penalty: 0.933 (+/- 0.024)
   
[0.946593   0.93911439 0.9501845  0.91881919 0.92804428]
lamda = 0.64
Accuracy by l2 penalty: 0.937 (+/- 0.023)
   
[0.95395948 0.94649446 0.95387454 0.92250923 0.93173432]
lamda = 1.28
Accuracy by l2 penalty: 0.942 (+/- 0.025)
   
[0.96132597 0.95387454 0.95756458 0.92066421 0.93173432]
lamda = 2.56
Ac

In [218]:
# Hard-coded note listing the L2 lamda values that are compared on the
# held-out split.
l2_compare_note = 'Compare between lamda = 20.48,40.96,81.92,163.84 '
print(l2_compare_note)

Compare between lamda = 20.48,40.96,81.92,163.84 


In [219]:
# Fit one L2-penalised model per shortlisted lamda on the training split and
# report plain accuracy on the held-out split.
# NOTE(review): comparing lamda values by their X_test accuracy lets the test
# split steer model choice — confirm this is intended.
# NOTE(review): the loop variable reuses the name `lamda`, so after this cell
# `lamda` is a float rather than the grid array.
lamda_set = [20.48, 40.96, 81.92, 163.84]
l2_options = dict(penalty='l2', dual=False, fit_intercept=True, tol=0.0001, max_iter=1000)

for lamda in lamda_set:
    logistic = linear_model.LogisticRegression(C=1/lamda, **l2_options)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    # Count of test samples whose predicted label equals the true label.
    accurate = (y_pred == y_test).sum()
    accuracy = accurate / len(y_test)
    print('lamda = ',lamda, ', accuracy =', accuracy)

lamda =  20.48 , accuracy = 0.954070981210856
lamda =  40.96 , accuracy = 0.9519832985386222
lamda =  81.92 , accuracy = 0.9519832985386222
lamda =  163.84 , accuracy = 0.9561586638830898


In [220]:
# Hard-coded summary of the L2 comparison; the numbers are typed in, not
# recomputed from variables.
l2_best_note = 'Best model when lamda = 163.84. Accuracy by l2 penalty: 0.96'
print(l2_best_note)

Best model when lamda = 163.84. Accuracy by l2 penalty: 0.96


In [221]:
# L1 penalty: sweep lamda over 0.01 * 2**k for k = 0..20 with the liblinear
# solver, passing C = 1 / lamda to each model.
lamda = np.logspace(start=0, stop=20, num=21, endpoint=True, base=2) / 100
c_set = 1 / lamda
for c in c_set:
    logistic = linear_model.LogisticRegression(
        penalty='l1',
        dual=False,
        C=c,
        fit_intercept=True,
        solver='liblinear',
        tol=0.0001,
        max_iter=1000,
    )
    # 5-fold cross-validation on the training split only.
    scores = cross_val_score(logistic, X_train, y_train, cv=5)
    mean_text = format(scores.mean(), '.3f')
    two_std_text = format(scores.std() * 2, '.3f')
    print(scores)
    print('lamda =', 1/c)
    print('Accuracy by l1 penalty:', mean_text, '+/-', two_std_text)
    print("   ")

[0.94290976 0.93357934 0.94095941 0.92250923 0.92804428]
lamda = 0.01
Accuracy by l1 penalty: 0.934 +/- 0.015
   
[0.93922652 0.93911439 0.94649446 0.92435424 0.92804428]
lamda = 0.02
Accuracy by l1 penalty: 0.935 +/- 0.016
   
[0.94106814 0.93911439 0.9501845  0.92619926 0.92804428]
lamda = 0.04
Accuracy by l1 penalty: 0.937 +/- 0.018
   
[0.94106814 0.94280443 0.95202952 0.93173432 0.93173432]
lamda = 0.08
Accuracy by l1 penalty: 0.940 +/- 0.015
   
[0.94843462 0.94464945 0.9501845  0.92619926 0.93911439]
lamda = 0.16
Accuracy by l1 penalty: 0.942 +/- 0.017
   
[0.95027624 0.95202952 0.95202952 0.92804428 0.94833948]
lamda = 0.32
Accuracy by l1 penalty: 0.946 +/- 0.018
   
[0.96132597 0.95756458 0.95571956 0.92804428 0.94649446]
lamda = 0.64
Accuracy by l1 penalty: 0.950 +/- 0.024
   
[0.96500921 0.96494465 0.95940959 0.93173432 0.95202952]
lamda = 1.28
Accuracy by l1 penalty: 0.955 +/- 0.025
   
[0.96500921 0.97601476 0.95940959 0.94649446 0.95387454]
lamda = 2.56
Accuracy by l1 pen

In [222]:
# Hard-coded note listing the L1 lamda values that are compared on the
# held-out split.
l1_compare_note = 'Compare between lamda = 1.28, 2.56, 5.12, 10.24'
print(l1_compare_note)

Compare between lamda = 1.28, 2.56, 5.12, 10.24


In [223]:
# Fit one L1-penalised (liblinear) model per shortlisted lamda on the training
# split and report plain accuracy on the held-out split.
# NOTE(review): comparing lamda values by their X_test accuracy lets the test
# split steer model choice — confirm this is intended.
# NOTE(review): the loop variable reuses the name `lamda`, so after this cell
# `lamda` is a float rather than the grid array.
lamda_set = [1.28, 2.56, 5.12, 10.24]
l1_options = dict(
    penalty='l1',
    dual=False,
    fit_intercept=True,
    solver='liblinear',
    tol=0.0001,
    max_iter=1000,
)

for lamda in lamda_set:
    logistic = linear_model.LogisticRegression(C=1/lamda, **l1_options)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    # Count of test samples whose predicted label equals the true label.
    accurate = (y_pred == y_test).sum()
    accuracy = accurate / len(y_test)
    print('lamda = ',lamda, ', accuracy =', accuracy)

lamda =  1.28 , accuracy = 0.9603340292275574
lamda =  2.56 , accuracy = 0.9624217118997912
lamda =  5.12 , accuracy = 0.9665970772442589
lamda =  10.24 , accuracy = 0.9624217118997912


In [224]:
# Hard-coded summary of the L1 comparison; the numbers are typed in, not
# recomputed from variables.
l1_best_note = 'Best model when lamda = 5.12. Accuracy by l1 penalty: 0.97'
print(l1_best_note)

Best model when lamda = 5.12. Accuracy by l1 penalty: 0.97


In [226]:
# Hard-coded overall conclusion; the numbers are typed in, not recomputed.
# NOTE(review): 'Accuacy' in the message is a typo for 'Accuracy'; the text is
# left as-is so it matches the recorded output.
overall_note = 'Best logistic regression with regularization over all is achieved when using l1 penalty, lamda = 5.12. Accuacy on test data is 0.97.'
print(overall_note)

Best logistic regression with regularization over all is achieved when using l1 penalty, lamda = 5.12. Accuacy on test data is 0.97.


In [3]:

# Splice Junction Classification
import pandas as pd
import numpy as np
# Build the regularisation grid on its own to inspect the values:
# 21 points, 2**k / 100 for k = 0..20.
lamda = np.logspace(start=0, stop=20, num=21, endpoint=True, base=2) / 100
print(lamda)

[1.000000e-02 2.000000e-02 4.000000e-02 8.000000e-02 1.600000e-01
 3.200000e-01 6.400000e-01 1.280000e+00 2.560000e+00 5.120000e+00
 1.024000e+01 2.048000e+01 4.096000e+01 8.192000e+01 1.638400e+02
 3.276800e+02 6.553600e+02 1.310720e+03 2.621440e+03 5.242880e+03
 1.048576e+04]
