In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from pandas import read_csv
import numpy as np

In [17]:
import plotly.express as px


In [18]:
# import dataset
filename = "bondugula2-6.csv"
raw_data = read_csv(filename, header=1)
print(raw_data.columns)

Index(['Protein', 'No.', 'Res', 'isUnstruct', 'E6', 'E20', 'E22', 'Vkbat',
       'chou_fasman', 'sspro_5', 'gor4', 'dsc', 'jnet', 'psipred',
       '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U'],
      dtype='object')


In [19]:
# clean dataset
data = raw_data.drop(['E6', 'E20'], axis=1)
data = data[data['Res']!='_']

In [20]:

data['num_X'] = data['HAS_H'] + data['HAS_S'] + data['HAS_O'] + data['HAS_U'] # column with num
data['is_switch'] = np.where(data['num_X'] > 1, 1, 0) # create switch where contains multiple X


In [21]:
data.tail(20)

Unnamed: 0,Protein,No.,Res,isUnstruct,E22,Vkbat,chou_fasman,sspro_5,gor4,dsc,jnet,psipred,# homologues,HAS_H,HAS_S,HAS_O,HAS_U,num_X,is_switch
5040,1CY5,78,E,0.064506,,4.0,Other,Helix,Helix,Helix,Other,Other,20,0,0,1,0,1,0
5041,1CY5,79,G,0.066382,,3.0,Other,Other,Helix,Helix,Other,Other,20,0,0,1,0,1,0
5042,1CY5,80,Y,0.064804,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,0,0,0,1,1,0
5043,1CY5,81,K,0.079351,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5044,1CY5,82,D,0.085181,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5045,1CY5,83,L,0.083146,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5046,1CY5,84,A,0.091622,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5047,1CY5,85,A,0.099122,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5048,1CY5,86,L,0.105681,,1.0,Helix,Helix,Helix,Helix,Helix,Helix,20,1,0,0,0,1,0
5049,1CY5,87,L,0.126871,,3.0,Other,Helix,Helix,Other,Helix,Helix,20,1,0,0,0,1,0


In [22]:
# Plotting with plotly

# dot plot
fig = px.scatter(data, x="Protein", y="No.", color=data["is_switch"].astype(str), hover_data=['Res'])
fig.update_traces(marker={"opacity": 1.0})
fig.update_xaxes(type='category')
fig.update_yaxes(type='category')
fig.show()


In [23]:
percent_switches = (data['is_switch'].sum())/(len(data['is_switch']))
print("Percentage of proteins in dataset that are switches is " + str(round(percent_switches*100,2)) + "%")

Percentage of proteins in dataset that are switches is 32.19%


In [24]:
features = ['isUnstruct', 'Vkbat', '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U'] 
# to include other features need one-hot encoding
print(features)
target = "is_switch"

['isUnstruct', 'Vkbat', '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U']


In [25]:
# Modeling with KNN

# split data
train_data, test_data = train_test_split(data, test_size=0.25, random_state=0)
X_train = train_data[features]
y_train = train_data[target]
X_test = test_data[features]
y_test = test_data[target]

# Classic Logistic Regression Model with all quantitative features

In [27]:
# Fit the model on training data, predict on test data
logistic_model = LogisticRegression().fit(X_train, y_train)

In [28]:
preds = logistic_model.predict(X_test)
prob_preds = logistic_model.predict_proba(X_test)
                               

In [29]:
print(features)
print("Coefficients: " + str(logistic_model.coef_))
print("Intercept: " + str(logistic_model.intercept_))

['isUnstruct', 'Vkbat', '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U']
Coefficients: [[2.02478311e-01 1.73415444e-02 6.53282846e-03 7.03349676e+00
  7.12697180e+00 8.06881093e+00 7.82922667e+00]]
Intercept: [-12.09191741]


In [30]:
print(preds)
print(y_test)
print((preds - y_test).sum())

[1 0 1 ... 0 0 1]
4514    1
1843    0
383     1
304     0
4228    0
       ..
488     0
3702    0
3694    0
3730    0
4352    1
Name: is_switch, Length: 1259, dtype: int64
0


# L1 Logistic Model 

In [31]:
l1_logistic = LogisticRegression(penalty = 'l1', solver='liblinear').fit(X_train, y_train)
print(features)
print("Coefficients: " + str(l1_logistic.coef_))
print("Y int: " + str(l1_logistic.intercept_))

['isUnstruct', 'Vkbat', '# homologues', 'HAS_H', 'HAS_S', 'HAS_O', 'HAS_U']
Coefficients: [[ 0.00000000e+00 -5.51824694e-02  2.95727550e-03  1.04361250e+01
   1.05559724e+01  1.15085529e+01  1.12956205e+01]]
Y int: [-16.8145868]


In [32]:
l1_preds = logistic_model.predict(X_test)
l1_prob_preds = logistic_model.predict_proba(X_test)
print(l1_preds)
print(y_test)
print((l1_preds - y_test).sum())
print(l1_logistic.score(X_test, y_test))
                                    

[1 0 1 ... 0 0 1]
4514    1
1843    0
383     1
304     0
4228    0
       ..
488     0
3702    0
3694    0
3730    0
4352    1
Name: is_switch, Length: 1259, dtype: int64
0
1.0


# Lessened Features Logistic Model 

In [33]:
has_X_features = ['HAS_H', 'HAS_S', 'HAS_O', 'HAS_U']
has_logistic_model = LogisticRegression().fit(train_data[has_X_features], y_train)
print(has_X_features)
print("Coefficients " + str(has_logistic_model.coef_))
print("Intercepts " + str(has_logistic_model.intercept_))
print("Score ")
has_logistic_model.score(test_data[has_X_features], y_test)

['HAS_H', 'HAS_S', 'HAS_O', 'HAS_U']
Coefficients [[7.07280552 7.12430609 8.06458427 7.88434781]]
Intercepts [-11.89568281]
Score 


1.0

# Test ROC curves

In [34]:
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

In [40]:
print(l1_prob_preds)

[[0.01678671 0.98321329]
 [0.98154929 0.01845071]
 [0.04138236 0.95861764]
 ...
 [0.99298866 0.00701134]
 [0.98507771 0.01492229]
 [0.04334624 0.95665376]]


In [42]:
fpr, tpr, thresholds = roc_curve(y_test, l1_preds)

# need probability scores for this step

fig_hist = px.histogram(
    x=l1_preds, color=y_test, nbins=50,
    labels=dict(color='True Labels', x='Score')
)
fig_hist.show()