In [None]:
import seaborn as sns
import pandas as pd
import json
import os.path
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler

# Project-local helper; its return value is used below as a string prefix for data paths.
from Definitions import get_datafolder
basefolder = get_datafolder()


In [None]:
# Analysis parameter file, given relative to the data folder and without extension.
parameterfile = 'ProteinData_ttx_1hr_2/Analysis_dataWindow_3/dataWindow_3_parameters'
#    parameterfile = 'MikeData/Analysis_dataWindow_1/dataWindow_1_parameters'

# Read the JSON parameters; the file is closed as soon as parsing finishes.
parameter_path = basefolder + parameterfile + '.json'
with open(parameter_path) as parameter_stream:
    parameters = json.load(parameter_stream)

# Prefix shared by the analysis output files that are read below.
save_name = basefolder + parameters['save_name']

In [None]:
# Load the cluster table and store the 'type' label column as a pandas categorical.
cluster_table_path = save_name + 'clusterSizes_all.txt'
df = pd.read_csv(cluster_table_path)
# Disabled relabelling rule, kept for reference:
#df.loc[df['similarityScore']>100,'type']='incell';
df['type'] = df['type'].astype('category')

# Visualize Data

In [None]:

# Scatter of clusterSize vs. similarityScore, coloured by the categorical 'type' column.
sns.scatterplot(data=df,x='clusterSize',y='similarityScore',hue='type');

# Disabled alternative view: cross-tabulate clusterSize against type and plot row-normalised counts.
#table = pd.crosstab(df.clusterSize,df.type)
#table.div(table.sum(1).astype(float),axis=0).plot()

In [None]:
# Feature matrix (one row per table entry) and integer class codes from the categorical column.
feature_columns = ['clusterSize', 'similarityScore']
X = np.asarray(df[feature_columns])
Y = np.asarray(df['type'].cat.codes)
# Disabled experiment: append the product of the two features as a third column.
#X_new = X[:,0]*X[:,1]
#X = np.column_stack((X,X_new[:, None]))

In [None]:
# Fit a min-max scaler (default settings) on X and replace X by its rescaled version.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

#  Regression

In [None]:
# Switch between the two linear classifiers; either one is fitted on the current X, Y.
use_logistic_regression = True

if use_logistic_regression:
    # Alternatives tried previously: class_weight={0:0,1:1}, or a Perceptron().
    model = LogisticRegression(C=1e2, verbose=True)
else:
    model = LinearDiscriminantAnalysis()

model.fit(X, Y);

# Evaluate Classifier

In [None]:
# Predicted class codes for the same samples the model was fitted on.
model.predict(X)

In [None]:
# Fraction of samples whose predicted code equals the true code (accuracy on the fitted data).
np.mean(model.predict(X) == Y)

In [None]:
# The estimator's built-in score on the same data it was fitted on.
model.score(X,Y)

In [None]:
h = 0.01  # step size in the mesh

# Grid limits are the exact data extent (an earlier variant padded each side by 0.5).
x_min, x_max = X[:, 0].min(), X[:, 0].max()
y_min, y_max = X[:, 1].min(), X[:, 1].max()

# Regular grid covering the feature plane.
x_ticks = np.arange(x_min, x_max, h)
y_ticks = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_ticks, y_ticks)

# Classify every grid node, then fold the flat prediction vector back onto the grid.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = model.predict(grid_points).reshape(xx.shape)

In [None]:
# Predicted class per grid node as a coloured background, with the samples drawn on top.
paired = plt.cm.Paired
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=paired)

# Samples are coloured by their true class code, using the same colormap.
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=paired)

In [None]:
# Baseline ("no skill"): the same constant score for every sample.
ns_probs = [1] * len(Y)

# Model score: column 1 of predict_proba, evaluated on the fitted data.
lr_probs = model.predict_proba(X)[:, 1]

# Area under the ROC curve for the baseline and for the model.
ns_auc = roc_auc_score(Y, ns_probs)
lr_auc = roc_auc_score(Y, lr_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# ROC curves; the thresholds (third return value) are not used.
ns_fpr, ns_tpr, _ = roc_curve(Y, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(Y, lr_probs)

# Draw both curves on one labelled axis.
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of true vs. predicted codes on the fitted data.
y_pred = model.predict(X)
cm = confusion_matrix(Y, y_pred)

# Build the display object first, then draw it; the handle is kept in cm_display.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot();

# Select region manually

In [None]:
def f1(beta, features=None, labels=None):
    """Objective for choosing a linear cut: minus the mean label above the line.

    A sample (x, y) is selected when ``beta[0] + beta[1]*x - y < 0``, i.e. when it
    lies strictly above the line ``y = beta[0] + beta[1]*x``. The result is the
    negated mean label of the selected samples, so handing this function to a
    minimizer maximizes that mean. Despite the name, this is not the F1 score.

    Parameters
    ----------
    beta : sequence
        ``beta[0]`` is the intercept and ``beta[1]`` the slope; further entries
        are ignored.
    features : array-like with at least two columns, optional
        Column 0 is x and column 1 is y. Defaults to the notebook-level ``X``.
    labels : array-like with one entry per row of ``features``, optional
        Numeric labels. Defaults to the notebook-level ``Y``.

    Returns
    -------
    number
        ``-sum(labels[selected]) / n_selected``, or ``0`` if nothing is selected.
    """
    # Falling back to the globals keeps f1(beta) working exactly as before, while
    # explicit arguments avoid depending on whichever X/Y happened to be assigned last.
    features = np.asarray(X if features is None else features)
    labels = np.asarray(Y if labels is None else labels)

    beta0, beta1 = beta[0], beta[1]
    mark = (beta0 + beta1 * features[:, 0] - features[:, 1]) < 0

    n_selected = np.sum(mark)
    if n_selected == 0:
        # Empty selection: return a neutral value instead of dividing by zero.
        return 0
    return -np.sum(labels[mark]) / n_selected

In [None]:
# Spot-check of the objective. f1 reads only beta[0] and beta[1], so the third entry is ignored.
f1([0.2,0.2,0.2])

In [None]:
# Second spot-check of the objective, for the line with intercept 1 and slope -1.2.
#f1([0.2,1.2,0.2])
f1([1,-1.2])

In [None]:
from scipy.optimize import minimize, rosen, rosen_der

In [None]:
# f1 depends on beta only through a boolean selection mask, so it is piecewise
# constant: a finite-difference gradient is zero almost everywhere and a
# gradient-based method stops at the start point. Use a derivative-free method.
res = minimize(f1, [1, -1.4], method="Nelder-Mead", tol=1e-8);

In [None]:
# Display the result object returned by minimize.
res

In [None]:
def f(x):
    """Rosenbrock-style test function of a 2-vector, equal to zero at (1, 1)."""
    first, second = x[0], x[1]
    return .5 * (1 - first) ** 2 + (second - first ** 2) ** 2


# Sanity check of the optimizer on a smooth function, using conjugate gradients.
minimize(f, x0=[2, -1], method="CG")

# Iris Dataset

In [None]:
# NOTE: X and Y are re-bound here, replacing the arrays built from the cluster table.
iris = datasets.load_iris()
X, Y = iris.data[:, :2], iris.target  # keep only the first two feature columns

# Logistic regression with C=1e5, fitted on the two retained features.
logreg = LogisticRegression(C=1e5)
logreg.fit(X, Y)

In [None]:
# Scatter of the two retained feature columns, coloured by the target value.
sns.scatterplot(x=X[:,0],y=X[:,1],hue=Y)

In [None]:
# Fitted coefficient array; its first row is used in the boundary cell below.
logreg.coef_

In [None]:
# First row of the fitted parameters: the score is w1*x + w2*y + b.
# NOTE(review): Y is iris.target here; if it holds more than two classes, this row
# describes only one of them -- confirm that this is the boundary you intend to draw.
w1, w2 = logreg.coef_[0]
b = logreg.intercept_[0]

# Solving w1*x + w2*y + b = 0 for y gives the line y = m*x + c.
m, c = -w1 / w2, -b / w2

# Fixed plotting window, with the line evaluated at its two x-limits.
xmin, xmax = 4.5, 8
ymin, ymax = 1.6, 4.5
xd = np.array([xmin, xmax])
yd = m * xd + c

# Dashed line, with the regions between the line and each y-limit shaded.
shading = dict(alpha=0.2)
plt.plot(xd, yd, 'k', lw=1, ls='--')
plt.fill_between(xd, yd, ymin, color='tab:blue', **shading)
plt.fill_between(xd, yd, ymax, color='tab:orange', **shading)

In [None]:
h = .02  # step size in the mesh
pad = .5  # margin added around the data on every side

# Padded data extent along both feature axes.
x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad

# Regular grid over the padded extent.
x_ticks = np.arange(x_min, x_max, h)
y_ticks = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_ticks, y_ticks)

# Predict a class for every grid node and fold the result back onto the grid.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = logreg.predict(grid_points).reshape(xx.shape)

In [None]:
# Inspect the x-coordinate grid produced by np.meshgrid.
xx

In [None]:
# Predicted class per grid node as a coloured background, with the samples drawn on top.
paired = plt.cm.Paired
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=paired)

# Samples are coloured by their true target value, using the same colormap.
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=paired)

In [None]:
# Show the hyper-parameters the estimator was configured with.
logreg.get_params()

In [None]:
# 100 x 100 evaluation grid over x in [0, 1500] and y in [0, 200].
# NOTE(review): these ranges look like the unscaled cluster features rather than the
# iris columns that logreg was fitted on -- confirm which model this grid is meant for.
x_axis = np.linspace(0, 1500, 100)
y_axis = np.linspace(0, 200, 100)
xx, yy = np.meshgrid(x_axis, y_axis)

# One row per grid node: (x, y).
Xfull = np.column_stack((xx.ravel(), yy.ravel()))

In [None]:
# Inspect the flattened y-coordinates of the grid.
yy.ravel()

In [None]:
# Class probabilities at every grid node, one row per row of Xfull.
# NOTE(review): logreg was fitted on the iris columns, while Xfull spans 0-1500 x 0-200 -- confirm.
probas = logreg.predict_proba(Xfull)

In [None]:
# imshow expects one 2-D image as its first argument (the second positional slot is
# the colormap), so fold the first probability column back onto the grid and use
# `extent` to label the axes in grid coordinates.
plt.imshow(
    probas[:, 0].reshape(xx.shape),
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    origin='lower',   # row 0 of the grid holds the smallest y value
    aspect='auto',    # the x and y ranges differ a lot; do not force square pixels
);

In [None]:
# pcolormesh needs a 2-D colour array on the grid, so pass the 2-D coordinate
# grids and fold the first probability column back to the grid shape.
plt.pcolormesh(xx, yy, probas[:, 0].reshape(xx.shape));