<a href="https://colab.research.google.com/github/RafaelCaballero/Julio24/blob/main/code/27SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introducción a la ciencia de datos con Python
Rafa Caballero

## Support Vector Machines

Cargamos y preparamos los datos

In [None]:
import statistics as s
import numpy as np
import pandas as pd

# Build a synthetic two-class dataset made of two Gaussian clusters.
size = 200  # number of samples drawn for each cluster

# Raw variable "a": first cluster ~ N(20, 0.16), second cluster ~ N(40, 0.25).
n = s.NormalDist(mu=20, sigma=0.16)
a1 = n.samples(size, seed=3)
n2 = s.NormalDist(mu=40, sigma=0.25)
# Fix: draw the second cluster from n2. The original sampled from `n` again,
# so the N(40, 0.25) generator was created but never used.
a2 = n2.samples(size, seed=4)
a = np.array(a1 + a2).round(2)

# Raw variable "b": first cluster ~ N(25, 3.2), second cluster ~ N(50, 3.2).
n2 = s.NormalDist(mu=25, sigma=3.2)
b1 = n2.samples(size, seed=5)
n2 = s.NormalDist(mu=50, sigma=3.2)
b2 = n2.samples(size, seed=6)
b = np.array(b1 + b2).round(2)

# The model features are linear combinations of the raw variables.
df = pd.DataFrame({"a": a*0.8 + b*0.02, "b": b*0.5 + 3})

# Label with a linear rule so the classes are linearly separable by
# construction: class 0 above the line 0.8*b + a = 34, class 1 otherwise.
df["c"] = 1
df.loc[(df.b*0.8 + df.a > 16*2+2), "c"] = 0

In [None]:
# Show the labelled dataset: feature columns "a" and "b" plus the class label "c".
df

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Split the frame by class label so each class can be drawn in its own colour.
df0 = df.loc[df["c"] == 0]
df1 = df.loc[df["c"] == 1]

# Class 0 in blue, class 1 in red, using small markers.
for class_frame, colour in ((df0, "b"), (df1, "r")):
    plt.scatter(class_frame["a"], class_frame["b"], color=colour, s=3)
plt.show()

In [None]:
from sklearn.svm import SVC

# Feature matrix and target used to train the linear SVM.
XColumns = ["a","b"]
yColumn = ["c"]
X = df[XColumns]
# scikit-learn expects a 1-D target. df[yColumn] is an (n, 1) DataFrame and
# triggers a DataConversionWarning on fit, so flatten it to shape (n,).
y = df[yColumn].to_numpy().ravel()

# A very large C penalises margin violations so heavily that the classifier
# behaves (in practice) like a hard-margin SVM.
metodo = SVC(kernel='linear', C=1E10)
model = metodo.fit(X, y)

# Last expression of the cell: display the coordinates of the support vectors.
model.support_vectors_

In [None]:
# Draw both classes again and overlay the support vectors in green.
for class_frame, colour in ((df0, "b"), (df1, "r")):
    plt.scatter(class_frame.a, class_frame.b, color=colour, s=3)

support = model.support_vectors_
plt.scatter(support[:, 0], support[:, 1], color="green")
plt.show()

In [None]:
# Plot the two classes together with the SVM decision boundary and margins.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df0.a,df0.b,color="b",s=3)
ax.scatter(df1.a,df1.b,color="r",s=3)

# Evaluate the decision function on a 30x30 grid spanning the current axes.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
# The model was fitted on a DataFrame, so pass the grid with the same column
# names; a bare ndarray makes scikit-learn warn about missing feature names.
xy = pd.DataFrame(np.vstack([XX.ravel(), YY.ravel()]).T, columns=XColumns)
Z = model.decision_function(xy).reshape(XX.shape)

# Level 0 is the decision boundary; levels -1 and +1 are the margins.
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# Circle the support vectors.
ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()

Una propiedad muy interesante de SVM es que realmente solo importan los puntos cercanos a la línea de separación (los vectores de soporte); el resto no va a cambiar nada. Esto lo hace más robusto frente a *outliers*.

In [None]:
# Keep only the points above both thresholds and refit, to compare the
# resulting separator with the one obtained from the full dataset.
# NOTE(review): the thresholds presumably discard points far from the
# boundary, as the preceding markdown suggests - confirm against the plot.
df2 = df[(df.a>16.6) & (df.b>16.1)]
df0 = df2[df2.c==0]
df1 = df2[df2.c==1]

XColumns = ["a","b"]
yColumn = ["c"]
X = df2[XColumns]
# scikit-learn expects a 1-D target. df2[yColumn] is an (n, 1) DataFrame and
# triggers a DataConversionWarning on fit, so flatten it to shape (n,).
y = df2[yColumn].to_numpy().ravel()

metodo = SVC(kernel='linear', C=1E10)
model = metodo.fit(X, y)

# Plot the remaining points with the new decision boundary and margins.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df0.a,df0.b,color="b",s=3)
ax.scatter(df1.a,df1.b,color="r",s=3)

# Evaluate the decision function on a 30x30 grid spanning the current axes.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
# Pass the grid with the training column names; a bare ndarray makes
# scikit-learn warn about missing feature names.
xy = pd.DataFrame(np.vstack([XX.ravel(), YY.ravel()]).T, columns=XColumns)
Z = model.decision_function(xy).reshape(XX.shape)

# Level 0 is the decision boundary; levels -1 and +1 are the margins.
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# Circle the support vectors.
ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()

Vamos ahora a probar con datos no linealmente separables

In [None]:
from sklearn.datasets import make_circles
# Two noisy concentric circles: a dataset that is not linearly separable.
# random_state pins the noise so the figures and the probabilities computed
# later are the same on every Restart & Run All.
X, y = make_circles(100, factor=.1, noise=.1, random_state=0)

# Mirror the arrays in a DataFrame so the plotting code can split by class.
df= pd.DataFrame(X,columns=["a","b"])
df["c"] = y
df0 = df[df.c==0]
df1 = df[df.c==1]

# Class 0 in blue, class 1 in red.
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df0.a,df0.b,color="b",s=3)
ax.scatter(df1.a,df1.b,color="r",s=3)
plt.show()

In [None]:
# Fit a linear-kernel SVM on the circles data to see how a straight
# separator copes with classes that are not linearly separable.
model = SVC(kernel='linear')
model.fit(X, y)

fig, ax = plt.subplots(figsize=(7, 5))
for class_frame, colour in ((df0, "b"), (df1, "r")):
    ax.scatter(class_frame.a, class_frame.b, color=colour, s=3)

# Build a 30x30 evaluation grid over the visible axes range.
x_low, x_high = ax.get_xlim()
y_low, y_high = ax.get_ylim()
XX, YY = np.meshgrid(
    np.linspace(x_low, x_high, 30),
    np.linspace(y_low, y_high, 30),
    indexing="ij",
)
grid_points = np.column_stack([XX.ravel(), YY.ravel()])
Z = model.decision_function(grid_points).reshape(XX.shape)

# Plot decision boundary (level 0) and margins (levels -1 and +1).
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# Plot support vectors as hollow circles.
support = model.support_vectors_
ax.scatter(support[:, 0], support[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import axes3d
# 3-D view of the circles data, using the class label "c" as the height.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
for class_frame, colour in ((df0, "b"), (df1, "r")):
    ax.scatter(class_frame.a, class_frame.b, class_frame.c, color=colour, s=3)
# Optional camera tweak left disabled by the author: ax.view_init(14, 5, 1)
plt.show()

El "Kernel Trick"

In [None]:
# Fit an RBF-kernel SVM on the circles data. probability=True enables
# predict_proba; its calibration step is randomised, so random_state is
# pinned to make the reported probabilities reproducible across runs.
model=SVC(kernel='rbf',probability=True,random_state=0).fit(X, y)

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(df0.a,df0.b,color="b",s=3,label="0")
ax.scatter(df1.a,df1.b,color="r",s=3,label="1")

# Evaluate the decision function on a 30x30 grid spanning the current axes.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30)
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy, xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = model.decision_function(xy).reshape(XX.shape)

# Plot decision boundary (level 0) and margins (levels -1 and +1).
ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# Plot support vectors as hollow circles.
ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')
plt.legend()
plt.show()

In [None]:
# Per-class probabilities for three hand-picked probe points.
probe_points = [[0, 0], [-1, 1], [-0.4, 0]]
model.predict_proba(probe_points)