<a href="https://colab.research.google.com/github/TobiasM95/IntroDataScience/blob/main/IntroToDataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prologue

## What is Data Science

### - Working with data
### - Where does data come from
### - Quantification of events
### - Where can you find data science
### - What forms of data science exist

## Components of this notebook

### - Introduction to the technology used here
### - Loading and transforming data
### - Visualizing data
### - Generating insights from data
### - Modelling data

# Iris plants Dataset
## What does the dataset contain
### - Classification of 150 plants based on petal and sepal sizes
### - Small toy dataset perfectly suited for a data science introduction
## What is the goal of the analysis
### - Getting a feel for the data
### - Trying out simple data science techniques
### - Small glimpse into advanced machine learning techniques

In [None]:
# Install the third-party packages that the import cell below relies on.
!pip install numpy pandas matplotlib seaborn scikit-learn
# torch is imported below as well; uncomment the next line if the environment
# does not already provide it.
#!pip install torch torchvision torchaudio

In [None]:
# NOTE(review): this extension lives in the google.colab package, so this line
# presumably only works when the notebook runs on Google Colab — confirm before
# running elsewhere.
%load_ext google.colab.data_table
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_iris

from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn

In [None]:
# Load the Iris dataset bundled with scikit-learn and show the returned object
# as the cell output.
dataset = load_iris()
dataset

In [None]:
# Inspect what was loaded: the available keys, the number of target labels,
# and the description text stored in the DESCR attribute.
print(dataset.keys())
print(len(dataset.target))
print(dataset.DESCR)

In [None]:
# Assemble a DataFrame from the raw measurement matrix, then append the numeric
# species id and the matching human-readable species name for every plant.
feature_columns = [
    "sepal length [cm]",
    "sepal width [cm]",
    "petal length [cm]",
    "petal width [cm]",
]
iris_dataframe = pd.DataFrame(data=dataset.data, columns=feature_columns)
iris_dataframe["species id"] = dataset.target

# Look up the name belonging to each numeric target value.
species_names = pd.Series(dataset.target).apply(lambda idx: dataset.target_names[idx])
iris_dataframe["species"] = species_names

iris_dataframe

In [None]:
# Summary statistics for the numeric columns of the DataFrame.
iris_dataframe.describe()

In [None]:
# Box plot of the first four columns of the DataFrame, over all rows at once.
iris_dataframe.boxplot(column=list(iris_dataframe.columns)[:4], figsize=(17,14))

In [None]:
# Same four columns as above, but with the rows grouped by the "species" column.
iris_dataframe.boxplot(column=list(iris_dataframe.columns)[:4], by="species", figsize=(17,14))

In [None]:
# Seaborn box plot of the petal length, one box per species.
sns.boxplot(x="species", y="petal length [cm]", data=iris_dataframe)

## Interlude:
Analysis with and without a known type (and, more generally, with a partially known or uncertain type, e.g. BKS with wrong types)

In [None]:
# Scatter sepal length against sepal width, one colour per species, with a legend.
sns.FacetGrid(iris_dataframe, hue="species", height=6).map(plt.scatter, "sepal length [cm]", "sepal width [cm]").add_legend()

In [None]:
# Pair plot of the DataFrame coloured by species; the numeric "species id"
# column is dropped first so that it is not plotted as if it were a measurement.
sns.pairplot(iris_dataframe.drop("species id", axis=1), hue="species", height=3)

In [None]:
# sklearn offers helpers for this, but we go through the DataFrame manually to
# emulate a real-world workflow.
x = iris_dataframe.iloc[:,0:4].to_numpy()      # feature matrix: the first four columns
y = iris_dataframe["species id"].to_numpy()    # integer class label per row

# Fit on all rows and predict the very same rows, so the numbers printed below
# describe the fit on the training data, not generalisation.
# max_iter is raised above scikit-learn's default of 100 iterations.
logreg = LogisticRegression(random_state=0, max_iter=1000).fit(x, y)
prediction = logreg.predict(x)
print(prediction)

# Number of misclassified rows and the resulting accuracy. Both are derived
# from the actual prediction instead of a hard-coded "4 out of 150".
num_misclassified = (y != prediction).sum()
print(num_misclassified)
print(f"{np.around(100*(1.0 - num_misclassified/len(y)), 1)}%")

In [None]:
# Decision regions of the 4-feature logistic regression, drawn in the
# petal-length / petal-width plane. The two sepal features are frozen at the
# constants below, so the coloured background is a 2-D slice through the
# model's 4-D decision function.
sepal_width = 3
sepal_length = 5.6
h = 0.02  # grid step along both petal axes

# Plot range: the observed petal measurements plus a margin of 0.5 on each side.
x_min = x[:, 2].min() - 0.5
x_max = x[:, 2].max() + 0.5
y_min = x[:, 3].min() - 0.5
y_max = x[:, 3].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# One synthetic sample per grid node: [sepal length, sepal width, petal length, petal width].
sample_size = xx.size
synthetic_data = np.column_stack((
    np.full(sample_size, sepal_length, dtype=float),
    np.full(sample_size, sepal_width, dtype=float),
    xx.ravel(),
    yy.ravel(),
))
Z = logreg.predict(synthetic_data).reshape(xx.shape)

plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the real samples, coloured by their true class.
plt.scatter(x[:, 2], x[:, 3], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Petal length [cm]")
plt.ylabel("Petal width [cm]")

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Embed the four measurement columns into two dimensions with t-SNE and scatter
# the embedded points, coloured by the class labels in `y`.
#
# The iteration count is deliberately left at the library default (1000, the
# value that used to be passed explicitly): the `n_iter` keyword was renamed to
# `max_iter` in scikit-learn 1.5 and removed afterwards, so spelling out either
# name ties the cell to one side of that rename.
tsne = TSNE(n_components=2, random_state=0)
tsne_fit = tsne.fit_transform(iris_dataframe.iloc[:, 0:4].to_numpy())
plt.figure(1, figsize=(10,8))
plt.scatter(tsne_fit[:, 0], tsne_fit[:, 1], c=y, edgecolors="k", s=70)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

In [None]:
# Project the first four DataFrame columns onto two principal components and
# scatter the projected points, coloured by the class labels in `y`.
pca = PCA(n_components=2)
pca_fit = pca.fit_transform(iris_dataframe.iloc[:,0:4])
plt.figure(1, figsize=(10,8))
plt.scatter(pca_fit[:, 0], pca_fit[:, 1], c=y, edgecolors="k", s=70)
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")

In [None]:
# Logistic regression trained on the two PCA components only, with its decision
# regions drawn directly in PCA space (the model is 2-D, so no slicing is needed).
x_min, x_max = pca_fit[:, 0].min() - 0.5, pca_fit[:, 0].max() + 0.5
y_min, y_max = pca_fit[:, 1].min() - 0.5, pca_fit[:, 1].max() + 0.5
h = 0.02  # grid step along both component axes
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One synthetic sample per grid node: [component 1, component 2].
synthetic_data = np.hstack((xx.reshape((-1,1)), yy.reshape((-1,1))))

# A dedicated name keeps the 4-feature `logreg` model intact, so the cell that
# plots its decision regions can still be re-run after this one.
logreg_pca = LogisticRegression(random_state=0, max_iter=1000).fit(pca_fit, y)
Z = logreg_pca.predict(synthetic_data)
# Accuracy on the training data. Worse result than with all four features?
print(f"{np.around(100*(1.0-(logreg_pca.predict(pca_fit) != y).sum()/len(y)), 1)}%")

Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# The axes are principal components here, not petal measurements.
plt.scatter(pca_fit[:, 0], pca_fit[:, 1], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Random forest on all four features, with its decision regions drawn in the
# petal-length / petal-width plane for FIXED sepal values (a 2-D slice through
# the 4-D decision function).
sepal_width = 3
sepal_length = 5.6
x_min, x_max = x[:, 2].min() - 0.5, x[:, 2].max() + 0.5
y_min, y_max = x[:, 3].min() - 0.5, x[:, 3].max() + 0.5
h = 0.02  # grid step along both petal axes
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
sample_size = xx.ravel().shape[0]
# One synthetic sample per grid node: [sepal length, sepal width, petal length, petal width].
synthetic_data = np.hstack((sepal_length*np.ones((sample_size,1)), sepal_width*np.ones((sample_size,1)), xx.reshape((-1,1)), yy.reshape((-1,1))))
#Performance vs overfit: swap in the depth-limited forest below to compare
#rfclass = RandomForestClassifier(max_depth=2, random_state=0).fit(x, y)
rfclass = RandomForestClassifier(random_state=0).fit(x, y)
Z = rfclass.predict(synthetic_data)
# Accuracy on the training data (the forest predicts the rows it was fitted on).
print(f"{np.around(100*(1.0-(rfclass.predict(x) != y).sum()/len(y)), 1)}%")

Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

plt.scatter(x[:, 2], x[:, 3], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Petal length [cm]")
plt.ylabel("Petal width [cm]")

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

# Why can a point sit in a region of a different colour? The background is the
# prediction for synthetic samples whose sepal length/width are fixed to the
# constants above, whereas every scattered point carries its own sepal values.
# The forest's prediction for the real sample can therefore differ from the
# colour of the slice underneath it.
plt.show()

In [None]:
class MLP(torch.nn.Module):
  """Fully connected classifier with ReLU hidden layers and three sigmoid outputs.

  Architecture:
    input -> fc1 -> ReLU -> (num_hidden_layers - 2) x [Linear -> ReLU] -> out -> Sigmoid

  Every hidden-to-hidden layer owns its own weights. Previously a single
  ``fc2`` module was applied repeatedly, so increasing ``num_hidden_layers``
  never added parameters.

  Args:
    input_size: number of input features.
    hidden_size: width of every hidden layer.
    num_hidden_layers: depth setting; values below 2 are clamped to 2, which
      means no extra hidden-to-hidden layer between ``fc1`` and ``out``.
  """
  def __init__(self, input_size, hidden_size, num_hidden_layers):
    super(MLP, self).__init__()
    self.num_hidden_layers = num_hidden_layers if num_hidden_layers > 2 else 2
    self.input_size = input_size
    self.hidden_size  = hidden_size
    self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
    # ModuleList registers each layer, so its parameters reach the optimizer.
    self.hidden_layers = torch.nn.ModuleList(
      [torch.nn.Linear(self.hidden_size, self.hidden_size)
       for _ in range(self.num_hidden_layers - 2)]
    )
    self.relu = torch.nn.ReLU()
    self.out = torch.nn.Linear(self.hidden_size, 3)
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, x):
    """Return one score in (0, 1) per class; the last dimension has size 3."""
    hidden = self.relu(self.fc1(x))
    for layer in self.hidden_layers:
      hidden = self.relu(layer(hidden))
    output = self.out(hidden)
    output = self.sigmoid(output)
    return output

def train_model(model, optimizer, epochs, x, y, criterion):
  """Train ``model`` with full-batch gradient steps.

  Args:
    model: module called as ``model(x_train)``; its output is passed to ``criterion``.
    optimizer: optimizer holding the model's parameters.
    epochs: number of gradient steps; every step uses all rows of ``x``.
    x: inputs (array-like or tensor), converted to a float tensor.
    y: targets (array-like or tensor), converted to a float tensor.
    criterion: loss called as ``criterion(prediction, target)``.

  The loss is printed roughly 20 times over the run.
  """
  # as_tensor accepts arrays, lists and existing tensors alike.
  x_train = torch.as_tensor(x, dtype=torch.float)
  y_train = torch.as_tensor(y, dtype=torch.float)
  # At least 1: with fewer than 20 epochs, `epochs // 20` is 0 and the modulo
  # below would raise ZeroDivisionError.
  log_every = max(1, epochs // 20)
  model.train()
  for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(x_train)
    # Compute Loss
    loss = criterion(y_pred, y_train)

    if epoch % log_every == 0:
      print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
    # Backward pass
    loss.backward()
    optimizer.step()

In [None]:
# Train the MLP on all four features and draw its decision regions in the
# petal-length / petal-width plane for fixed sepal values (a 2-D slice).
torch.manual_seed(0)  # reproducible weight initialisation, hence reproducible results

sepal_width = 3
sepal_length = 5.6
x_min, x_max = x[:, 2].min() - 0.5, x[:, 2].max() + 0.5
y_min, y_max = x[:, 3].min() - 0.5, x[:, 3].max() + 0.5
h = 0.02  # grid step along both petal axes
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
sample_size = xx.ravel().shape[0]
# One synthetic sample per grid node: [sepal length, sepal width, petal length, petal width].
synthetic_data = np.hstack((sepal_length*np.ones((sample_size,1)), sepal_width*np.ones((sample_size,1)), xx.reshape((-1,1)), yy.reshape((-1,1))))

model = MLP(4, 50, 7)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

# One-hot encode the labels: the network has one sigmoid output per class and
# is trained with a binary cross-entropy on each of them.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y.reshape((-1,1)))
y_oh = enc.transform(y.reshape((-1,1))).toarray()
train_model(model, optimizer, 10000, x, y_oh, torch.nn.BCELoss())

# Inference only: no autograd graph is needed.
model.eval()
with torch.no_grad():
  Z_oh = model(torch.as_tensor(synthetic_data, dtype=torch.float)).numpy()
  x_pred_oh = model(torch.as_tensor(x, dtype=torch.float)).numpy()

# Decode the soft scores explicitly: pick the class with the highest score.
classes = enc.categories_[0]
Z = classes[Z_oh.argmax(axis=1)]
x_pred = classes[x_pred_oh.argmax(axis=1)]
# Accuracy on the training data.
print(f"{np.around(100*(1.0-(x_pred != y).sum()/len(y)), 1)}%")

Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

plt.scatter(x[:, 2], x[:, 3], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Petal length [cm]")
plt.ylabel("Petal width [cm]")

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Train the MLP on the two PCA components and draw its decision regions
# directly in PCA space (the model is 2-D, so no slicing is involved).
torch.manual_seed(0)  # reproducible weight initialisation, hence reproducible results

x_min, x_max = pca_fit[:, 0].min() - 0.5, pca_fit[:, 0].max() + 0.5
y_min, y_max = pca_fit[:, 1].min() - 0.5, pca_fit[:, 1].max() + 0.5
h = 0.02  # grid step along both component axes
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# One synthetic sample per grid node: [component 1, component 2].
synthetic_data = np.hstack((xx.reshape((-1,1)), yy.reshape((-1,1))))

model = MLP(2, 50, 7)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

# One-hot encode the labels: one sigmoid output per class, trained with BCE.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y.reshape((-1,1)))
y_oh = enc.transform(y.reshape((-1,1))).toarray()
train_model(model, optimizer, 10000, pca_fit, y_oh, torch.nn.BCELoss())

# Inference only: no autograd graph is needed.
model.eval()
with torch.no_grad():
  Z_oh = model(torch.as_tensor(synthetic_data, dtype=torch.float)).numpy()
  x_pred_oh = model(torch.as_tensor(pca_fit, dtype=torch.float)).numpy()

# Decode the soft scores explicitly: pick the class with the highest score.
classes = enc.categories_[0]
Z = classes[Z_oh.argmax(axis=1)]
x_pred = classes[x_pred_oh.argmax(axis=1)]
# Accuracy on the training data.
print(f"{np.around(100*(1.0-(x_pred != y).sum()/len(y)), 1)}%")

Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# The axes are principal components here, not petal measurements.
plt.scatter(pca_fit[:, 0], pca_fit[:, 1], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

# Unlike the petal-plane plots, nothing is held fixed here: a point lying in a
# region of a different colour is a sample the model misclassifies.
plt.show()

In [None]:
class AutoEncoder(torch.nn.Module):
  """Symmetric fully connected auto-encoder with a ReLU-activated bottleneck.

  Encoder: input -> fc1 -> ReLU -> (n - 2) x [Linear -> ReLU] -> fc3 -> ReLU = latent
  Decoder: latent -> fc4 -> ReLU -> (n - 2) x [Linear -> ReLU] -> out
  where n is ``num_hidden_layers_each`` clamped to at least 2.

  Every hidden-to-hidden layer owns its own weights. Previously a single
  ``fc2`` module was reused for all of them, in the encoder and the decoder
  alike, so the two halves shared parameters and the depth setting never
  added any.

  Args:
    input_size: number of input (and reconstructed) features.
    hidden_size: width of every hidden layer.
    latent_size: width of the bottleneck.
    num_hidden_layers_each: depth setting applied to encoder and decoder.
  """
  def __init__(self, input_size, hidden_size, latent_size, num_hidden_layers_each):
    super(AutoEncoder, self).__init__()
    self.num_hidden_layers = num_hidden_layers_each if num_hidden_layers_each > 2 else 2
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.latent_size = latent_size
    num_extra_layers = self.num_hidden_layers - 2
    self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
    # ModuleList registers each layer, so its parameters reach the optimizer.
    self.encoder_hidden = torch.nn.ModuleList(
      [torch.nn.Linear(self.hidden_size, self.hidden_size) for _ in range(num_extra_layers)]
    )
    self.fc3 = torch.nn.Linear(self.hidden_size, self.latent_size)
    self.fc4 = torch.nn.Linear(self.latent_size, self.hidden_size)
    self.decoder_hidden = torch.nn.ModuleList(
      [torch.nn.Linear(self.hidden_size, self.hidden_size) for _ in range(num_extra_layers)]
    )
    self.out = torch.nn.Linear(self.hidden_size, self.input_size)
    self.relu = torch.nn.ReLU()
    # Not applied in forward(); kept so the attribute stays available.
    self.sigmoid = torch.nn.Sigmoid()

  def forward(self, x):
    """Return ``(latent, output)``: the bottleneck activation and the reconstruction."""
    hidden = self.relu(self.fc1(x))
    for layer in self.encoder_hidden:
      hidden = self.relu(layer(hidden))
    # The ReLU keeps every latent coordinate non-negative.
    latent = self.relu(self.fc3(hidden))
    hidden = self.relu(self.fc4(latent))
    for layer in self.decoder_hidden:
      hidden = self.relu(layer(hidden))
    # The reconstruction is left linear (no output activation).
    output = self.out(hidden)
    return latent, output

def train_model(model, optimizer, epochs, x, y, criterion, verbose = False):
  """Train ``model`` with full-batch gradient steps.

  Works for models that return a single tensor as well as for models that
  return a ``(latent, output)`` pair; in the latter case the loss is computed
  on the second element.

  Args:
    model: module called as ``model(x_train)``.
    optimizer: optimizer holding the model's parameters.
    epochs: number of gradient steps; every step uses all rows of ``x``.
    x: inputs (array-like or tensor), converted to a float tensor.
    y: targets (array-like or tensor), converted to a float tensor.
    criterion: loss called as ``criterion(prediction, target)``.
    verbose: when True, each log line is followed by predictions and targets
      (side by side, rounded to one decimal) for a few sample rows.

  The loss is printed roughly 20 times over the run.
  """
  # as_tensor accepts arrays, lists and existing tensors alike.
  x_train = torch.as_tensor(x, dtype=torch.float)
  y_train = torch.as_tensor(y, dtype=torch.float)
  # At least 1: with fewer than 20 epochs, `epochs // 20` is 0 and the modulo
  # below would raise ZeroDivisionError.
  log_every = max(1, epochs // 20)
  # Rows shown in verbose mode; indices beyond the data are skipped.
  sample_rows = torch.tensor(
    [row for row in (30, 60, 90, 120) if row < y_train.shape[0]], dtype=torch.long
  )
  model.train()
  for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(x_train)
    # (latent, output) pair from an auto-encoder, or a plain prediction tensor.
    output = y_pred[1] if isinstance(y_pred, (tuple, list)) else y_pred
    # Compute Loss
    loss = criterion(output, y_train)

    if epoch % log_every == 0:
      print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
      if verbose:
        shown_pred = output.detach()[sample_rows].numpy()
        shown_true = y_train.detach()[sample_rows].numpy()
        print(np.around(np.hstack((shown_pred, shown_true)),1))
    # Backward pass
    loss.backward()
    optimizer.step()

def train_model_triplet(model, optimizer, epochs, x, y, criterion, t_lambda = 0.001, r_lambda = 1, verbose = False):
  """Train an auto-encoder with a reconstruction loss plus latent-space terms.

  The total loss is
    reconstruction
    + t_lambda * (negated criterion between the latents of each pair of blocks)
    + r_lambda * (criterion between each block's latents and random normal targets).
  The negated terms push the latent codes of different blocks apart.

  The rows of ``x`` are split into three consecutive blocks of equal size
  (``len(x) // 3`` rows each; 50 for the 150-row Iris data).
  NOTE(review): this only separates classes if the rows are sorted by class
  with equally many rows per class — confirm for any new dataset.

  Args:
    model: module returning ``(latent, output)`` when called on the inputs.
    optimizer: optimizer holding the model's parameters.
    epochs: number of gradient steps; every step uses all rows of ``x``.
    x: inputs (array-like or tensor), converted to a float tensor.
    y: reconstruction targets, converted to a float tensor.
    criterion: loss called as ``criterion(prediction, target)``.
    t_lambda: weight of the three separation terms.
    r_lambda: weight of the three regularisation terms.
    verbose: when True, each log line is followed by reconstruction, target
      and latent values (rounded to one decimal) for a few sample rows.

  Raises:
    ValueError: if ``x`` has fewer than three rows.
  """
  # as_tensor accepts arrays, lists and existing tensors alike.
  x_train = torch.as_tensor(x, dtype=torch.float)
  y_train = torch.as_tensor(y, dtype=torch.float)
  # At least 1: with fewer than 20 epochs, `epochs // 20` is 0 and the modulo
  # below would raise ZeroDivisionError.
  log_every = max(1, epochs // 20)
  # Size of each of the three consecutive blocks, derived from the data.
  class_size = x_train.shape[0] // 3
  if class_size == 0:
    raise ValueError("train_model_triplet needs at least three rows in x")
  # Rows shown in verbose mode; indices beyond the data are skipped.
  sample_rows = torch.tensor(
    [row for row in (30, 60, 90, 120) if row < y_train.shape[0]], dtype=torch.long
  )
  model.train()
  for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(x_train)
    # Compute Loss
    loss = criterion(y_pred[1], y_train)

    # Draw in-block offsets (with replacement) and pick the latent codes at
    # those offsets from each of the three blocks.
    random_sequence = torch.randint(0, class_size, size=(class_size,))

    y_s0 = y_pred[0][random_sequence]
    y_s1 = y_pred[0][class_size + random_sequence]
    y_s2 = y_pred[0][2 * class_size + random_sequence]

    # Negated: minimising these maximises the distance between blocks.
    t1_loss = -criterion(y_s1, y_s0)
    t2_loss = -criterion(y_s2, y_s0)
    t21_loss = -criterion(y_s2, y_s1)

    # Pull the latents towards fresh samples from a normal distribution with
    # mean 1 and standard deviation 2.
    #alternative would be l1/l2 regularization of weights!
    r0_loss = criterion(y_s0, torch.normal(1, 2, y_s0.shape))
    r1_loss = criterion(y_s1, torch.normal(1, 2, y_s0.shape))
    r2_loss = criterion(y_s2, torch.normal(1, 2, y_s0.shape))

    full_loss = loss + t_lambda * (t1_loss + t2_loss + t21_loss) + r_lambda * (r0_loss + r1_loss + r2_loss)

    if epoch % log_every == 0:
      print('Epoch {}: l: {} t1l: {} t2l: {} t21l: {}'.format(epoch, loss.item(), t1_loss.item(), t2_loss.item(), t21_loss.item()))
      if verbose:
        shown_recon = y_pred[1].detach()[sample_rows].numpy()
        shown_true = y_train.detach()[sample_rows].numpy()
        shown_latent = y_pred[0].detach()[sample_rows].numpy()
        print(np.around(np.hstack((shown_recon, shown_true, shown_latent)),1))
    # Backward pass
    full_loss.backward()
    optimizer.step()

In [None]:
# Train a plain auto-encoder to reconstruct the four measurements through a
# 2-D bottleneck, then scatter the latent codes coloured by the labels in `y`.
torch.manual_seed(0)  # reproducible weight initialisation, hence reproducible results

model = AutoEncoder(4, 50, 2, 14)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0005)
# Input and target are both `x`: the network learns to reproduce its input.
train_model(model, optimizer, 12500, x, x, torch.nn.MSELoss(), verbose=True)

# Inference only: no autograd graph is needed.
model.eval()
with torch.no_grad():
  latent, x_recon = model(torch.as_tensor(x, dtype=torch.float))
latent, x_recon = latent.numpy(), x_recon.numpy()

plt.figure(1, figsize=(10, 8))

plt.scatter(latent[:, 0], latent[:, 1], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Latent variable 1")
plt.ylabel("Latent variable 2")

plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Train an auto-encoder with the additional latent-space loss terms of
# train_model_triplet, then scatter the latent codes coloured by the labels in `y`.
# Seeding fixes the weight initialisation as well as the random draws made
# inside the training loop.
torch.manual_seed(0)

model = AutoEncoder(4, 50, 2, 7)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001, weight_decay=0.01)
# Input and target are both `x`: the network learns to reproduce its input.
train_model_triplet(model, optimizer, 10000, x, x, torch.nn.MSELoss(), verbose=True)

# Inference only: no autograd graph is needed.
model.eval()
with torch.no_grad():
  latent, x_recon = model(torch.as_tensor(x, dtype=torch.float))
latent, x_recon = latent.numpy(), x_recon.numpy()

plt.figure(1, figsize=(10, 8))

plt.scatter(latent[:, 0], latent[:, 1], c=y, edgecolors="k", cmap=plt.cm.Paired, s=70)
plt.xlabel("Latent variable 1")
plt.ylabel("Latent variable 2")

plt.xticks(())
plt.yticks(())

plt.show()