# Tutorial Youtube

https://www.youtube.com/watch?v=84gqSbLcBFE&list=PLOU2XLYxmsIIuiBfYad6rFYQU_jL2ryal&index=4

## Video 4: Pipelines 

In [32]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import random

In [2]:
iris = load_iris()

In [3]:
# Create a data frame object so I can work more easily
df = pd.DataFrame({
    iris.feature_names[0]: iris.data[:,0],
    iris.feature_names[1]: iris.data[:,1],
    iris.feature_names[2]: iris.data[:,2],
    iris.feature_names[3]: iris.data[:,3],
    'label': iris.target
})
df.head()
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
label                150 non-null int32
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
dtypes: float64(4), int32(1)
memory usage: 5.3 KB


In [4]:
# Now create data and labels 
X = iris.data # data
y = iris.target # labels 

In [14]:
# Partition our dataset in training / testing 
# from sklearn.cross_validation import train_test_split 
# This is the old version, new sklearn version puts the function in another module, i.e. 
# sklearn.model_selection
from sklearn.model_selection import train_test_split
# Use the latter instead of the code above

In [15]:
# Use 20% of the data again for testing  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [16]:
# Now try 2 different classifiers

# DT
from sklearn.tree import DecisionTreeClassifier
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [17]:
# Now evaluate how good the classification result is

# We compare the predicted labels to the true labels 
from sklearn.metrics import accuracy_score

predictions_DT = clf_DT.predict(X_test)
accuracy_score(y_test, predictions_DT)

1.0

In [18]:
# Now we try another classifier 
from sklearn.neighbors import KNeighborsClassifier

In [19]:
clf_KN = KNeighborsClassifier()
clf_KN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [20]:
predictions_KN = clf_KN.predict(X_test)
accuracy_score(y_test, predictions_KN)

1.0

### Visualise the classification later for my dissertation

Note, that the method only work when we look at 2 attributes only. We cannot plot it in higher dimensions  

# This is purely for visualisation, we dont really need that now but maybe for dissertation 
import mlxtend.plotting as mlx
mlx.plot_decision_regions(X_train, y_train, clf=clf_DT)
plt.title('Visualise classifier in scikit-learn')
plt.show()

# Uninstall the package later:
# https://conda.io/docs/commands/conda-uninstall.html 

# Video 5: Writing our first classifier
Now implement our own classifier
scrappy KNN he calls it 

Each classifier needs 2 things really: a fit-method and a predict-method
We just randomly assign a label to each instance, so it should be around 33% accuracy, but it will change every single time as the numbers are assigned randomly 

In [27]:
class ScrappyKNN():
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train 
    
    def predict(self, X_test):
        predictions = []
        
        for row in X_test:
            label = random.choice(self.y_train)
            predictions.append(label)
            
        
        return predictions 
    

In [38]:
my_Classifier = ScrappyKNN()
my_Classifier.fit(X_train, y_train)

In [39]:
predictions_myCL = my_Classifier.predict(X_test)
accuracy_score(y_test, predictions_myCL)

0.26666666666666666