# Using diferents machine learning models

# Classification

### Preparing the data

In [1]:
import pandas as pd
from sklearn.datasets import load_iris

url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(url)

# iris = load_iris()
# df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [2]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
print(iris.shape)
print(iris.dtypes)

(150, 5)
sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object


#### Note: In the class, the dataset was first transformed into an array, and then the attributes and classes were separated.
In this example I am using `.iloc`.

array = iris.values

X = array[:, 0:4]

y = array[:, 4]

In [4]:
# separating features and targets

# X = iris[["sepal_length", "sepal_width", "petal_length", "petal_width"]]
# y = iris["species"]

X = iris.iloc[:, 0:4]
y = iris.iloc[:, -1] # -1 is refaring to the last column, I could use iris.iloc[:, 4] as well
X.head(), y.head()

(   sepal_length  sepal_width  petal_length  petal_width
 0           5.1          3.5           1.4          0.2
 1           4.9          3.0           1.4          0.2
 2           4.7          3.2           1.3          0.2
 3           4.6          3.1           1.5          0.2
 4           5.0          3.6           1.4          0.2,
 0    setosa
 1    setosa
 2    setosa
 3    setosa
 4    setosa
 Name: species, dtype: object)

In [5]:
# separating training and testing

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Using KFold for cross validation

In [8]:
from sklearn.model_selection import KFold

num_particoes = 3
kfold = KFold(n_splits=num_particoes, shuffle=True, random_state=42)

### Listing models to find the better one

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import numpy as np

# random seed
np.random.seed(42)

# Listing the models, results and names

models = []
results = []
names = []

# adding models to the list

models.append(('KNN', KNeighborsClassifier()))
models.append(('DTree', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('LOGREG', LogisticRegression()))


In [12]:
models

[('KNN', KNeighborsClassifier()),
 ('DTree', DecisionTreeClassifier()),
 ('NB', GaussianNB()),
 ('SVM', SVC()),
 ('LOGREG', LogisticRegression())]

# Models Execution