# Código TP4: Métodos Supervisados Avanzados

In [None]:
import numpy as np
import pandas as pd


from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
from sklearn.naive_bayes import MultinomialNB

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# https://xgboost.readthedocs.io/en/stable/python/python_api.html
from xgboost import XGBClassifier  # Tiene interfaz estilo sklearn

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.ensemble import RandomForestClassifier

# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
from sklearn.svm import SVC, SVR  # Clasificación / Regresión


## Carga de Datasets

News Groups Dataset: https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset

In [None]:
# Official 20-newsgroups training split, with the message headers removed.
X, y = fetch_20newsgroups(
    subset="train",
    remove=("headers",),
    return_X_y=True,
)

# Hold out 25% of the training documents for validation (seeded split).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Official test split, loaded with the same header removal.
X_test, y_test = fetch_20newsgroups(
    subset="test",
    remove=("headers",),
    return_X_y=True,
)

# Show the first training document together with its label.
print('Sample X: ', X_train[0])
print('Sample y: ', y_train[0])

Sample X:  |> In article <1993Apr5.220457.6800@spdc.ti.com> dwhite@epcot.spdc.ti.com (Dan White) writes:
|> 
|> >However, haven't we already lost our right to bear arms?
|> 
|> >	It seems that in most states, like Texas, a citizen may own a
|> >gun and carry while at his home or business.  But a citizen is severely
|> >restricted from bearing outside these areas.  Here in Texas you cannot
|> >carry in your car except when "traveling" which is usually defined as
|> >"traveling across a county line."  How did this come about?  Are there
|> >any court rulings on the legality of restricting the carrying of a
|> >weapon outside the home?  
|> 

In Texas, it is legal to carry handguns while "traveling", and also to and from
sporting activities.           ^^^^^^^^

Chapter 46 of the Texas State Penal Code does NOT restrict long guns.
Therefore, it is legal to carry and transport long guns any place in Texas.

Regards,
Clay


Sample y:  16


In [None]:
# All files are comma-separated, which is already read_csv's default separator.

# Spirals: train / test
esp_train = pd.read_csv("esp_train.csv")
print(esp_train)
esp_test = pd.read_csv("esp_test.csv")
print(esp_test)

# Diagonals: train / test
diag_train = pd.read_csv("diag_train.csv")
print(diag_train)
diag_test = pd.read_csv("diag_test.csv")
print(diag_test)

# RRL (single file)
RRL = pd.read_csv("RRL.csv")
print(RRL)

# Lampone (single file)
Lampone = pd.read_csv("Lampone.csv")
print(Lampone)

            x         y  class
0    0.784605 -0.443353      0
1   -0.407629 -0.668835      0
2    0.417495 -0.720688      0
3    0.328802  0.253179      0
4   -0.051020 -0.680422      0
..        ...       ...    ...
195 -0.537658  0.625798      1
196  0.490364 -0.198179      1
197  0.533439 -0.309978      1
198  0.566625 -0.369130      0
199  0.807937 -0.524810      1

[200 rows x 3 columns]
             x         y  class
0    -0.637005  0.284939      0
1     0.133191 -0.835979      0
2     0.658488  0.528749      0
3    -0.392254  0.236327      0
4     0.121966  0.425345      0
...        ...       ...    ...
4995  0.023548 -0.954021      1
4996  0.142607 -0.434148      1
4997  0.104802  0.334145      1
4998  0.450283  0.629333      1
4999 -0.673389 -0.645916      1

[5000 rows x 3 columns]
           V1        V2  class
0   -0.631497 -0.738363      0
1   -0.430678  0.134716      0
2   -0.227673 -2.530069      0
3   -1.532579  0.066135      0
4   -0.675265 -0.358977      0
..       

## Ejercicio 1: Naïve-Bayes

Vectorizamos los textos contando cuántas veces aparece, en cada uno, cada palabra de un diccionario dado.

In [None]:
# Vocabulary size: CountVectorizer keeps only this many terms (max_features).
largo_diccionario = 4000

# Bag-of-words term counts, ignoring English stop words.
vec = CountVectorizer(stop_words='english', max_features=largo_diccionario)

# The vocabulary is learned from the training documents only; the validation
# and test documents are then encoded with that same fitted vocabulary.
# .toarray() densifies the sparse count matrices returned by the vectorizer.
Xvec_train = vec.fit_transform(X_train).toarray()
# X_val was split off earlier but never vectorized; encode it as well so the
# validation set can be used for model selection.
Xvec_val = vec.transform(X_val).toarray()
Xvec_test = vec.transform(X_test).toarray()

# First training document as a count vector, plus just its non-zero counts.
print('Sample X: ', Xvec_train[0], ' | non-zero: ', Xvec_train[0][Xvec_train[0] != 0])
print('Sample y: ', y_train[0])

Sample X:  [0 0 0 ... 0 0 0]  | non-zero:  [1 1 1 1 1 1 1 1 1 4 1 1 2 1 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 2 1 2 1 1 1 1
 1 5 2 1 1 1 1]
Sample y:  16


Luego clasificamos los textos con el clasificador discreto multinomial (pág. 180 del libro de Mitchell).

In [None]:
# Additive smoothing parameter handed to MultinomialNB as `alpha`.
nb_alfa = 1

# Multinomial Naive Bayes classifier (not fitted yet).
nb = MultinomialNB(alpha=nb_alfa)

# COMPLETAR

## Ejercicio 2: K-NN

In [None]:
# k-nearest-neighbours classifier, parameterised by the neighbour count.
knn_neighbors = 5
knn = KNeighborsClassifier(n_neighbors=knn_neighbors)

# Radius-based neighbours classifier, parameterised by the search radius.
rnn_radius = 1.0
rnn = RadiusNeighborsClassifier(radius=rnn_radius)

# COMPLETAR

## Ejercicio 3: Boosting

### Ejemplo de boosting con XGBoost sobre espirales

In [None]:
# Boosting hyperparameters.
xgb_nrounds = 500   # number of boosted trees (n_estimators)
xgb_eta = 0.1       # learning rate (similar to neural networks)
xgb_maxdepth = 3    # maximum tree depth == model complexity

# Spirals training frame as a plain array: the first two columns are the
# features and the third column is the class label.
# NOTE: this rebinds the global names X and y, which previously held the
# newsgroups data loaded in the dataset cell.
esp_train_np = esp_train.to_numpy()
X, y = esp_train_np[:, :2], esp_train_np[:, 2]

xgb = XGBClassifier(
    objective='binary:hinge',
    n_estimators=xgb_nrounds,
    learning_rate=xgb_eta,
    max_depth=xgb_maxdepth,
    subsample=1,
    colsample_bytree=1,
    verbosity=0,
)
xgb.fit(X, y, verbose=0)

# Score on the same data the model was fitted on (training score).
xgb.score(X, y)

1.0

In [None]:
# COMPLETAR

## Ejercicio 4: Random Forests

In [None]:
# Number of trees in the forest.
rf_estimators = 1000

# Number of features considered at each split ("mtry").
# The previous value 0 is not a valid max_features (an int must be >= 1, a
# float must lie in (0, 1]) and made rf.fit() fail. 'sqrt' is scikit-learn's
# default for classifiers; replace it with an int, a float, 'log2' or None
# when tuning.
rf_mtry = 'sqrt'

rf = RandomForestClassifier(
    n_estimators=rf_estimators,
    max_features=rf_mtry,
    bootstrap=True,            # each tree is grown on a bootstrap sample
    class_weight='balanced',
    oob_score=True             # requires bootstrap=True; sets rf.oob_score_ on fit
)

# 1 - rf.oob_score_ := final out-of-bag error (available after rf.fit)

# COMPLETAR

## Ejercicio 5: SVM

In [None]:
# SVM hyperparameters.
svm_kernel = 'rbf'   # kernels tried in this exercise: 'rbf' / 'poly'
svm_C = 1.0          # value passed as SVC's C
svm_gamma = 0.001    # value passed as SVC's gamma
svm_degree = 3       # polynomial degree; only used by 'poly', ignored otherwise

# Support-vector classifier (not fitted yet).
svm = SVC(kernel=svm_kernel, C=svm_C, gamma=svm_gamma, degree=svm_degree)

# COMPLETAR