In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso,LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC,SVR
from tqdm.notebook import tqdm as tqdm
from sklearn import decomposition
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
import seaborn as sns
from sklearn.manifold import TSNE, LocallyLinearEmbedding, MDS
from pandas.plotting import scatter_matrix
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
plt.style.use("bmh")



# ML


This dataset contains information on all 802 Pokemon from all seven generations of Pokemon. The information in this dataset includes base stats, performance against other types, height, weight, classification, egg steps, experience points, abilities, etc. The information was scraped from http://serebii.net/

<b>The main goal is to estimate each Pokemon's speed.</b>

### Q.1 

Read the attached "pokemon.csv" file into a DataFrame.

### Q.2

Remove the "#" and "Name" columns.

### Q.3

Print the name and type of each feature.

### Q.4 

Handle the missing values in the dataset.

### Q.5

#### 5.1
Perform feature encoding on the categorical features. <br/>
For each categorical feature, create LabelEncoder and transform its values.

#### 5.2

Split the data into features and target (X, y).

### Q.6

#### 6.1

Use a random forest (RF) regressor and display the importance of each feature.

In [3]:
# RF = RandomForestRegressor(n_estimators=200, random_state=1)
# RF.fit(X,y)
# importances = pd.DataFrame({
#     "Feature": X.columns,
#     "Importance": RF.feature_importances_ * 100
# })
# importances = importances.sort_values("Importance", ascending=False)
# importances

#### 6.2

Remove features with importance less than 5%

### Q.7

#### 7.1

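Implement the function `search_for_hyperparameters(X, y, model_name, model, param_grid)`.

It takes as input:
* X - the dataset (feature matrix)
* y - the target variable
* model_name - a string naming the model
* model - the estimator object
* param_grid - the hyperparameter grid to search

and performs a grid search using `GridSearchCV`.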


In [4]:
def search_for_hyperparameters(X, y, model_name, model, param_grid):
    """Run a 5-fold grid search for ``model`` and report the results.

    The model is wrapped in a one-step ``Pipeline`` whose single step is
    named ``model_name``; the grid is then searched with ``GridSearchCV``.
    Because the estimator is a pipeline, the keys of ``param_grid`` are
    expected to follow scikit-learn's ``<model_name>__<parameter>`` naming
    convention.

    Parameters
    ----------
    X : feature data passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    model_name : str
        Name given to the pipeline step that holds ``model``.
    model : estimator object
        The estimator whose hyperparameters are searched.
    param_grid : dict or list of dicts
        Hyperparameter grid forwarded unchanged to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can inspect ``best_params_``,
        ``best_score_`` or reuse the refitted best estimator.
    """
    pipe = Pipeline([(model_name, model)])
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
    grid.fit(X, y)
    # grid.score(X, y) is evaluated on the very data the search was fitted
    # on, so it is a training score and must not be reported as a test score.
    print("Training set score: {:.2f}".format(grid.score(X, y)))
    # The mean cross-validated score of the best candidate is the honest
    # estimate of generalisation available inside this function.
    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Best parameters: {}".format(grid.best_params_))
    return grid

#### 7.2

Test your function on: AdaBoostRegressor, RandomForestRegressor, SVR, and KNeighborsRegressor (KNN).

* AdaBoostRegressor: learning_rate, in the range [$10^{-4},10^{-3},10^{-2},10^{-1},10^{0}$]
* RandomForestRegressor: n_estimators, max_depth
* SVR: C (same range as learning_rate), gamma (same range as learning_rate)
* KNeighborsRegressor: n_neighbors (K) in [3,5,7,11,13], metric


### Q.7

Use LocalOutlierFactor (LOF) for anomaly detection. Plot the ratio of outliers to inliers.

In [5]:
# clf = LocalOutlierFactor(n_neighbors=11)
# y_pred = clf.fit_predict(X)
# X['isOutlier'] = y_pred.astype(str)

### Q.8

Use t-SNE and plot the resulting low-dimensional embedding, colored by outlier label.

In [32]:
def plot_tSNE(data, labels, perplexity=(5, 20, 30, 50)):
    """Plot 2-D t-SNE embeddings of ``data`` for several perplexity values.

    One scatter plot is drawn per perplexity value, side by side in a single
    row, with points coloured by ``labels``.

    Parameters
    ----------
    data : array-like
        Samples passed to ``TSNE.fit_transform``.
    labels : array-like
        One label per sample, used as the hue of the scatter plot. Labels may
        be numeric or strings; they are kept separate from the coordinates so
        that string labels do not turn the embedding into strings.
    perplexity : iterable of numbers, default (5, 20, 30, 50)
        Perplexity values to try; the figure gets one panel per value.

    Returns
    -------
    list
        The ``kl_divergence_`` reported by each fitted ``TSNE``, in the same
        order as ``perplexity``.

    Raises
    ------
    ValueError
        If ``perplexity`` is empty.
    """
    perplexities = list(perplexity)
    if not perplexities:
        raise ValueError("perplexity must contain at least one value")

    # Use the raw label values so a pandas Series with a non-default index is
    # assigned positionally rather than aligned on its index.
    label_values = np.asarray(labels)

    kl_divergences = []
    n_panels = len(perplexities)
    # squeeze=False keeps `axes` two-dimensional even for a single panel, so
    # the same indexing works for any number of perplexity values.
    fig, axes = plt.subplots(1, n_panels, figsize=(10 * n_panels, 8), squeeze=False)
    for ax, p in zip(axes[0], perplexities):
        tsne = TSNE(n_components=2, random_state=0, perplexity=p)
        embedding = tsne.fit_transform(data)
        kl_divergences.append(tsne.kl_divergence_)
        # Build the frame from the numeric embedding first and add the label
        # column afterwards; stacking them in one array would coerce the
        # coordinates to the label dtype.
        df = pd.DataFrame(data=embedding, columns=["dim1", "dim2"])
        df["Label"] = label_values
        scatter = sns.scatterplot(
            x=df["dim1"],
            y=df["dim2"],
            hue=df["Label"],
            ax=ax,
            s=100,
            palette="colorblind",
        )
        scatter.set_title("t-SNE with perplexity=" + str(p))
    plt.show()
    return kl_divergences

### Q.9

Build a recommendation mechanism that determines whether a given observation is an outlier.

In [79]:
def predict_outlier_pokemon(model=None, *features):
    """Decide whether a single observation is an outlier according to ``model``.

    The model is passed in explicitly instead of being read from a notebook
    global, so the function does not depend on which cells were run before it.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted model; ``model.predict`` is called with a single row.
        NOTE(review): ``LocalOutlierFactor`` only exposes ``predict`` when it
        is created with ``novelty=True`` - confirm which model is used here.
    *features : numbers
        Feature values of the observation, in the column order the model was
        fitted with (assumed - verify against the training data).

    Returns
    -------
    bool
        ``True`` when the model labels the observation as an outlier.

    Raises
    ------
    ValueError
        If no model or no feature values are supplied.
    """
    if model is None:
        raise ValueError("a fitted model must be provided")
    if not features:
        raise ValueError("at least one feature value must be provided")

    to_predict = list(features)
    predicted_value = model.predict([to_predict])[0]
    # scikit-learn outlier detectors label outliers as -1 and inliers as 1.
    # The string form is accepted as well, in case the model was trained on
    # labels stored as strings - TODO confirm the label encoding in use.
    return bool(predicted_value in (-1, "-1"))

In [6]:
# predict_outlier_pokemon(model, 500, 100, 50, 70, 70)

In [7]:
# predict_outlier_pokemon(model, 200, 65, 40, 15,80)