# Feature Engineering

datasets:

https://www.openml.org/search?type=data&status=active

In [1]:
# General imports
%matplotlib inline
from preamble import *
plt.rcParams['savefig.dpi'] = 100 # This controls the size of your figures
from matplotlib import cm
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
plt.style.use("ggplot")

In [None]:
# Download the credit dataset (OpenML dataset id 31). Takes a while the first time.
# NOTE(review): `oml` is not imported explicitly in this notebook; presumably it
# is provided by `from preamble import *` -- confirm.
credit = oml.datasets.get_dataset(31)
# Take the target attribute from the dataset that was just fetched. The original
# cell referenced an undefined `no2` object here, which raises NameError on a
# fresh kernel.
X, y, _, _ = credit.get_data(target=credit.default_target_attribute)
# Iterating X yields its column labels (assumes X is a pandas DataFrame -- confirm).
attribute_names = list(X)

In [None]:
# Display the feature data returned by credit.get_data.
X

In [None]:
# Display the target values returned by credit.get_data.
y

In [None]:
# Restrict the feature table to a hand-picked subset of columns.
selected_columns = [
    'credit_history',
    'purpose',
    'credit_amount',
    'savings_status',
    'existing_credits',
    'job',
    'own_telephone',
    'foreign_worker',
]
df = X[selected_columns]

## Quick Exploration


In [None]:
# Preview the first rows of the selected feature columns.
df.head()

## Exercise 1 - Support Distance Algorithms


### Exercise 1.1

Convert all categorical features to a one-hot encoding.

### Exercise 1.2

Normalize all features to the closed range zero to one.

### Exercise 1.3

1. Run K-Means and use the elbow method to estimate the number of clusters. 
2. Create the clusters and plot their analysis.

![K_Means_Elbow_Method](https://raw.githubusercontent.com/satishgunjal/images/master/K_Means_Elbow_Method.png)


In [None]:
def get_kmeans_accuracy(data, top_k, random_state=None):
    """Fit K-Means for a range of cluster counts and collect quality metrics.

    One ``KMeans`` model (``init='k-means++'``) is fitted on ``data`` for every
    cluster count in ``range(2, top_k)``; note that ``top_k`` itself is
    excluded. For each fit the model inertia and the silhouette score of the
    resulting labels are recorded.

    Parameters
    ----------
    data : array-like
        Samples to cluster; passed unchanged to ``KMeans.fit`` and
        ``silhouette_score``.
    top_k : int
        Exclusive upper bound on the number of clusters to try. Values of 2 or
        less produce an empty result.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans`` to seed centroid initialisation. Pass an int to
        make the returned metrics reproducible between runs; the default keeps
        the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with columns ``"K"`` (number of clusters),
        ``"SSE"`` (``inertia_`` of the fitted model) and ``"SIL"`` (silhouette
        score).
    """
    sum_squared = []
    silhouette = []
    K = range(2, top_k)
    for n_clusters in K:
        kmeans = KMeans(
            n_clusters=n_clusters,
            init='k-means++',
            random_state=random_state,
        )
        kmeans.fit(data)
        sum_squared.append(kmeans.inertia_)
        silhouette.append(silhouette_score(data, kmeans.labels_))
    return pd.DataFrame({
        "K": K,
        "SSE": sum_squared,
        "SIL": silhouette,
    })

## Exercise 2 - Support Non-Metric Algorithms


### Exercise 2.1

Encode the variables using LabelEncoder, hold out 25% of the data as a test set, and evaluate the results (present the confusion matrix).


In [None]:
# Label-encode every non-numeric feature column, as the exercise text asks, so
# that estimators requiring numeric input can be fitted on the split.
# NOTE(review): assumes X is a pandas DataFrame -- confirm.
X_encoded = X.copy()
for column in X_encoded.select_dtypes(exclude="number").columns:
    # astype(str) gives LabelEncoder a uniform, sortable type per column.
    X_encoded[column] = LabelEncoder().fit_transform(X_encoded[column].astype(str))

# Hold out 25% of the rows for testing; the fixed seed keeps the split (and
# therefore every downstream result) identical across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.25, random_state=42
)

### Exercise 2.2

Use the target variable to train a Decision Tree classifier.

In [None]:
# Seed the tree: scikit-learn randomly permutes the features at each split, so
# an unseeded tree can differ between runs even on identical training data.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

### Exercise 2.3

Write two conclusions drawn from the confusion matrix.

In [None]:
# Predict on the held-out split and tabulate true vs. predicted classes.
y_pred = tree.predict(x_test)
class_labels = tree.classes_
# Passing `labels` pins the row/column order to the classifier's class order;
# rows are the true classes and columns the predicted classes. Wrapping the
# array in a DataFrame makes that layout visible in the rendered output.
pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=class_labels),
    index=pd.Index(class_labels, name="true"),
    columns=pd.Index(class_labels, name="predicted"),
)