# Understanding the power of feature engineering
---
This notebook is my practice run through a feature engineering tutorial from Towards Data Science (https://towardsdatascience.com/the-power-of-feature-engineering-b6f3bb7de39c). I follow the article's code and add comments that describe my understanding of each step.

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Build a synthetic promotion data set with one row per employee.
n_points = 2000  # number of employees / data points

# Ages evenly spaced over 18-60, rounded to two decimals, then shuffled in
# place so the row order carries no relationship to performance.
age = np.linspace(18, 60, n_points).round(2)
np.random.shuffle(age)

# Performance scores evenly spaced over -10..10, shuffled independently of age.
performance = np.linspace(-10, 10, n_points)
np.random.shuffle(performance)

# One standard-normal draw per employee.
noise = np.random.randn(n_points)

# Latent score: linear terms in age and performance, an age/performance
# ratio term, a constant offset, and scaled noise.
g = 100 * age + 200 * performance + 500 * age / performance - 10000 + 500 * noise

# Label is 1 where the latent score is non-negative, 0 otherwise.
y = [int(score >= 0) for score in g]

data = pd.DataFrame({'age': age, 'performance': performance, 'y': y})
data.head()

In [None]:
# Scatter the synthetic employees, one colour per promotion outcome.
plt.subplots(nrows=1, ncols=1, figsize=(15, 10))

# Split the rows by label first; promoted rows are drawn before the others
# so the legend entries below line up with the draw order.
promoted_rows = data[data.y == 1]
not_promoted_rows = data[data.y == 0]
plt.scatter('age', 'performance', c='#ff2121', data=promoted_rows)
plt.scatter('age', 'performance', c='#2176ff', data=not_promoted_rows)

# Titles and axis text.
plt.title('Scatter Plot of Promotion Data', size=20)
plt.xlabel('Age', size=20)
plt.ylabel('Performance Score', size=20)
plt.xticks(size=12)
plt.yticks(size=12)

# loc=2 places the legend in the upper-left corner.
plt.legend(['Promoted', 'Not Promoted'], loc=2, prop={'size': 20})

In [None]:
 from sklearn.model_selection import train_test_split
 import sklearn.metrics as metric
 import statsmodels.api as sm

x = data[['age', 'performance']]
s = sm.add_constant(x)
y = data['y']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 101)

model = sm.Logit(y_train, x_train).fit() #Fit logistic regression

predictions = np.around(model.predict(x_test))
accuracy = metric.accuracy_score(y_test, predictions)

In [None]:
n_points = 1000000 #Number of points used to shade the decision regions

# Cover the age x performance plane densely with shuffled points so the
# model's predicted class can be painted as a background.
age_db = np.linspace(18, 60, n_points)
np.random.shuffle(age_db)

performance_db = np.linspace(-10, 10, n_points)
np.random.shuffle(performance_db)

data_db = pd.DataFrame({'age': age_db, 'performance': performance_db})
data_db = sm.add_constant(data_db)

# Pass the model exactly the columns it was fitted on, in the same order,
# so the prediction matrix always matches the fitted parameters.
predictions = model.predict(data_db[model.params.index])

# Vectorised rounding of probabilities to 0/1 labels (same rule as the
# test-set predictions) instead of a Python loop over a million values.
data_db['y'] = np.around(predictions).astype(int)

fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 10))

# Background: predicted class of every grid point, in light colours.
plt.scatter('age', 'performance', c = '#ffbdbd', s = 1, data = data_db[data_db.y == 1])
plt.scatter('age', 'performance', c = '#b0c4ff', s = 1, data = data_db[data_db.y == 0])

# Foreground: the original employees coloured by their true label, so the
# misclassified points stand out against the shaded regions.
promoted_pts = plt.scatter('age', 'performance', c = '#ff2121', data = data[data.y == 1])
not_promoted_pts = plt.scatter('age', 'performance', c = '#2176ff', data = data[data.y == 0])

plt.title('Decision Boundary of Logistic Regression Model', size = 20)
plt.xlabel('Age', size = 20)
plt.ylabel('Performance Score', size = 20)
plt.yticks(size = 12)
plt.xticks(size = 12)

# Only the foreground points get legend entries; without explicit handles
# and labels every scatter would be listed under the y column name.
plt.legend([promoted_pts, not_promoted_pts], ['Promoted', 'Not Promoted'], loc = 2, prop = {'size': 20})