In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# the following dataset emulates the joint information from a companies HR file and medical exam
# our goal is to try to approximate salaries from this information
# we have chosen to use a KNN regression
salary = pd.read_csv('salaries.csv')
salary.head()

In [None]:
salary.shape

# Formal transformations

In [None]:
salary['Daltonic'].value_counts()

In [None]:
# one hot encoding
# you can also use sklearn.preprocessing.OneHotEncoder
#since we only accept numeric values, we have to turn the categorical variables into numerics
pd.get_dummies(salary['Daltonic'],prefix='Dalt',drop_first=True).head()

In [None]:
# Build the one-hot columns first, then attach them to the original table row by row
# (both sides are matched on their index).
daltonic_dummies = pd.get_dummies(salary['Daltonic'], prefix='Daltonic', drop_first=True)
salary_transformed = pd.merge(salary, daltonic_dummies, left_index=True, right_index=True)
salary_transformed

In [None]:
# Binning: To turn numeric features into categorical ones. In this case we're not going to use categorical features, but for the record:
pd.cut(salary['Height'],5,labels=['very short','short','average','tall','very tall'])

In [None]:
salary_transformed['Height_classes'] = pd.cut(salary['Height'],5,labels=['very short','short','average','tall','very tall'])
salary_transformed.head()

In [None]:
# Label encoding: for when the categories need to be numbers but the values of those
# numbers do not matter (sklearn.preprocessing.LabelEncoder is an alternative).
# Series.map is used rather than Series.replace: replacing strings with ints relies on
# silent object-dtype downcasting, which pandas has deprecated. With map, the result is
# numeric, and any category missing from the mapping shows up as NaN.
salary_transformed['Experience_label'] = salary_transformed['Experience'].map({'Junior': 0, 'Senior': 1})
salary_transformed['Gender_label'] = salary_transformed['Gender'].map({'Male': 0, 'Female': 1})
salary_transformed.head()

In [None]:
# The text/categorical columns have numeric counterparts now (or are not needed),
# so they are removed before modelling.
non_numeric_columns = ['Experience', 'Gender', 'Daltonic', 'Height_classes']
salary_transformed = salary_transformed.drop(columns=non_numeric_columns)
salary_transformed.head()

In [None]:
# now we will create the usual train-test splits

from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(salary_transformed.drop(columns = ['Salary']),salary_transformed['Salary'])

print(X_train.shape)
print(X_test.shape)
X_train.head()

In [None]:
X_test.head()

In [None]:
# let's jump right to a model
# notice the KNN *regressor* version
from sklearn.neighbors import KNeighborsRegressor
# the models are scored with the mean squared error: mean of (real salary - predicted salary)**2
# (this import is required by every scoring cell below)
from sklearn.metrics import mean_squared_error

# create the knn model; don't forget the hyperparameter (number of neighbours)
knn = KNeighborsRegressor(n_neighbors=3)

In [None]:
# training the model on raw data
knn.fit(X_train, y_train)
# testing algorithm on raw test
pred = knn.predict(X_test)
pred

In [None]:
np.array(y_test)

In [None]:
np.sqrt(mean_squared_error(y_test,pred))

# Normalization

In [None]:
# Normalize the features, since "flexibility" seems to count 200 times more than Daltonic_None.
from sklearn.preprocessing import MinMaxScaler
# alternative scaler: from sklearn.preprocessing import StandardScaler

# Create the scikit-learn normalization object and fit it in one step.
# Only **X_train** is used for fitting: the minimum and maximum values must come from
# the training data alone, otherwise information from the test set would "leak" into
# the preprocessing and bias the test results.
normalizer = MinMaxScaler().fit(X_train)

In [None]:
# now that we have our normalizer we use it for both training and testing (and in the future for unseen data as well!)
X_train_normalized = normalizer.transform(X_train)
X_train_normalized = pd.DataFrame(X_train_normalized,columns=X_train.columns)
X_train_normalized.head()

In [None]:
X_test_normalized = normalizer.transform(X_test)
X_test_normalized = pd.DataFrame(X_test_normalized,columns=X_test.columns)
X_test_normalized.head()

In [None]:
# Does normalization improve the model?
# Same hyperparameter as before, trained on the normalized training data.
knn = KNeighborsRegressor(n_neighbors=3).fit(X_train_normalized, y_train)
# predict the normalized test set
pred = knn.predict(X_test_normalized)

mse = mean_squared_error(y_test, pred)
np.sqrt(mse)
#much better!

# Correlation Thresholds

In [None]:
# let's see if our variables are too dependent
# (plotting library used for the correlation figure)
from matplotlib import pyplot as plt

In [None]:
# A very common way to visualize how dependent the variables are is a correlation matrix.
# Only the lower triangular part of the matrix is shown, because the upper and lower
# (triangular) parts of a correlation matrix are equal.
import matplotlib.pyplot as plt
import seaborn as sn

# absolute values: only the strength of the correlation matters here, not its sign
corr = np.abs(X_train_normalized.corr())

# Mask for the triangle representation: True marks cells that are NOT drawn
# (upper triangle, diagonal included).
# The builtin `bool` is used because the `np.bool` alias no longer exists in current NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(14, 14))
# Draw the heatmap on that axes with the mask and a square aspect ratio,
# writing each correlation value inside its cell
sn.heatmap(corr, mask=mask, vmax=1, square=True, linewidths=.5,
           cbar_kws={"shrink": .5}, annot=corr, ax=ax)

plt.show()

In [None]:
X_train_normalized.head()

In [None]:
# very clear that all variables are essentially the same! Except for experience!
# what is the effect of this?

#let's get rid of such variables
X_train_reduced = X_train_normalized[['Gender_label','Experience_label']]
X_test_reduced = X_test_normalized[['Gender_label','Experience_label']]

In [None]:
# New KNN model (same hyperparameter), trained on the reduced, normalized training data.
knn = KNeighborsRegressor(n_neighbors=3).fit(X_train_reduced, y_train)
# predict the reduced, normalized test set
pred = knn.predict(X_test_reduced)

mse = mean_squared_error(y_test, pred)
np.sqrt(mse)

# Feature manipulation for signal boosting

In [None]:
# we want to understand what drives loss of energy in our windfarms
energy = pd.read_csv('energy_loss.csv')
energy.head()

In [None]:
# let's try to predict it "raw"
X = energy[['Voltage','Rotation','Stability']]
y = energy['Loss']

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the raw features and score it on the same data it was fitted on.
reg = LinearRegression().fit(X, y)
reg.score(X, y)

In [None]:
# however we know the optimal values of Voltage, Rotation and Stability from an engineer
energy_transformed = energy.copy()
energy_transformed['Voltage'] = np.square(energy_transformed['Voltage']-100)
energy_transformed['Rotation'] = np.square(energy_transformed['Rotation']-150)
energy_transformed['Stability'] = np.square(energy_transformed['Stability']-90)
X = energy_transformed[['Voltage','Rotation','Stability']]
y = energy_transformed['Loss']

In [None]:
X

In [None]:
# Refit on the transformed features: per the original analysis, the model improves dramatically.
import numpy as np
from sklearn.linear_model import LinearRegression

# same model and same scoring as for the raw features, for a like-for-like comparison
reg = LinearRegression().fit(X, y)
reg.score(X, y)