In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# the following dataset emulates the joint information from a companies HR file and medical exam
# our goal is to try to approximate salaries from this information
# we have chosen to use a KNN regression
salary = pd.read_csv('salaries.csv')
salary.head()

Unnamed: 0,Salary,Experience,Gender,Height,Weight,Flexibility,Strenght,BMI,BirthC,Daltonic,Estrogen
0,32064.17357,Senior,Female,160.18908,60.110733,230.467496,227.262099,16.748028,1,,74.619024
1,17160.86833,Junior,Female,153.100363,61.932449,260.080909,244.758004,17.10178,1,,74.581776
2,30940.2248,Senior,Female,166.949128,58.109921,237.047355,236.998125,17.085108,1,,85.510909
3,21036.86527,Junior,Male,173.952647,75.975594,151.578682,287.952815,21.061525,0,Dychromassy,22.091243
4,18855.00061,Junior,Male,180.282763,75.418624,152.50266,283.185533,21.134343,0,Dychromassy,21.749843


In [4]:
salary.shape

(200, 11)

# Formal transformations

In [7]:
salary['Gender'].value_counts()

Gender
Male      101
Female     99
Name: count, dtype: int64

In [8]:
salary['Daltonic'].value_counts(dropna=False)

Daltonic
NaN              99
Dychromassy      53
Monochromassy    48
Name: count, dtype: int64

## One hot encoding

In [17]:
# one hot encoding
# you can also use sklearn.preprocessing.OneHotEncoder
#since we only accept numeric values, we have to turn the categorical variables into numerics
pd.get_dummies(salary['Daltonic'],prefix='Dalt',drop_first=False).astype(int).head()

Unnamed: 0,Dalt_Dychromassy,Dalt_Monochromassy
0,0,0
1,0,0
2,0,0
3,1,0
4,1,0


In [18]:
salary_transformed = pd.merge(left = salary,
                              right = pd.get_dummies(salary['Daltonic'],prefix='Daltonic',drop_first=False).astype(int),
                              left_index=True,
                              right_index=True)
salary_transformed

Unnamed: 0,Salary,Experience,Gender,Height,Weight,Flexibility,Strenght,BMI,BirthC,Daltonic,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy
0,32064.17357,Senior,Female,160.189080,60.110733,230.467496,227.262099,16.748028,1,,74.619024,0,0
1,17160.86833,Junior,Female,153.100363,61.932449,260.080909,244.758004,17.101780,1,,74.581776,0,0
2,30940.22480,Senior,Female,166.949128,58.109921,237.047355,236.998125,17.085108,1,,85.510909,0,0
3,21036.86527,Junior,Male,173.952647,75.975594,151.578682,287.952815,21.061525,0,Dychromassy,22.091243,1,0
4,18855.00061,Junior,Male,180.282763,75.418624,152.502660,283.185533,21.134343,0,Dychromassy,21.749843,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,29555.74937,Senior,Male,168.149552,73.913408,158.365212,282.779401,21.102963,0,Dychromassy,32.487848,1,0
196,28538.14361,Senior,Male,181.221799,74.308332,164.034265,293.964437,21.431592,0,Monochromassy,34.760106,0,1
197,20806.61816,Junior,Female,155.145032,60.678250,256.835440,237.695512,17.206375,1,,89.997758,0,0
198,32243.18934,Senior,Female,159.881301,56.090393,241.014490,241.293486,17.464618,1,,76.037265,0,0


In [19]:
salary_transformed.drop(columns=['Daltonic'], inplace = True)

## Binning

In [10]:
# Binning: To turn numeric features into categorical ones. Will do the division automatically
# #In this case we're not going to use categorical features, but for the record:
pd.cut(salary['Height'],5,labels=['very short','short','average','tall','very tall'])

0           short
1      very short
2         average
3            tall
4       very tall
          ...    
195       average
196     very tall
197    very short
198         short
199          tall
Name: Height, Length: 200, dtype: category
Categories (5, object): ['very short' < 'short' < 'average' < 'tall' < 'very tall']

In [21]:
salary_transformed['Height_classes'] = pd.cut(salary['Height'],5,labels=['very short','short','average','tall','very tall'])
salary_transformed.head()

Unnamed: 0,Salary,Experience,Gender,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label,Height_classes
0,32064.17357,Senior,Female,160.18908,60.110733,230.467496,227.262099,16.748028,1,74.619024,0,0,1,1,short
1,17160.86833,Junior,Female,153.100363,61.932449,260.080909,244.758004,17.10178,1,74.581776,0,0,0,1,very short
2,30940.2248,Senior,Female,166.949128,58.109921,237.047355,236.998125,17.085108,1,85.510909,0,0,1,1,average
3,21036.86527,Junior,Male,173.952647,75.975594,151.578682,287.952815,21.061525,0,22.091243,1,0,0,0,tall
4,18855.00061,Junior,Male,180.282763,75.418624,152.50266,283.185533,21.134343,0,21.749843,1,0,0,0,very tall


In [20]:
# label encoding - for when the categories need to be numbers but the values of those numbers do not matter
# you can also use sklearn.preprocessing.LabelEncoder
salary_transformed['Experience_label'] = salary_transformed['Experience'].replace({'Junior':0,'Senior':1})
salary_transformed['Gender_label'] = salary_transformed['Gender'].replace({'Male':0,'Female':1})
salary_transformed.head()

  salary_transformed['Experience_label'] = salary_transformed['Experience'].replace({'Junior':0,'Senior':1})
  salary_transformed['Gender_label'] = salary_transformed['Gender'].replace({'Male':0,'Female':1})


Unnamed: 0,Salary,Experience,Gender,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label
0,32064.17357,Senior,Female,160.18908,60.110733,230.467496,227.262099,16.748028,1,74.619024,0,0,1,1
1,17160.86833,Junior,Female,153.100363,61.932449,260.080909,244.758004,17.10178,1,74.581776,0,0,0,1
2,30940.2248,Senior,Female,166.949128,58.109921,237.047355,236.998125,17.085108,1,85.510909,0,0,1,1
3,21036.86527,Junior,Male,173.952647,75.975594,151.578682,287.952815,21.061525,0,22.091243,1,0,0,0
4,18855.00061,Junior,Male,180.282763,75.418624,152.50266,283.185533,21.134343,0,21.749843,1,0,0,0


In [22]:
# we can now drop the non-numerics and keep only numeric columns
salary_transformed = salary_transformed.drop(columns=['Experience','Gender','Height_classes'])
salary_transformed.head()

Unnamed: 0,Salary,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label
0,32064.17357,160.18908,60.110733,230.467496,227.262099,16.748028,1,74.619024,0,0,1,1
1,17160.86833,153.100363,61.932449,260.080909,244.758004,17.10178,1,74.581776,0,0,0,1
2,30940.2248,166.949128,58.109921,237.047355,236.998125,17.085108,1,85.510909,0,0,1,1
3,21036.86527,173.952647,75.975594,151.578682,287.952815,21.061525,0,22.091243,1,0,0,0
4,18855.00061,180.282763,75.418624,152.50266,283.185533,21.134343,0,21.749843,1,0,0,0


In [23]:
# now we will create the usual train-test splits

from sklearn.model_selection import train_test_split

X_train, X_test,y_train, y_test = train_test_split(salary_transformed.drop(columns = ['Salary']),salary_transformed['Salary'])

print(X_train.shape)
print(X_test.shape)
X_train.head()
#index data are not changing

(150, 11)
(50, 11)


Unnamed: 0,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label
183,172.430526,75.007343,155.430179,265.673852,20.898089,0,28.365903,1,0,0,0
54,172.283516,72.011378,157.079095,292.244297,20.985975,0,24.48408,1,0,1,0
48,160.85858,56.09761,245.863259,233.33897,16.660302,1,80.927002,0,0,0,1
139,167.270538,75.760002,154.243855,270.35952,20.834132,0,26.87087,1,0,1,0
134,164.776224,61.620543,239.173139,252.470971,16.92623,1,83.865309,0,0,0,1


In [None]:
X_test.head()

In [24]:
# let's jump right to a model
# notice the KNN regressor version
from sklearn.neighbors import KNeighborsRegressor
#regressor as it is a continuous variable
# measuring MSE score <- mean square error (real salary - predicted salary)**2


# create knn, don't forget Hyperparameter
knn = KNeighborsRegressor(n_neighbors=3)

In [25]:
# training the model on raw data
knn.fit(X_train, y_train)

In [32]:

# testing algorithm on raw test
#prediction of the model, what it will say using the x test
y_pred = knn.predict(X_test)
y_pred

array([28001.55531   , 30941.30447333, 23753.25085   , 29846.65816667,
       22360.22401333, 31607.84009   , 28737.27489   , 20822.84667   ,
       27035.85113667, 20229.86229667, 20635.74984   , 26907.05745667,
       27277.95440333, 30298.82825   , 20229.86229667, 28621.85794   ,
       28107.22634   , 23072.04375   , 30647.10643   , 23539.87103   ,
       25973.22583667, 23528.70051667, 26516.78837667, 19054.59337667,
       27168.34856   , 28001.55531   , 26310.15816667, 29964.68807667,
       22826.77272667, 22999.16740667, 22512.4716    , 26525.28267667,
       28820.45395   , 23983.64520333, 30552.26932   , 29408.37994667,
       27183.13575333, 28297.34504333, 29079.46016333, 18620.38654667,
       22782.28138   , 18590.25663   , 26907.05745667, 20229.86229667,
       26025.14619667, 16941.39060333, 22187.71334667, 29685.83456   ,
       30406.40908333, 30941.30447333])

In [27]:
np.array(y_test)

array([25851.75653, 33580.6887 , 18708.36302, 22615.35436, 20652.36898,
       30262.18034, 28999.9137 , 23235.3408 , 20527.40791, 17543.57212,
       20456.25856, 30734.33947, 31231.27885, 30437.04885, 19796.22265,
       21036.86527, 19782.57384, 33864.47268, 19480.3042 , 19662.41165,
       21046.63718, 19572.24367, 30324.69555, 19069.11798, 20898.45745,
       18283.91695, 19829.44956, 20626.32729, 32685.80772, 25917.96406,
       17591.55066, 14593.73237, 20218.31183, 31488.21095, 20189.43041,
       20908.84842, 31107.73466, 30316.12097, 14561.4704 , 25644.19311,
       16332.66201, 25514.38226, 34017.02702, 19075.60403, 26247.70358,
       28538.14361, 21096.74094, 21523.1314 , 29555.74937, 29912.08603])

In [33]:
#Let's compare y test and y train
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test,y_pred))

6461.9548855012445

On average, each salary predictions is missing by 6491

In [None]:
#we square it to amplify the result and avoid negative

In [34]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)
#linear regression giving the same result


-0.3667701644781518

#### For classification problems model.score calculate accuracy
#### For regression  problems model.score calculate r2_score (coefficient of determination)

We need to say that 1 point for gendre is way more important than 1 point in age or high.
We need to normalize the different columns to have have them at the same scale


# Normalization

In [36]:
#let's apply a normalization of the features since "flexibility" seems to count 200 times more than Daltonic_None
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# create normalization object from scikit learn package, and "fit" it to the features in hand
normalizer = MinMaxScaler()
# notice how we only use the **X_train** data to fit?
# we want to use only the training data to normalize (establishing maximum and minimum values) to avoid data "leakage" from the test dataset
# if we used data from the test dataset, the test results would be biased by having some info from the test set
normalizer = normalizer.fit(X_train)

In [37]:
normalizer

In [38]:
# now that we have our normalizer we use it for both training and testing (and in the future for unseen data as well!)
X_train_normalized = normalizer.transform(X_train)
X_train_normalized = pd.DataFrame(X_train_normalized,columns=X_train.columns)
X_train_normalized.head()

Unnamed: 0,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label
0,0.622535,0.824627,0.18582,0.522613,0.8221,0.0,0.127373,1.0,0.0,0.0,0.0
1,0.618913,0.701242,0.198243,0.81007,0.838383,0.0,0.073601,1.0,0.0,1.0,0.0
2,0.337422,0.045851,0.867146,0.172792,0.036952,1.0,0.855464,0.0,0.0,0.0,1.0
3,0.495402,0.855625,0.176882,0.573305,0.810251,0.0,0.106663,1.0,0.0,1.0,0.0
4,0.433946,0.273307,0.816742,0.379775,0.086221,1.0,0.896166,0.0,0.0,0.0,1.0


In [39]:
X_test_normalized = normalizer.transform(X_test)
X_test_normalized = pd.DataFrame(X_test_normalized,columns=X_test.columns)
X_test_normalized.head()

Unnamed: 0,Height,Weight,Flexibility,Strenght,BMI,BirthC,Estrogen,Daltonic_Dychromassy,Daltonic_Monochromassy,Experience_label,Gender_label
0,0.111088,0.195438,0.800207,-0.011121,0.156705,1.0,0.890928,0.0,0.0,1.0,1.0
1,0.599665,0.821836,0.26467,0.685262,0.853732,0.0,0.177061,1.0,0.0,1.0,0.0
2,0.761039,0.745971,0.147081,0.733369,0.90774,0.0,0.160033,1.0,0.0,0.0,0.0
3,0.384972,0.112653,0.745421,0.242568,-0.009926,1.0,0.98729,0.0,0.0,0.0,1.0
4,0.170528,0.165407,0.853426,0.25025,0.176632,1.0,0.903514,0.0,0.0,0.0,1.0


In [46]:
# let's see if this normalization improves our model
# creating model
knn_with_scaling = KNeighborsRegressor(n_neighbors=3)
# training the model on normalized data
knn_with_scaling.fit(X_train_normalized, y_train)
# testing algorithm on normalized test
y_pred_normalized = knn_with_scaling.predict(X_test_normalized)

np.sqrt(mean_squared_error(y_test,y_pred_normalized))
#much better!



2678.6427975510715

In [47]:
#metrics
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_normalized)

0.7651463828362596

In [48]:
knn_with_scaling.score(X_test_normalized, y_test)

0.7651463828362596

We are wrong on the salary by 2678 on avg
It's bettter than the 6000 previously
R2 passed from -0.14 to 0.76

# Correlation Tresholds

In [None]:
# let's see if our variables are too dependent
pyplot as plt

In [None]:
#A very common way to visualize the results discussed above is to create a correlation matrix.
# This is shown below.
# Only the lower triangular component of the matrix is shown due to the fact that
# the upper and lower (triangular) parts of the matrix are equal
import matplotlib.pyplot as plt
import seaborn as sn

corr=np.abs(X_train_normalized.corr())

#Set up mask for triangle representation
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(14, 14))
# Generate a custom diverging colormap
cmap = sn.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sn.heatmap(corr, mask=mask,  vmax=1,square=True, linewidths=.5, cbar_kws={"shrink": .5},annot = corr)

plt.show()

In [None]:
X_train_normalized.head()

In [None]:
# very clear that all variables are essentially the same! Except for experience!
# what is the effect of this?

#let's get rid of such variables
X_train_reduced = X_train_normalized[['Gender_label','Experience_label']]
X_test_reduced = X_test_normalized[['Gender_label','Experience_label']]

In [None]:
# creating our knn model
knn = KNeighborsRegressor(n_neighbors=3)
# training the model on reduced, normalized data
knn.fit(X_train_reduced, y_train)
# testing algorithm on reduced, normalized test
pred = knn.predict(X_test_reduced)

np.sqrt(mean_squared_error(y_test,pred))

# Feature manipulation for signal boosting

In [None]:
# we want to understand what drives loss of energy in our windfarms
energy = pd.read_csv('energy_loss.csv')
energy.head()

In [None]:
# let's try to predict it "raw"
X = energy[['Voltage','Rotation','Stability']]
y = energy['Loss']

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)

In [None]:
# however we know the optimal values of Voltage, Rotation and Stability from an engineer
energy_transformed = energy.copy()
energy_transformed['Voltage'] = np.square(energy_transformed['Voltage']-100)
energy_transformed['Rotation'] = np.square(energy_transformed['Rotation']-150)
energy_transformed['Stability'] = np.square(energy_transformed['Stability']-90)
X = energy_transformed[['Voltage','Rotation','Stability']]
y = energy_transformed['Loss']

In [None]:
X

In [None]:
# the model improves dramatically
import numpy as np
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)