In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

import os
# Walk the Kaggle input directory and print the full path of every file in it.
# `file` keeps the last path printed; the CSV-loading cell reads from it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        file = os.path.join(folder, name)
        print(file)

In [None]:
# Load the dataset; `file` is the last path assigned by the directory walk.
df = pd.read_csv(file)

# Target variable: total pay = base pay + bonus
df['TotalPay'] = df['BasePay'] + df['Bonus']

# Dummy-code Gender and Dept. dtype=int pins the indicators to 0/1 integers;
# without it pandas >= 2.0 returns booleans, and `.values` on a frame that mixes
# int and bool columns yields an object array instead of a numeric one.
gender_dummies = pd.get_dummies(df['Gender'], dtype=int)
dept_dummies = pd.get_dummies(df['Dept'], dtype=int)  # not joined into df in this cell

# Attach the gender indicators, then drop the original text column and 'Male':
# all the information is in Female (1 = Female, 0 = Male).
df = df.join(gender_dummies).drop(columns=['Gender', 'Male'])

# Collapse Education (College, High School, ...) to a single flag: College or more = 1, else 0
df['Education'] = np.where(df['Education'].isin(['College', 'Masters', 'PhD']), 1, 0)

df.head()

## Simple Linear Regression
--> Total pay by age

In [None]:
# Target vector (TotalPay) and a single-predictor feature matrix (Age)
y = df['TotalPay'].values
X = df[['Age']].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
for label, features, target in (('Train set:', X_train, y_train),
                                ('Test set:', X_test, y_test)):
    print(label, features.shape, target.shape)

In [None]:
# Fit an ordinary least-squares line on the training split (fit() returns the estimator itself)
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Report the fitted slope(s) and intercept
print(f'Coefficients: {regr.coef_}')
print(f'Intercept: {regr.intercept_}')

In [None]:
# Scatter of the training points with the fitted regression line on top
fitted_line = regr.coef_[0] * X_train + regr.intercept_  # gradient * x + intercept
plt.scatter(X_train, y_train, color='blue')
plt.plot(X_train, fitted_line, '-r')
plt.xlabel("Age")
plt.ylabel("Total pay per year")

In [None]:
# Score the fitted model on the held-out test split
y_test_pred = regr.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R2 Score: {r2}')

## Multiple Linear Regression
--> Total pay per age, gender, ...

In [None]:
# Every column except the pay columns, the target and the non-numeric text columns is a predictor
excluded_columns = ['BasePay', 'Bonus', 'JobTitle', 'Dept', 'TotalPay']
feature_frame = df.loc[:, ~df.columns.isin(excluded_columns)]

X = feature_frame.values
y = df['TotalPay'].values

print('Independent variables: {}'.format(feature_frame.columns.values))

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

In [None]:
# Ordinary least squares on all selected predictors
regr = linear_model.LinearRegression()
regr = regr.fit(X_train, y_train)  # fit() returns the same estimator

# One coefficient per predictor column, plus the intercept
print(f'Coefficients: {regr.coef_}')
print(f'Intercept: {regr.intercept_}')

In [None]:
# Predict on the held-out test rows and report error and goodness of fit
y_test_pred = regr.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print(f'Mean Squared Error (MSE): {test_mse}')
print(f'R2 Score: {test_r2}')

In [None]:
# Predict the pay of one hand-written employee profile.
# NOTE(review): the values must follow the column order printed as 'Independent variables'
# in the feature-set cell — confirm before changing the feature selection.
profile = {
    'age in years': 27,
    'performance evaluation score': 5,
    'college education or more (1 = yes)': 1,
    'years of experience': 3,
    'female (1 = yes)': 1,
}

employee_data = [list(profile.values())]
salary = regr.predict(employee_data)
print(f'Estimated salary: {salary[0]:.2f} USD')

## Polynomial Regression

In [None]:
# Single predictor (Age) and target (TotalPay)
X, y = df[['Age']].values, df['TotalPay'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

# Expand Age into degree-2 polynomial features. The result has several columns,
# so the fit that follows is an ordinary multiple linear regression on them.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit(X_train).transform(X_train)
print('Train set polynomial:', X_train_poly.shape, y_train.shape)

In [None]:
# The polynomial columns are treated as ordinary predictors,
# so this is fitted exactly like a multiple linear regression.
regr = linear_model.LinearRegression().fit(X_train_poly, y_train)

# Fitted weights of the polynomial terms and the intercept
print(f'Coefficients: {regr.coef_}')
print(f'Intercept: {regr.intercept_}')

In [None]:
# Train data points and the polynomial curve that fits the train data
plt.scatter(X_train, y_train, color='blue')

# Evaluate the curve on an evenly spaced, ascending age grid. Plotting against X_train
# directly joins the points in their shuffled split order, which draws a tangle of
# chords across the parabola instead of one smooth line.
age_grid = np.linspace(X_train.min(), X_train.max(), 200)  # 200 points is plenty for a smooth curve
fitted_curve = (
    regr.intercept_
    + regr.coef_[1] * age_grid
    + regr.coef_[2] * np.power(age_grid, 2)
)  # ŷ = interc + coef1 * x + coef2 * x^2
plt.plot(age_grid, fitted_curve, '-r')
plt.xlabel("Age")
plt.ylabel("Total pay per year")