In [209]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score

### 1. Problem Statement  

A company wants to predict employee productivity scores to improve workforce planning and training programs. You are hired as a Data Scientist to build a multivariate linear regression model that predicts an employeeâ€™s Productivity Score based on multiple work-related factors. 

Experience (yrs),Training Hours,Working Hours,Projects,Productivity Score 
2,40,38,3,62 
5,60,42,6,78 
1,20,35,2,55 
8,80,45,8,88 
4,50,40,5,72 
10,90,48,9,92 
3,30,37,4,65 
6,70,44,7,82 
7,75,46,7,85 
2,25,36,3,60 

Interpretation 

1. Which factor most strongly impacts productivity?
2. How does training affect productivity?
3. Should the company increase training hours or working hours?
4. What happens if Working Hours increase beyond optimal limits?
5. Can productivity ever decrease with more experience?
6. How would you detect overfitting in this model?
7. Suggest one new feature to improve prediction accuracy. 

In [128]:
# upload the data

# update the data in the dictonary
emp_prod_dic = {'Experience (yrs)':[2,5,1,8,4,10,3,6,7,2],'Training Hrs.' : [40,60,20,80,50,90,30,70,75,25], 
                'Working Hrs.': [38,42,35,45,40,48,37,44,46,36], 'Projects':[3,6,2,8,5,9,4,7,7,3],
                'Productivity Score':[62,78,55,88,72,92,65,82,85,60]}

# Update the data in the data frame

emp_prod_df = pd.DataFrame(emp_prod_dic)

In [129]:
emp_prod_df.head()

Unnamed: 0,Experience (yrs),Training Hrs.,Working Hrs.,Projects,Productivity Score
0,2,40,38,3,62
1,5,60,42,6,78
2,1,20,35,2,55
3,8,80,45,8,88
4,4,50,40,5,72


In [130]:
# Correlation among the independent and dependent variables

emp_prod_df.corr()

Unnamed: 0,Experience (yrs),Training Hrs.,Working Hrs.,Projects,Productivity Score
Experience (yrs),1.0,0.96981,0.973178,0.988194,0.981991
Training Hrs.,0.96981,1.0,0.993132,0.977151,0.987045
Working Hrs.,0.973178,0.993132,1.0,0.974701,0.985684
Projects,0.988194,0.977151,0.974701,1.0,0.995696
Productivity Score,0.981991,0.987045,0.985684,0.995696,1.0


In [131]:
# Preparing the data for the regression

x = emp_prod_df.iloc[:,:-1]
y = pd.DataFrame(emp_prod_df.iloc[:,-1])

In [132]:
# Build the regression model
LR = LinearRegression()
reg_mod1 = LR.fit(x, y)

In [133]:
# Predict the predicator

reg_mod1_pred = reg_mod1.predict(x)

In [116]:
# Coefficient of the regression model

reg_mod1_cols = np.array(emp_prod_df.iloc[:,:-1].columns.tolist())
reg_mod1_coeff = reg_mod1.coef_

In [114]:
# Reshaping 2D Array to 1D Array
reg_mod1_coeff= reg_mod1_coeff.reshape(-1)

# Concatenating arrays
np.concatenate((reg_mod1_cols, reg_mod1_coeff))

array(['Experience (yrs)', 'Training Hrs.', 'Working Hrs.', 'Projects',
       '-0.9638009049773737', '0.038009049773755396',
       '0.8190045248868786', '4.701357466063348'], dtype='<U32')

In [124]:
# Predictive accuracy
reg_mod1_r_squared = round(reg_mod1.score(x,y),4)
n = x.shape[0]
p = x.shape[1]
adj_r_squared = round(1-(1-reg_mod1_r_squared)*(n-1)/(n-p-1),4)

print(f'R Squared:{reg_mod1_r_squared} \n Adj R Squared: {adj_r_squared}')

R Squared:0.9972 
 Adj R Squared: 0.995


In [140]:
# RMSE of the model

rmse_mod1 = round(np.sqrt(metrics.mean_squared_error(y, reg_mod1_pred)),4)

print(f'Root mean squared error:{rmse_mod1}')

Root mean squared error:0.6452


# Interpretation

1. Which factor most strongly impacts productivity?
 
A. Projects is the factor which strongly affect on productivity score.

2. How does training affect productivity?

A. It has positive corrletion with productivity correlation
 
4. Should the company increase training hours or working hours?

A. Increase working hours may increase the productivity score.

5. What happens if Working Hours increase beyond optimal limits?

A. There is a probability that productivity score may decrease.

8. Can productivity ever decrease with more experience? 

A. Yes, since experiece has negative coefficient.

9. How would you detect overfitting in this model?

A. The coefficient of project is very high as we as RMSE is very low.

10. Suggest one new feature to improve prediction accuracy. 

A. A Stree realted metric can also help to improve the prediction.