In [1]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import pandas  as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
plt.rcParams['figure.figsize'] = [30,15]

In [2]:
df = pd.read_csv('garments_worker_productivity.csv')
df = df.dropna()
df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382
5,1/1/2015,Quarter1,sweing,Thursday,7,0.8,25.9,984.0,6720,38,0.0,0,0,56.0,0.800125


In [3]:
del df['date']

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691 entries, 0 to 1191
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quarter                691 non-null    object 
 1   department             691 non-null    object 
 2   day                    691 non-null    object 
 3   team                   691 non-null    int64  
 4   targeted_productivity  691 non-null    float64
 5   smv                    691 non-null    float64
 6   wip                    691 non-null    float64
 7   over_time              691 non-null    int64  
 8   incentive              691 non-null    int64  
 9   idle_time              691 non-null    float64
 10  idle_men               691 non-null    int64  
 11  no_of_style_change     691 non-null    int64  
 12  no_of_workers          691 non-null    float64
 13  actual_productivity    691 non-null    float64
dtypes: float64(6), int64(5), object(3)
memory usage: 81.0+ KB

In [5]:
# One hot encoding the dataframe
# One hot encoding is the process of converting the categorical data variables to be provided to machine and deep learning algorithms which in turn improve predictions as well as classification accuracy of a model
data = pd.get_dummies(df, columns =['quarter','department','day'])

In [6]:
X = data.loc[:, data.columns != 'actual_productivity']
y = data['actual_productivity']

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.3)

In [9]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

In [10]:
# Exploring the input & output

# Input: A two-dimensional array which contains all the features of the wine dataset
print("Input : ", X_test[:5])

# Output: The ouput array is the prediction
print("Expected output : ", list(y_test)[:10])
print("Predicted output : ", list(y_pred)[:10])

Input :  [[ 1.00670935  0.25527123  0.69348406 -0.06778732  1.40502703  0.88903742
  -0.0757348  -0.14943232 -0.48527217  0.64335005 -0.66301081  1.63570533
  -0.47910058 -0.50406694 -0.18555629  0.         -0.44915379 -0.42091812
  -0.44915379  2.20361798 -0.45379917 -0.45611617]
 [-1.01885351 -1.2163007   0.98323738  0.09287713  0.15783588 -0.52520432
  -0.0757348  -0.14943232  1.37763376  0.59021812 -0.66301081 -0.61135706
   2.08724439 -0.50406694 -0.18555629  0.         -0.44915379 -0.42091812
   2.22640889 -0.45379917 -0.45379917 -0.45611617]
 [-0.72948739  0.74579521  0.98323738 -0.47244388 -0.84830151  0.20004785
  -0.0757348  -0.14943232  1.37763376  0.69648199 -0.66301081 -0.61135706
  -0.47910058  1.98386347 -0.18555629  0.         -0.44915379 -0.42091812
  -0.44915379 -0.45379917 -0.45379917  2.19242393]
 [-0.72948739 -2.19734866  0.48979608 -0.34554619  0.15783588 -0.77904259
  -0.0757348  -0.14943232  3.24053969  0.59021812 -0.66301081 -0.61135706
  -0.47910058  1.9838634

In [11]:
# Calculating the Accuracy
# Accuracy of a linear model can't be calculated, the accuracy is predicted using error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Mean Squared Error
print("Mean Squared Error:",mean_squared_error(y_test, y_pred)) # tells you how close a regression line is to a set of points
# squaring is necessary to remove any negative signs

# Root Mean Squared Error
print("Root Mean Squared Error:", mean_squared_error(y_test, y_pred, squared=False))

# R2 Score
print("R^2 Score:", r2_score(y_test,y_pred)) # correlation between actual and predicted value

Mean Squared Error: 0.008728453493548278
Root Mean Squared Error: 0.09342619275956972
R^2 Score: 0.6516588248437551
