# Homework 2 - Naive Bayes and SVMs

## Data Preprocessing

In [1]:
#Author - Xiao Bao Bao
#ECS 171 Tuesday, July 20, 2021

import tensorflow as tf
import numpy as np
import pandas as pd
import cvxpy as cp

#Import CSV and do categorical preprocessing
cols = list(pd.read_csv('garments_worker_productivity.csv', nrows=1))
df = pd.read_csv('garments_worker_productivity.csv', usecols = [i for i in cols if i != 'date'])
df.head(5)

Unnamed: 0,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quarter                1197 non-null   object 
 1   department             1197 non-null   object 
 2   day                    1197 non-null   object 
 3   team                   1197 non-null   int64  
 4   targeted_productivity  1197 non-null   float64
 5   smv                    1197 non-null   float64
 6   wip                    691 non-null    float64
 7   over_time              1197 non-null   int64  
 8   incentive              1197 non-null   int64  
 9   idle_time              1197 non-null   float64
 10  idle_men               1197 non-null   int64  
 11  no_of_style_change     1197 non-null   int64  
 12  no_of_workers          1197 non-null   float64
 13  actual_productivity    1197 non-null   float64
dtypes: float64(6), int64(5), object(3)
memory usage: 131.0+ 

In [3]:
#Categorical features: day, quarter, department, team
days = list(set(df.day))
quarters = list(set(df.quarter))
depts = list(set(df.department))
teams = list(set(df.team))

print(days, quarters, depts, teams, sep="\n")

['Monday', 'Thursday', 'Wednesday', 'Tuesday', 'Sunday', 'Saturday']
['Quarter1', 'Quarter4', 'Quarter5', 'Quarter3', 'Quarter2']
['finishing ', 'sweing', 'finishing']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [4]:
df.groupby(df.columns, 1, sort=False).first()


#column = pd.Series(df['department']).str.replace('sweing ', 'sewing')

df['department'] = df['department'].astype(str).str.replace('sweing', 'sewing')
df['department'] = df['department'].astype(str).str.replace('finishing ', 'finishing')

#Add target column 'satisfied'
zeros = pd.Series(np.zeros(df.shape[0],))
df['satisfied'] = zeros

def eval_satisfied(actual, targeted):
    if(actual  >= targeted):
        return 1
    else:
        return 0
    
for index, row in df.iterrows():
    actual = row['actual_productivity']
    target = row['targeted_productivity']
    #print(act_perf, tar_perf)
    #print(row['satisfied'])
    #print(eval_satisfied(actual, target))
    df.at[index, 'satisfied'] = eval_satisfied(actual, target)
    
#replace all NaN values as zero with numpy
df['wip'] = df['wip'].replace(np.nan, 0)
df.drop(['targeted_productivity', 'actual_productivity'], axis=1, inplace=True)

df.head()

Unnamed: 0,quarter,department,day,team,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,satisfied
0,Quarter1,sewing,Thursday,8,26.16,1108.0,7080,98,0.0,0,0,59.0,1.0
1,Quarter1,finishing,Thursday,1,3.94,0.0,960,0,0.0,0,0,8.0,1.0
2,Quarter1,sewing,Thursday,11,11.41,968.0,3660,50,0.0,0,0,30.5,1.0
3,Quarter1,sewing,Thursday,12,11.41,968.0,3660,50,0.0,0,0,30.5,1.0
4,Quarter1,sewing,Thursday,6,25.9,1170.0,1920,50,0.0,0,0,56.0,1.0


## Naive Bayes Classifiers

In [5]:
#Categorical feature remapping
def encode_team_num(num):
    return num - 1

for index, row in df.iterrows():
    df.at[index, 'team'] = encode_team_num(row['team'])

In [6]:
remap_categorical = {
    "quarter": {"Quarter1": 0, "Quarter2": 1, "Quarter3": 2, "Quarter4": 3, "Quarter5": 4},
    "department": {"sewing": 0, "finishing": 1 },
    "day": {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6},
}

#display(remap_categorical)
df.replace(remap_categorical, inplace=True)
display(df.head())

Unnamed: 0,quarter,department,day,team,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,satisfied
0,0,0,3,7,26.16,1108.0,7080,98,0.0,0,0,59.0,1.0
1,0,1,3,0,3.94,0.0,960,0,0.0,0,0,8.0,1.0
2,0,0,3,10,11.41,968.0,3660,50,0.0,0,0,30.5,1.0
3,0,0,3,11,11.41,968.0,3660,50,0.0,0,0,30.5,1.0
4,0,0,3,5,25.9,1170.0,1920,50,0.0,0,0,56.0,1.0


In [7]:
from sklearn.model_selection import train_test_split

X = df[['quarter', 'department', 'day', 'team']]
y = df[['satisfied']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)


In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score
from sklearn.metrics import f1_score, classification_report
from sklearn.naive_bayes import CategoricalNB, MultinomialNB, GaussianNB
from sklearn import preprocessing
from collections import Counter
import matplotlib.pyplot as plt
#import seaborn as sns

feature_encoder = preprocessing.OrdinalEncoder()
target_encoder = preprocessing.LabelEncoder()
cNB = CategoricalNB()

feature_encoder.fit(X_train)
target_encoder.fit(y_train.values.ravel())

cNB.fit(feature_encoder.transform(X_train), target_encoder.transform(y_train.values.ravel()))

y_pred_cnb = cNB.predict(feature_encoder.transform(X_test))

# Categorical Model Performance
#count_misclassified = (y_test != y_pred_cnb).sum()
print("CategoricalNB - Categorical Features")
print("=" * 30)
#print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_cnb)
print('Accuracy: {:.2f}'.format(accuracy))

print(classification_report(y_test, y_pred_cnb))
#print(target_encoder.classes_)


CategoricalNB - Categorical Features
Accuracy: 0.75
              precision    recall  f1-score   support

         0.0       0.66      0.28      0.39        68
         1.0       0.77      0.94      0.85       172

    accuracy                           0.75       240
   macro avg       0.71      0.61      0.62       240
weighted avg       0.74      0.75      0.72       240



In [9]:
#Gaussian Model Performance
from sklearn.preprocessing import MinMaxScaler
#Get numerical features
X = df[['smv', 'wip', 'over_time', 'incentive',
           'idle_time', "idle_men", 'no_of_style_change',
           'no_of_workers']]
y = df[['satisfied']]

#Scale Data with MinMax
minmaxscaler = MinMaxScaler()
X_scaled = minmaxscaler.fit_transform(X)

#Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=21)

In [10]:
#Train and test data
gNB = GaussianNB()
gNB.fit(X_train, 
        target_encoder.transform(y_train.values.ravel()))
y_pred_gNB = gNB.predict(X_test)

# Gaussian Model Performance
#count_misclassified = (y_test != y_pred_cnb).sum()
print("GaussianNB - Numerical Features")
print("=" * 30)
#print('Misclassified samples: {}'.format(count_misclassified))
accuracy = accuracy_score(y_test, y_pred_gNB)
print('Accuracy: {:.2f}'.format(accuracy))

print(classification_report(y_test, y_pred_gNB))
#print(target_encoder.classes_)

GaussianNB - Numerical Features
Accuracy: 0.73
              precision    recall  f1-score   support

         0.0       1.00      0.04      0.08        68
         1.0       0.73      1.00      0.84       172

    accuracy                           0.73       240
   macro avg       0.86      0.52      0.46       240
weighted avg       0.80      0.73      0.63       240



# SVM Classifiers

In [11]:
import tensorflow as tf
import numpy as np
import pandas as pd
import cvxpy as cp
from sklearn import preprocessing

#One Hot encode all categorical elements
X_cat_features = df[['quarter', 'department', 'day', 'team']]
X_num_features = df[['smv', 'wip', 'over_time', 'incentive',
           'idle_time', "idle_men", 'no_of_style_change',
           'no_of_workers']]
y = df[['satisfied']]

X_cat_features

Unnamed: 0,quarter,department,day,team
0,0,0,3,7
1,0,1,3,0
2,0,0,3,10
3,0,0,3,11
4,0,0,3,5
...,...,...,...,...
1192,1,1,2,9
1193,1,1,2,7
1194,1,1,2,6
1195,1,1,2,8


In [12]:
onehotenc = preprocessing.OneHotEncoder()

onehotenc.fit(X_cat_features)

onehotlabels = onehotenc.transform(X_cat_features).toarray()
onehotlabels[0]

array([1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0.])

In [13]:
onehotlabels.shape

(1197, 25)

In [14]:
#Scale Data with MinMax
minmaxscaler = MinMaxScaler()
X_nums_scaled = minmaxscaler.fit_transform(X_num_features)

X_nums_scaled.shape

(1197, 8)

In [15]:
X_nums_scaled[0]

array([0.45025165, 0.04791973, 0.27314815, 0.02722222, 0.        ,
       0.        , 0.        , 0.65517241])

In [16]:
#Merge categorical and numerically scaled data

right_transpose = X_nums_scaled.transpose()
right_transpose.shape

left = pd.DataFrame(onehotlabels)
right = pd.DataFrame(X_nums_scaled)

#Join categorical and numerical Data Frames
target_df = left.join(right, lsuffix='_left', rsuffix='_right')

#Split and shuffle data
X_train, X_test, y_train, y_test = train_test_split(target_df, y, 
                                                    test_size=0.2, random_state=21)


In [17]:
#SVM with Linear Kernaling
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report

linear_clf = SVC(kernel='linear', decision_function_shape='ovo')
linear_clf.fit(X_train, np.asarray(y_train.values.ravel()))
y_predicted = linear_clf.predict(X_test)

print("Linear Kernal - Test")
print("=" * 30)

accuracy = accuracy_score(y_test, y_predicted)
print('Accuracy: {:.2f}'.format(accuracy))
print(classification_report(y_test, y_predicted))

Linear Kernal - Test
Accuracy: 0.73
              precision    recall  f1-score   support

         0.0       1.00      0.04      0.08        68
         1.0       0.73      1.00      0.84       172

    accuracy                           0.73       240
   macro avg       0.86      0.52      0.46       240
weighted avg       0.80      0.73      0.63       240



In [18]:
#SVM with RBF Kernaling
rbf_clf = SVC(kernel='rbf', gamma = 5.0, decision_function_shape='ovo')
rbf_clf.fit(X_train, np.asarray(y_train.values.ravel()))
y_predicted = rbf_clf.predict(X_test)

print("RBF Kernal - Test")
print("=" * 30)

accuracy = accuracy_score(y_test, y_predicted)
print('Accuracy: {:.2f}'.format(accuracy))
print(classification_report(y_test, y_predicted))

RBF Kernal - Test
Accuracy: 0.71
              precision    recall  f1-score   support

         0.0       0.48      0.19      0.27        68
         1.0       0.74      0.92      0.82       172

    accuracy                           0.71       240
   macro avg       0.61      0.55      0.55       240
weighted avg       0.67      0.71      0.67       240



In [19]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from matplotlib import pyplot

#Create frequency table for 'satisfied' in training set
table = pd.crosstab(index=y_train.values.ravel(), columns='count')
print("Frequency: 'Satisfied'")
display(table)

Frequency: 'Satisfied'


col_0,count
row_0,Unnamed: 1_level_1
0.0,254
1.0,703


In [20]:
smote = SMOTE(random_state=21)
ros = RandomOverSampler(random_state=21)

X_res_sm, y_res_sm = smote.fit_resample(X_train, y_train)
X_res_ros, y_res_ros = ros.fit_resample(X_train, y_train) #produces higher accuracy over SMOTE

table1 = pd.crosstab(index=y_res_sm.values.ravel(), columns='count')
print("Frequency SMOTE: 'Satisfied'")
display(table1)

table2 = pd.crosstab(index=y_res_ros.values.ravel(), columns='count')
print("Frequency Random: 'Satisfied'")
display(table2)

Frequency SMOTE: 'Satisfied'


col_0,count
row_0,Unnamed: 1_level_1
0.0,703
1.0,703


Frequency Random: 'Satisfied'


col_0,count
row_0,Unnamed: 1_level_1
0.0,703
1.0,703


In [21]:
linear_clf.fit(X_res_ros, np.asarray(y_res_ros.values.ravel()))
y_predicted = linear_clf.predict(X_res_ros)

print("Oversampled Linear Kernal - Test")
print("=" * 30)

print(classification_report(y_res_ros, y_predicted))

Oversampled Linear Kernal - Test
              precision    recall  f1-score   support

         0.0       0.67      0.71      0.69       703
         1.0       0.69      0.65      0.67       703

    accuracy                           0.68      1406
   macro avg       0.68      0.68      0.68      1406
weighted avg       0.68      0.68      0.68      1406



In [22]:
rbf_clf.fit(X_res_ros, np.asarray(y_res_ros.values.ravel()))
y_predicted = rbf_clf.predict(X_res_ros)

print("Oversampled RBF Kernal - Test")
print("=" * 30)

print(classification_report(y_res_ros, y_predicted))

Oversampled RBF Kernal - Test
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94       703
         1.0       0.96      0.90      0.93       703

    accuracy                           0.93      1406
   macro avg       0.94      0.93      0.93      1406
weighted avg       0.94      0.93      0.93      1406

