In [37]:
from pathlib import Path

import numpy as np
import pandas as pd

In [38]:
# Read the raw WFH dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/wfh.csv")
df.head(n=5)

Unnamed: 0,id,distance_from_office,salary_range,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,work_home_actual
0,0,5.962247,40K - 60K,2.119485,8.568058,False,Friday,0.212653,1
1,1,0.535872,40K - 60K,2.357199,5.425382,True,Tuesday,4.927549,0
2,2,1.969519,40K - 60K,2.366849,8.247158,False,Monday,0.520817,1
3,3,2.53041,20K - 40K,2.318722,7.944251,False,Tuesday,0.453649,1
4,4,2.253635,60K+,2.221265,8.884478,True,Thursday,5.695263,1


In [39]:
# Size of the raw frame as (rows, columns).
df.shape

(50000, 9)

In [40]:
# Per-column dtype and non-null count; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          50000 non-null  int64  
 1   distance_from_office        50000 non-null  float64
 2   salary_range                50000 non-null  object 
 3   gas_price_per_litre         50000 non-null  float64
 4   public_transportation_cost  50000 non-null  float64
 5   wfh_prev_workday            50000 non-null  bool   
 6   workday                     50000 non-null  object 
 7   tenure                      50000 non-null  float64
 8   work_home_actual            50000 non-null  int64  
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 3.1+ MB


In [41]:
# Summary statistics for the numeric columns (object/bool columns are omitted).
df.describe()

Unnamed: 0,id,distance_from_office,gas_price_per_litre,public_transportation_cost,tenure,work_home_actual
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,24999.5,3.929033,2.049616,7.323907,4.60004,0.49958
std,14433.901067,4.079528,0.334385,1.63039,2.301937,0.500005
min,0.0,0.00221,1.400369,4.003417,0.002253,0.0
25%,12499.75,0.897909,1.769163,6.109763,2.797947,0.0
50%,24999.5,2.380855,2.189073,8.074422,5.584845,0.0
75%,37499.25,5.679015,2.337894,8.627489,6.531917,1.0
max,49999.0,19.912896,2.399916,8.998112,6.997167,1.0


In [42]:
# Work on an independent deep copy so the raw frame `df` stays untouched.
df_cleaned = df.copy(deep=True)

In [43]:
# Split off the target: pop() returns the column AND removes it from
# df_cleaned in place, so df_cleaned holds only candidate features afterwards.
# NOTE(review): because of the in-place removal this cell is not re-runnable
# on its own (a second run raises KeyError) — re-run from the df.copy() cell.
y = df_cleaned.pop('work_home_actual')

In [44]:
# Display the extracted target Series.
y

0        1
1        0
2        1
3        1
4        1
        ..
49995    1
49996    0
49997    1
49998    0
49999    1
Name: work_home_actual, Length: 50000, dtype: int64

In [45]:
from sklearn.preprocessing import OrdinalEncoder

In [46]:
# Encoder for the weekday column. The category order is given explicitly so
# the codes follow the working week (Monday -> 0 ... Friday -> 4) rather than
# alphabetical order.
ord_enc = OrdinalEncoder(
    categories=[
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
    ],
)

In [47]:
# Replace the weekday names with the encoder's ordinal codes (stored as floats).
# The double brackets pass a one-column DataFrame, since the encoder expects
# 2-D input.
# NOTE(review): this overwrites the column in place, so re-running the cell on
# the already-encoded values is expected to fail — re-run from the df.copy()
# cell instead.
df_cleaned['workday'] = ord_enc.fit_transform(df_cleaned[['workday']])

In [48]:
# One-hot encode the salary band: the single `salary_range` column is replaced
# by one indicator column per band, named `salary_range_<band>`.
df_cleaned = pd.get_dummies(
    data=df_cleaned,
    columns=['salary_range'],
)

In [49]:
# Remove the row identifier so it is not fed to the model as a feature.
# Reassigning the result (instead of inplace=True) follows the pandas
# recommendation: inplace brings no performance benefit, and the explicit
# `columns=` keyword states the intent more clearly than axis=1.
df_cleaned = df_cleaned.drop(columns=['id'])

In [50]:
# Preview the feature frame after encoding and dropping `id`.
df_cleaned.head()

Unnamed: 0,distance_from_office,gas_price_per_litre,public_transportation_cost,wfh_prev_workday,workday,tenure,salary_range_0 - 20K,salary_range_20K - 40K,salary_range_40K - 60K,salary_range_60K+
0,5.962247,2.119485,8.568058,False,4.0,0.212653,False,False,True,False
1,0.535872,2.357199,5.425382,True,1.0,4.927549,False,False,True,False
2,1.969519,2.366849,8.247158,False,0.0,0.520817,False,False,True,False
3,2.53041,2.318722,7.944251,False,1.0,0.453649,False,True,False,False
4,2.253635,2.221265,8.884478,True,3.0,5.695263,False,False,False,True


In [51]:
from sklearn.model_selection import train_test_split

In [52]:
# Two-stage split with a fixed seed for reproducibility:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 20% of the remainder as the validation set.
# Overall this yields roughly 64% train / 16% validation / 20% test.
X_data, X_test, y_data, y_test = train_test_split(
    df_cleaned,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Sanity check: (rows, columns) of the training features.
print(X_train.shape)

(32000, 10)


In [54]:
# Sanity check: (rows, columns) of the validation features.
print(X_val.shape)

(8000, 10)


In [55]:
# Sanity check: (rows, columns) of the test features.
print(X_test.shape)

(10000, 10)


In [56]:
# Target lengths for train / validation / test, one per line; each should
# match the row count of the corresponding feature split.
print(y_train.shape, y_val.shape, y_test.shape, sep="\n")

(32000,)
(8000,)
(10000,)


In [57]:
# Persist every split as CSV (without the pandas index) under data/processed.
# The output directory is created first: to_csv does not create missing parent
# folders, so on a fresh checkout the writes would otherwise fail with OSError.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to write; each one lands at <processed_dir>/<stem>.csv.
splits = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for split_name, split_data in splits.items():
    split_data.to_csv(processed_dir / f'{split_name}.csv', index=False)

In [58]:
from sklearn.dummy import DummyClassifier

In [59]:
# Naive baseline: always predicts the most frequent class seen during fit and
# ignores the features. Any real model should beat its score.
base_clf = DummyClassifier(strategy = "most_frequent")

In [60]:
# Fit the baseline on the training split only.
base_clf.fit(X=X_train, y=y_train)

In [61]:
from sklearn.metrics import roc_auc_score

In [63]:
# ROC AUC of the baseline. Column 1 of predict_proba is taken as the score for
# the positive class (label 1). A constant predictor cannot rank samples, so
# the expected value is 0.5.
# NOTE(review): this is scored on the training split; X_val / y_val are not
# used here — confirm whether a validation score was intended.
y_proba_preds = base_clf.predict_proba(X_train)
roc_auc_score(y_true=y_train, y_score=y_proba_preds[:, 1])

np.float64(0.5)