In [1]:
from factors_calculator import FactorsCalculator
import pandas as pd
URL_DATA = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(URL_DATA)

# Factorize binary column to avoid collinearity
data['is_female'] = data.Sex.apply(lambda x: 1 if x=='female' else 0)

# Adding 'bad' features on purpose to reflect in the respective impact factors
numerical_cols = ['Age', 'Fare', 'PassengerId', 'SibSp', 'Parch', 'is_female', 'Pclass']
categorical_cols = ['Embarked', 'Name', 'Ticket', 'Cabin']
target_col = 'Survived'

fc = FactorsCalculator(
    data,
    numerical_cols,
    categorical_cols,
    target_col,
    factorize_target=False, # Set to true when target column has strings and need to be converted to int
    random_st=None # Run multiple times to check effect of random split 
)

In [2]:
fc.fit_report()

Score: 0.757847533632287
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       124
           1       0.75      0.68      0.71        99

    accuracy                           0.76       223
   macro avg       0.76      0.75      0.75       223
weighted avg       0.76      0.76      0.76       223



In [3]:
# Valid for Logistic Regression
coef = fc.get_coefficients()
print(coef)

                             Survived
is_female                    1.336052
Pclass                      -0.757277
Age                         -0.571270
SibSp                       -0.386499
Embarked_infrequent_sklearn  0.134040
Parch                       -0.115714
Embarked_S                  -0.106764
Cabin_missing_value         -0.104080
Cabin_infrequent_sklearn     0.104080
PassengerId                  0.084453
Embarked_C                   0.081567
Fare                         0.080392
Embarked_Q                   0.038183
Name_infrequent_sklearn      0.000000
Ticket_infrequent_sklearn    0.000000


In [4]:
ADULT_SAMPLE = 'https://s3.amazonaws.com/fast-ai-sample/adult_sample.tgz'

In [11]:
!wget {ADULT_SAMPLE}

--2022-05-20 23:43:15--  https://s3.amazonaws.com/fast-ai-sample/adult_sample.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.195.240
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.195.240|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 968212 (946K) [application/x-tar]
Saving to: ‘adult_sample.tgz’


2022-05-20 23:43:16 (1,28 MB/s) - ‘adult_sample.tgz’ saved [968212/968212]



In [12]:
!tar -xvzf adult_sample.tgz

x adult_sample/
x adult_sample/export.pkl
x adult_sample/adult.csv
x adult_sample/models/
x adult_sample/models/mini_train.pth


In [13]:
data = pd.read_csv('adult_sample/adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  int64  
 1   workclass       32561 non-null  object 
 2   fnlwgt          32561 non-null  int64  
 3   education       32561 non-null  object 
 4   education-num   32074 non-null  float64
 5   marital-status  32561 non-null  object 
 6   occupation      32049 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital-gain    32561 non-null  int64  
 11  capital-loss    32561 non-null  int64  
 12  hours-per-week  32561 non-null  int64  
 13  native-country  32561 non-null  object 
 14  salary          32561 non-null  object 
dtypes: float64(1), int64(5), object(9)
memory usage: 3.7+ MB


In [22]:
num_cols = list(set(data.columns)-set(cat_cols))
num_cols

['hours-per-week',
 'capital-gain',
 'education-num',
 'capital-loss',
 'age',
 'salary',
 'fnlwgt']

In [23]:
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country']

num_cols = ['hours-per-week', 'capital-gain', 'education-num', 'capital-loss', 'age', 'fnlwgt']

target_col = 'salary'


fc = FactorsCalculator(
    data,
    num_cols,
    cat_cols,
    target_col,
    factorize_target=False, # Set to true when target column has strings and need to be converted to int
    random_st=None # Run multiple times to check effect of random split 
)


In [24]:
fc.fit_report()

Score: 0.8467018793759981
              precision    recall  f1-score   support

        <50k       0.87      0.93      0.90      6155
       >=50k       0.74      0.58      0.65      1986

    accuracy                           0.85      8141
   macro avg       0.80      0.76      0.78      8141
weighted avg       0.84      0.85      0.84      8141



In [25]:
# Valid for Logistic Regression
coef = fc.get_coefficients()
coef

                                      salary
capital-gain                        2.325106
marital-status_ Married-civ-spouse  0.930972
education-num                       0.760997
marital-status_ Never-married      -0.652991
age                                 0.362165
hours-per-week                      0.357156
relationship_ Own-child            -0.323926
relationship_ Not-in-family         0.293213
marital-status_ Divorced           -0.285335
capital-loss                        0.260876
occupation_ Other-service          -0.259686
occupation_ Exec-managerial         0.232298
marital-status_infrequent_sklearn  -0.223631
relationship_ Husband              -0.202383
relationship_infrequent_sklearn     0.181784
sex_ Male                           0.166851
sex_ Female                        -0.166851
occupation_ Prof-specialty          0.147703
relationship_ Unmarried             0.130520
workclass_ Self-emp-not-inc        -0.122109
occupation_ Machine-op-inspct      -0.088472
workclass_

In [26]:
coef

Unnamed: 0,salary
capital-gain,2.325106
marital-status_ Married-civ-spouse,0.930972
education-num,0.760997
marital-status_ Never-married,-0.652991
age,0.362165
hours-per-week,0.357156
relationship_ Own-child,-0.323926
relationship_ Not-in-family,0.293213
marital-status_ Divorced,-0.285335
capital-loss,0.260876
