In [13]:
import pandas as pd
import numpy as np
from fastai_utils import *

In [91]:
tech = pd.read_csv("tech/survey.csv", low_memory=False, sep=',')

# Limpeza

Os dados possuem muitos problemas por haver muitos campos com strings. Além disso, por ser um questionário, algumas perguntas foram de campo aberto (e.g. gênero), gerando dados inconsistentes.

### Seleção de colunas

Primeiramente, para economizar espaço, vamos cortar as colunas que não utilizaremos no projeto.

In [92]:
tech = tech.drop(['comments','state','Timestamp','Country'], axis= 1)

### Limpeza de valores nulos

Para uma boa classificação, não é bom haver valores nulos. Nesta parte, preencheremos os campos nulos com valores padrão do tipo daquela coluna.

In [93]:
defaultInt = 0
defaultString = 'NaN'

intFeatures = ['Age']
stringFeatures = ['Gender', 'self_employed', 'family_history', 'treatment', 'work_interfere',
                 'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',
                 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',
                 'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',
                 'seek_help']

for feature in tech:
    if feature in intFeatures:
        tech[feature] = tech[feature].fillna(defaultInt)
    elif feature in stringFeatures:
        tech[feature] = tech[feature].fillna(defaultString)

### Limpeza e rotulação da coluna de gênero

A coluna de gênero contém muitas categorias que podem ser simplificadas. Aqui, realizamos este procedimento categorizando todos os valores dentro de 3 conjuntos: male, female e trans.

In [94]:
male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "cis male"]
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]           
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

for (row, col) in tech.iterrows():

    if str.lower(col.Gender) in male_str:
        tech['Gender'].replace(to_replace=col.Gender, value='male', inplace=True)

    if str.lower(col.Gender) in female_str:
        tech['Gender'].replace(to_replace=col.Gender, value='female', inplace=True)

    if str.lower(col.Gender) in trans_str:
        tech['Gender'].replace(to_replace=col.Gender, value='trans', inplace=True)

stk_list = ['A little about you', 'p']
tech = tech[~tech['Gender'].isin(stk_list)]

### Limpeza da coluna de idade

Existem alguns valores irreais em idade, como números negativos e números muito elevados. Visto isso, tratamos esses outliers igualando-os à média dos valores do conjunto.

In [95]:
tech['Age'].fillna(tech['Age'].median(), inplace = True)

s = pd.Series(tech['Age'])
s[(s<18) & (s>100)] = tech['Age'].median()
tech['Age'] = s

### Rotulação de valores nulos

Alguns valores nulos podem ser interpretados baseado em outras respostas do questionário. Assim, alguns valores nulos são populados com outros valores categóricos. 

In [96]:
tech['self_employed'] = tech['self_employed'].replace([defaultString], 'No')
tech['work_interfere'] = tech['work_interfere'].replace([defaultString], 'Don\'t have mental condition')

In [55]:
for n,c in tech.items():
    if is_string_dtype(c): tech[n] = c.astype('category').cat.as_ordered()
for n,c in tech.items():
    print(is_string_dtype(c))

False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [56]:
numericalize(tech,'leave','leave_code',5)

AttributeError: 'str' object has no attribute 'cat'

In [58]:
tech['treatment'].unique()

[Yes, No]
Categories (2, object): [No < Yes]

# Hipóteses

## Apoio

Campos a serem analisados: benefits, care_options, wellness_program, seek_help, anonymity, leave

In [69]:
tech.groupby(['treatment'])['benefits'].value_counts()

treatment  benefits  
No         Don't know    257
           No            193
           Yes           172
Yes        Yes           305
           No            181
           Don't know    151
Name: benefits, dtype: int64

In [70]:
tech.groupby(['treatment'])['care_options'].value_counts()

treatment  care_options
No         No              294
           Not sure        191
           Yes             137
Yes        Yes             307
           No              207
           Not sure        123
Name: care_options, dtype: int64

In [71]:
tech.groupby(['treatment'])['wellness_program'].value_counts()

treatment  wellness_program
No         No                  422
           Don't know          107
           Yes                  93
Yes        No                  420
           Yes                 136
           Don't know           81
Name: wellness_program, dtype: int64

In [72]:
tech.groupby(['treatment'])['seek_help'].value_counts()

treatment  seek_help 
No         No            323
           Don't know    197
           Yes           102
Yes        No            323
           Don't know    166
           Yes           148
Name: seek_help, dtype: int64

In [73]:
tech.groupby(['treatment'])['anonymity'].value_counts()

treatment  anonymity 
No         Don't know    448
           Yes           147
           No             27
Yes        Don't know    371
           Yes           228
           No             38
Name: anonymity, dtype: int64

In [74]:
tech.groupby(['treatment'])['leave'].value_counts()

treatment  leave             
No         Don't know            309
           Somewhat easy         135
           Very easy             103
           Somewhat difficult     44
           Very difficult         31
Yes        Don't know            254
           Somewhat easy         131
           Very easy             103
           Somewhat difficult     82
           Very difficult         67
Name: leave, dtype: int64

## Pessoal

In [133]:
tech.groupby(['Gender'])['work_interfere'].value_counts()

Gender  work_interfere             
0       Sometimes                      346
        Don't have mental condition    227
        Never                          187
        Rarely                         126
        Often                          105
1       Sometimes                      108
        Rarely                          44
        Don't have mental condition     36
        Often                           34
        Never                           25
2       Sometimes                       11
        Often                            3
        Rarely                           3
        Don't have mental condition      1
        Never                            1
Name: work_interfere, dtype: int64

In [110]:
tech.groupby(['Gender'])['mental_health_consequence'].value_counts()

Gender  mental_health_consequence
female  Maybe                        108
        No                            73
        Yes                           66
male    No                           413
        Maybe                        364
        Yes                          214
trans   Yes                           10
        Maybe                          5
        No                             4
Name: mental_health_consequence, dtype: int64

# Classificação

In [122]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
gnb = GaussianNB()
m = RandomForestClassifier(n_jobs=-1)

### Quantificar variáveis categóricas

In [135]:
for (row, col) in tech.iterrows():

    if col.Gender == 'male':
        tech['Gender'].replace(to_replace=col.Gender, value=0, inplace=True)
    if col.Gender == 'female':
        tech['Gender'].replace(to_replace=col.Gender, value=1, inplace=True)
    if col.Gender == 'trans':
        tech['Gender'].replace(to_replace=col.Gender, value=2, inplace=True)
    if col.mental_health_consequence == 'No':
        tech['mental_health_consequence'].replace(to_replace=col.mental_health_consequence, value=0, inplace=True)
    if col.mental_health_consequence == 'Maybe':
        tech['mental_health_consequence'].replace(to_replace=col.mental_health_consequence, value=1, inplace=True)
    if col.mental_health_consequence == 'Yes':
        tech['mental_health_consequence'].replace(to_replace=col.mental_health_consequence, value=2, inplace=True)
    if col.family_history == 'Yes':
        tech['family_history'].replace(to_replace=col.family_history, value=1, inplace=True)
    if col.family_history == 'No':
        tech['family_history'].replace(to_replace=col.family_history, value=0, inplace=True)
    if col.treatment == 'Yes':
        tech['treatment'].replace(to_replace=col.treatment, value=1, inplace=True)
    if col.treatment == 'No':
        tech['treatment'].replace(to_replace=col.treatment, value=0, inplace=True)
    if col.leave == 'Don\'t know':
        tech['leave'].replace(to_replace=col.leave, value=0, inplace=True)
    if col.leave == 'Very easy':
        tech['leave'].replace(to_replace=col.leave, value=1, inplace=True)
    if col.leave == 'Somewhat easy':
        tech['leave'].replace(to_replace=col.leave, value=2, inplace=True)
    if col.leave == 'Somewhat difficult':
        tech['leave'].replace(to_replace=col.leave, value=3, inplace=True)
    if col.leave == 'Very difficult':
        tech['leave'].replace(to_replace=col.leave, value=4, inplace=True)
    if col.work_interfere == 'Never':
        tech['work_interfere'].replace(to_replace=col.work_interfere, value='No', inplace=True)
    if col.work_interfere == 'Sometimes':
        tech['work_interfere'].replace(to_replace=col.work_interfere, value='Yes', inplace=True)
    if col.work_interfere == 'Often':
        tech['work_interfere'].replace(to_replace=col.work_interfere, value='Yes', inplace=True)
    if col.work_interfere == 'Rarely':
        tech['work_interfere'].replace(to_replace=col.work_interfere, value='Yes', inplace=True)
    if col.work_interfere == 'Don\'t have mental condition':
        tech['work_interfere'].replace(to_replace=col.work_interfere, value='No', inplace=True)

### Treinamento

In [136]:
X = tech[['mental_health_consequence','treatment','family_history','leave']]
Y = tech['work_interfere']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

y_pred_nb = gnb.fit(X_train, Y_train).predict(X_test)

y_pred_rf = m.fit(X_train, Y_train).score(X_test, Y_test)

In [137]:
sum(y_pred_nb == Y_test) / len(y_pred_nb)

0.834920634920635

In [138]:
y_pred_rf

0.8380952380952381