In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import tree

from sklearn.metrics import roc_curve, auc

In [43]:
df = pd.read_csv('trainms.csv')

In [44]:
df['work_interfere']=df['work_interfere'].fillna("Maybe")
df['self_employed']=df['self_employed'].fillna("Dont Know")

In [45]:
temp = df.Gender
temp = temp.replace(to_replace=["A little about you","something kinda male?","Enby","fluid","Genderqueer","Trans-female","Nah","Androgyne","Agender","male leaning androgynous","Trans woman","Neuter","Female (trans)","queer","non-binary"],value="T")
temp = temp.replace(["Female","female","Cis Female","F","Woman","f","Femake","Female ","woman","cis-female/femme","Female (cis)"],'F')
temp = temp.replace(["M","Male","male","m","Male-ish","maile","Cis Male","Mal","Male ","Make","Guy (-ish) ^_^","Man","msle","Mail","cis male"],'M')
temp.unique()
df.Gender = temp

In [46]:
to_drop_columns = ['state','comments','s.no','Timestamp']+['supervisor','coworkers','Country','anonymity','self_employed'] #+['Country','supervisor','coworkers','phys_health_interview','anonymity','self_employed','remote_work','tech_company','obs_consequence','phys_health_consequence','seek_help','Gender']

In [47]:
df = df.drop(columns=to_drop_columns)

In [48]:
# target = df.pop('treatment')
df.columns

Index(['Age', 'Gender', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'leave',
       'mental_health_consequence', 'phys_health_consequence',
       'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence'],
      dtype='object')

In [49]:
le = preprocessing.LabelEncoder()
features = df.apply(le.fit_transform)

onc = preprocessing.OneHotEncoder(categorical_features=[1,2])
features = onc.fit_transform(features).toarray()

labels,uniques = features["treatment"] # pd.factorize(target)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [51]:
features

array([[1., 0., 0., ..., 0., 2., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 2., 1., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [38]:
X_train, X_test, y_train, y_test = train_test_split( features, labels, test_size = 0.3, random_state = 100)

In [39]:
clf_gini = DecisionTreeClassifier(criterion = "gini",
                               max_depth=4, min_samples_leaf=0.3)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [None]:
clf_entropy = DecisionTreeClassifier(criterion = "entropy",max_depth=4,presort=True)
clf_entropy.fit(X_train, y_train)

In [40]:
gini_pred = clf_gini.predict(X_train)
accuracy_score(y_train,gini_pred)*100

69.42857142857143

In [None]:
entropy_pred = clf_entropy.predict(X_train)
accuracy_score(y_train,entropy_pred)*100

In [41]:
gini_pred = clf_gini.predict(X_test)
accuracy_score(y_test,gini_pred)*100

67.0

In [None]:
entropy_pred = clf_entropy.predict(X_test)
accuracy_score(y_test,entropy_pred)*100

In [None]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, entropy_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

randoms = np.linspace(1, 1000, 100, endpoint=True,dtype=int)
x_train,x_test = X_train,X_test

train_results = []
test_results = []

for random in randoms:
    dt = DecisionTreeClassifier(random_state=random)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(randoms, train_results, "b", label="Train AUC")
line2, = plt.plot(randoms, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Tree depth")
plt.show()

In [None]:
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
print(min_samples_leafs)
train_results = []
test_results = []
for min_samples_leaf in min_samples_leafs:
    dt = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
    
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_leafs, train_results, "b", label="Train AUC")
line2, = plt.plot(min_samples_leafs, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("min samples leaf")
plt.show()

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

max_features = list(range(1,X_train.shape[1]))
x_train,x_test = X_train,X_test

train_results = []
test_results = []

for max_feature in max_features:
    dt = DecisionTreeClassifier(max_features=max_feature)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_features, train_results, "b", label="Train AUC")
line2, = plt.plot(max_features, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Max Featues")
plt.show()

In [None]:
test = pd.read_csv('testms.csv')
temp = test.Age
temp = temp.replace(to_replace=-1,value=22)
test.Age = temp

test['work_interfere']=test['work_interfere'].fillna("Maybe")
test['self_employed']=test['self_employed'].fillna("Dont Know")

In [None]:
#Validate test gender
temp = test.Gender
temp = temp.replace(to_replace=["Male","male","m","Malr","Male ","Cis Man"],value='M')
temp = temp.replace(to_replace=["female","F","Female","Woman","femail","f"],value='F')
temp = temp.replace(to_replace=["p","Female (trans)","ostensibly male, unsure what that really means"],value='T')
test.Gender = temp

In [None]:
#predict testing data
# col = to_drop_columns
# col.remove('treatment')
test_features = test.drop(columns=to_drop_columns)
le = preprocessing.LabelEncoder()
test_features = test_features.apply(le.fit_transform)

pred = clf_entropy.predict(test_features)

In [None]:
# Submission generation
index = [str(i) for i in range(1,len(pred)+1)]
values = ['Yes' if val==0 else 'No' for i,val in enumerate(pred)]
submission_result = pd.DataFrame(values, index=index,columns=['treatment'])
submission_result.index.name = "s.no"

file_name="submission.csv"
submission_result.to_csv(file_name, sep=',')