In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import tree
import pickle
from sklearn.metrics import roc_curve, auc

In [2]:
df = pd.read_csv('trainms.csv')

In [3]:
df['work_interfere']=df['work_interfere'].fillna("Maybe")
df['self_employed']=df['self_employed'].fillna("Dont Know")

In [4]:
temp = df.Gender
temp = temp.replace(to_replace=["A little about you","something kinda male?","Enby","fluid","Genderqueer","Trans-female","Nah","Androgyne","Agender","male leaning androgynous","Trans woman","Neuter","Female (trans)","queer","non-binary"],value="T")
temp = temp.replace(["Female","female","Cis Female","F","Woman","f","Femake","Female ","woman","cis-female/femme","Female (cis)"],'F')
temp = temp.replace(["M","Male","male","m","Male-ish","maile","Cis Male","Mal","Male ","Make","Guy (-ish) ^_^","Man","msle","Mail","cis male"],'M')
temp.unique()
df.Gender = temp

In [None]:
file_name="test_updated.csv"
df.to_csv(file_name, sep=',')

In [5]:
to_drop_columns = ['state','comments','s.no']+['Age','Timestamp','no_employees','leave']+['supervisor','coworkers','Country','anonymity','self_employed'] #+['Country','supervisor','coworkers','phys_health_interview','anonymity','self_employed','remote_work','tech_company','obs_consequence','phys_health_consequence','seek_help','Gender']

In [6]:
features = df.drop(columns=to_drop_columns)

In [None]:
features.columns

In [7]:
target = features.pop('treatment')
colmns = features.columns

In [8]:
for column in colmns:
    features = pd.concat([features,pd.get_dummies(features[column], prefix=column)],axis=1)
    features.drop([column],axis=1, inplace=True)

In [9]:
labels,_ = pd.factorize(target)

In [None]:
le = preprocessing.LabelEncoder()
features = features.apply(le.fit_transform)

In [None]:
features.shape

In [10]:
X_train, X_test, y_train, y_test = train_test_split( features, labels, test_size = 0.3, random_state = 100)



In [None]:
# X_train['work_interfere'].value_counts()
X_train.shape

In [11]:
# gini model fitting

clf_gini = DecisionTreeClassifier(criterion = "gini")
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
# entropy model fitting
clf_entropy = DecisionTreeClassifier(criterion = "entropy",max_depth=4,presort=True)
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=True,
                       random_state=None, splitter='best')

In [12]:
# predicting gini train
gini_pred = clf_gini.predict(X_train)
accuracy_score(y_train,gini_pred)*100

99.57142857142857

In [15]:
# predicting entropy train
entropy_pred = clf_entropy.predict(X_train)
accuracy_score(y_train,entropy_pred)*100

82.71428571428572

In [13]:
# predicting gini test
gini_pred = clf_gini.predict(X_test)
accuracy_score(y_test,gini_pred)*100

75.33333333333333

In [16]:
# predicting entropy test
entropy_pred = clf_entropy.predict(X_test)
accuracy_score(y_test,entropy_pred)*100

85.66666666666667

In [None]:
# roc score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, entropy_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

max_depths = np.linspace(1, 32, 32, endpoint=True)
x_train,x_test = X_train,X_test

train_results = []
test_results = []

for max_depth in max_depths:
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_depths, train_results, "b", label="Train AUC")
line2, = plt.plot(max_depths, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Tree depth")
plt.show()

In [None]:
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
print(min_samples_leafs)
train_results = []
test_results = []
for min_samples_leaf in min_samples_leafs:
    dt = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
    
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_leafs, train_results, "b", label="Train AUC")
line2, = plt.plot(min_samples_leafs, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("min samples leaf")
plt.show()

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

max_features = list(range(1,X_train.shape[1]))
x_train,x_test = X_train,X_test

train_results = []
test_results = []

for max_feature in max_features:
    dt = DecisionTreeClassifier(max_features=max_feature)
    dt.fit(x_train, y_train)
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)
    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)
from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_features, train_results, "b", label="Train AUC")
line2, = plt.plot(max_features, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
plt.xlabel("Max Featues")
plt.show()

In [None]:
from sklearn.svm import SVC

clf_svm = SVC(decision_function_shape='oyo',gamma='scale')
clf_svm.fit(X_train,y_train)

In [None]:
svm_pred = clf_svm.predict(X_train)
accuracy_score(y_train,svm_pred)*100

In [None]:
svm_pred = clf_svm.predict(X_test)
accuracy_score(y_test,svm_pred)*100

In [None]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial',max_iter=129).fit(X_train, y_train)

In [None]:
lr_pred = clf_lr.predict(X_train)
accuracy_score(y_train,lr_pred)*100

In [None]:
lr_pred = clf_lr.predict(X_test)
accuracy_score(y_test,lr_pred)*100

## Test score for leaderboard

In [17]:
test = pd.read_csv('testms.csv')
temp = test.Age
temp = temp.replace(to_replace=-1,value=22)
test.Age = temp

test['work_interfere']=test['work_interfere'].fillna("Maybe")
test['self_employed']=test['self_employed'].fillna("Dont Know")

In [18]:
#Validate test gender
temp = test.Gender
temp = temp.replace(to_replace=["Male","male","m","Malr","Male ","Cis Man"],value='M')
temp = temp.replace(to_replace=["female","F","Female","Woman","femail","f"],value='F')
temp = temp.replace(to_replace=["p","Female (trans)","ostensibly male, unsure what that really means"],value='T')
test.Gender = temp

In [19]:
tst_features = test.drop(columns=to_drop_columns)
print(tst_features.shape)
colmns=tst_features.columns
# colmns = colmns.drop(['Age','Timestamp','no_employee','leave'])
# colmns.remove('Age')
# colmns.remove('TimeStamp')
# colmns.remove('no_employee')
# colmns.remove('leave')
# test.columns
# test.columns

(259, 15)


In [None]:
colmns[12]

In [20]:
for column in colmns:
    tst_features = pd.concat([tst_features,pd.get_dummies(tst_features[column], prefix=column)],axis=1)
    tst_features.drop([column],axis=1, inplace=True)

In [None]:
tst_features.shape

In [21]:
pred = clf_entropy.predict(tst_features)

In [None]:
pred = clf_svm.predict(tst_features)

In [None]:
pred = clf_lr.predict(tst_features)

In [22]:
# Submission generation
index = [str(i) for i in range(1,len(pred)+1)]
values = ['Yes' if val==0 else 'No' for i,val in enumerate(pred)]
submission_result = pd.DataFrame(values, index=index,columns=['treatment'])
submission_result.index.name = "s.no"

file_name="submission.csv"
submission_result.to_csv(file_name, sep=',')