In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import tree
import pickle
from sklearn.svm import SVC

In [None]:
# Load the training data from the current working directory.
df = pd.read_csv(filepath_or_buffer='trainms.csv')

In [None]:
# Missing answers in these two categorical columns become explicit categories.
df['work_interfere'] = df['work_interfere'].fillna("Maybe")
df['self_employed'] = df['self_employed'].fillna("Dont Know")

# Free-text Gender answers are collapsed into three codes: 'T', 'F' and 'M'.
# The spellings are matched exactly (including trailing spaces).
gender_other = [
    "A little about you", "something kinda male?", "Enby", "fluid",
    "Genderqueer", "Trans-female", "Nah", "Androgyne", "Agender",
    "male leaning androgynous", "Trans woman", "Neuter", "Female (trans)",
    "queer", "non-binary",
]
gender_female = [
    "Female", "female", "Cis Female", "F", "Woman", "f", "Femake",
    "Female ", "woman", "cis-female/femme", "Female (cis)",
]
gender_male = [
    "M", "Male", "male", "m", "Male-ish", "maile", "Cis Male", "Mal",
    "Male ", "Make", "Guy (-ish) ^_^", "Man", "msle", "Mail", "cis male",
]
df['Gender'] = (
    df['Gender']
    .replace(to_replace=gender_other, value="T")
    .replace(gender_female, 'F')
    .replace(gender_male, 'M')
)

# Columns removed from the frame before the model features are built.
to_drop_columns = [
    'state', 'comments', 's.no',
    'Age', 'Timestamp', 'no_employees', 'leave',
    'supervisor', 'coworkers', 'Country', 'anonymity', 'self_employed',
]

In [None]:
# Separate the label column ('treatment') from the predictors and discard
# the columns listed in to_drop_columns.
target = df['treatment']
features = df.drop(columns=to_drop_columns + ['treatment'])
# Names of the raw predictor columns, consumed by the encoding step.
colmns = features.columns

In [None]:
# One-hot encode every predictor column in a single call. The resulting
# columns are named "<column>_<value>" and appear in the original column
# order, exactly as the previous per-column concat/drop loop produced them,
# but without rebuilding the frame once per column or mutating it in place.
features = pd.get_dummies(features, columns=list(colmns))

# Integer-encode the target. pd.factorize numbers the labels in order of
# first appearance, so the code -> label lookup is kept in `target_classes`
# (previously discarded into `_`, which also shadowed IPython's last-output
# variable) to allow predictions to be decoded back to label strings.
labels, target_classes = pd.factorize(target)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Degree-5 polynomial-kernel SVM with C=10, fitted on the training split.
clf_svm = SVC(
    kernel='poly',
    degree=5,
    C=10,
    gamma='auto',
    decision_function_shape='ovo',
)
clf_svm.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation of the SVM configuration on the training split.
scores = cross_val_score(estimator=clf_svm, X=X_train, y=y_train, cv=10)
scores

In [None]:
# Accuracy on the training split, as a percentage.
svm_pred_train = clf_svm.predict(X_train)
100 * accuracy_score(y_true=y_train, y_pred=svm_pred_train)

In [None]:
# Sanity check: there must be exactly one prediction per training label.
# (The original referenced an undefined name, `svm_pred`, which raised
# NameError; the training predictions live in `svm_pred_train`.)
assert y_train.shape == svm_pred_train.shape, (
    "training labels and predictions differ in shape"
)

# Confusion table for the training split: rows are true labels, columns are
# predicted labels.
pd.crosstab(y_train, svm_pred_train, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# Accuracy on the held-out split, as a percentage.
svm_pred_test = clf_svm.predict(X_test)
100 * accuracy_score(y_true=y_test, y_pred=svm_pred_test)

In [None]:
# Confusion table for the held-out split: rows are true labels, columns are
# predicted labels.
pd.crosstab(
    index=y_test,
    columns=svm_pred_test,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D
# roc_curve and auc were used below without ever being imported (NameError).
from sklearn.metrics import auc, roc_curve

# Sweep the SVM regularization parameter C over 1..49 and record the AUC on
# the training and held-out splits for each value.
max_values = list(range(1,50))
x_train,x_test = X_train,X_test

train_results = []
test_results = []

for max_value in max_values:
    dt = SVC(decision_function_shape='ovo',gamma='auto',C=max_value,kernel='poly',degree=5)
    dt.fit(x_train, y_train)

    # NOTE: the ROC curve is built from hard class predictions, not from
    # decision scores, so each curve has a single interior operating point.
    train_pred = dt.predict(x_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, train_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    train_results.append(roc_auc)

    y_pred = dt.predict(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    test_results.append(roc_auc)

line1, = plt.plot(max_values, train_results, "b", label="Train AUC")
line2, = plt.plot(max_values, test_results, "r", label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel("AUC score")
# The swept quantity is C, not a feature count; label the axis accordingly.
plt.xlabel("C (SVM regularization parameter)")
plt.show()

In [None]:
# Load the unlabeled test data from the current working directory.
test = pd.read_csv(filepath_or_buffer='testms.csv')

# Age values of -1 are replaced with 22.
test['Age'] = test['Age'].replace(to_replace=-1, value=22)

# Same missing-value placeholders as applied to the training frame.
test['work_interfere'] = test['work_interfere'].fillna("Maybe")
test['self_employed'] = test['self_employed'].fillna("Dont Know")

In [None]:
# Normalise the free-text Gender answers of the test set to 'M', 'F' or 'T'.
# Only the spellings listed here are rewritten; any other value is left as is.
test_gender_male = ["Male", "male", "m", "Malr", "Male ", "Cis Man"]
test_gender_female = ["female", "F", "Female", "Woman", "femail", "f"]
test_gender_other = [
    "p",
    "Female (trans)",
    "ostensibly male, unsure what that really means",
]
test['Gender'] = (
    test['Gender']
    .replace(to_replace=test_gender_male, value='M')
    .replace(to_replace=test_gender_female, value='F')
    .replace(to_replace=test_gender_other, value='T')
)

In [None]:
# Remove the same unused columns from the test frame as from the training
# frame, then record the remaining column names for the encoding step.
tst_features = test.drop(to_drop_columns, axis=1)
colmns = tst_features.columns

In [None]:
# One-hot encode every test column in a single call (columns are named
# "<column>_<value>", as the per-column loop produced).
tst_features = pd.get_dummies(tst_features, columns=list(colmns))

# Encoding the test set on its own yields one column per category that
# happens to occur in the test data, so the column set and order can differ
# from what the classifier was fitted on. Align to the encoded training
# frame: categories unseen in the test data become all-zero columns and
# categories unseen in training are dropped.
tst_features = tst_features.reindex(columns=features.columns, fill_value=0)

In [None]:
# Predict the integer label codes for the encoded test rows.
pred = clf_svm.predict(X=tst_features)

In [None]:
# Submission generation
# The classifier was trained on codes from pd.factorize(target), which
# numbers labels in order of first appearance in the training data. Decode
# predictions through that same lookup instead of assuming code 0 is 'Yes'.
# NOTE(review): assumes `target` has no missing values (factorize would
# encode those as -1) -- confirm against the training data.
label_names = pd.factorize(target)[1]

# Row ids are emitted as "1".."n" under the "s.no" index name.
index = [str(i) for i in range(1, len(pred) + 1)]
values = [label_names[code] for code in pred]
submission_result = pd.DataFrame(values, index=index, columns=['treatment'])
submission_result.index.name = "s.no"

file_name = "submission.csv"
submission_result.to_csv(file_name, sep=',')