# Hackathon

# Applying all operations on the training data

In [None]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

# import warnings
import warnings
warnings.filterwarnings("ignore")

# label encoding
from sklearn import preprocessing

# scaling data
from sklearn.preprocessing import StandardScaler

# train test split
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline

# importing pickle for saving model
import pickle

In [None]:
# Load the training set; the first CSV column becomes the index.
# Forward slashes keep the path valid on Windows, Linux and macOS alike
# (the previous backslash path only resolved on Windows).
training=pd.read_csv("data/training.csv",index_col=0,header=0)
print(training.shape)
training.head()

In [None]:
training.info()

In [None]:
training.describe(include="all")

In [None]:
# Remove the columns that are not used as model features.
training.drop(columns=["Timestamp","Country","self_employed","state","comments"],inplace=True)

In [None]:
training.columns

In [None]:
# Show the frequency of every value in each column that is not int64.
for column in training.columns:
    if training[column].dtypes=="int64":
        continue
    print(column,":")
    print(training[column].value_counts())
    print()

In [None]:
# List the distinct values of each column that is not int64.
for column in training.columns:
    if training[column].dtypes=="int64":
        continue
    print(column,":")
    print(training[column].unique())
    print()

In [None]:
# Collapse the free-text spellings of "male" into the single label "Male".
# The replacement is restricted to the Gender column so that short tokens such
# as "M" or "m" can never rewrite a matching value in another column.
training["Gender"]=training["Gender"].replace(
    ["Male","male","M","m","Make","Man","Cis Male","Malr","something kinda male?",
     "Guy (-ish) ^_^","maile","male leaning androgynous","Male (CIS)","Male-ish",
     "Mal","cis male","Mail","msle","Male "], "Male")

In [None]:
# Collapse the free-text spellings grouped under "female" into the label "Female".
# Only the Gender column is touched, so tokens such as "F" or "f" cannot
# rewrite a matching value in another column.
training["Gender"]=training["Gender"].replace(
    ["Female","female","Trans-female","Cis Female","F","Woman","f","queer/she/they",
     "Femake","woman","Genderqueer","Female  leaning androgynous","cis-female/femme","Trans woman",
     "Female (trans)","queer","Female (cis)","Female "] ,"Female")

In [None]:
# Mark the remaining Gender answers as missing; they are imputed later.
# Only the Gender column is touched, so generic tokens such as "All" or "Nah"
# cannot blank out a matching value in another column.
training["Gender"]=training["Gender"].replace(
    ["non-binary","Nah","All","Enby","fluid","Androgyne","Agender","Neuter",
     "A little about you"] ,np.nan)

In [None]:
# Re-check the distinct values of each non-int64 column after the clean-up.
for column in training.columns:
    if training[column].dtypes=="int64":
        continue
    print(column,":")
    print(training[column].unique())
    print()

In [None]:
training.isnull().sum()

In [None]:
# Fill missing values with the most frequent value of the column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection acts on a temporary object under pandas Copy-on-Write
# and would leave the DataFrame unchanged.
training["Gender"]=training["Gender"].fillna(training["Gender"].mode()[0])
training["work_interfere"]=training["work_interfere"].fillna(training["work_interfere"].mode()[0])

In [None]:
training.isnull().sum()

In [None]:
training.columns

## Pre-processing

In [None]:
# Label-encode every categorical column (the target included) in place.
# One encoder object is refitted per column, so after the loop `le` only
# holds the classes of the last column in the list.
le=preprocessing.LabelEncoder()

colname=['Gender', 'family_history', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'treatment']

for feature in colname:
    training[feature]=le.fit_transform(training[feature])
    # classes_ holds the original labels; transform() yields their integer codes
    encoded_classes=le.transform(le.classes_)
    le_name_mapping = dict(zip(le.classes_, encoded_classes))
    print('Feature', feature)
    print('mapping', le_name_mapping)
    print()

In [None]:
training.dtypes

In [None]:
training.head()

In [None]:
training.shape

## Data Visualization

In [None]:
training.boxplot(column="Age")

In [None]:
print(training.Age.min())
print(training.Age.max())

## Outlier imputation

In [None]:
# Replace Age outliers (values outside the 1.5*IQR fences) with the integer
# mean Age of the in-range rows.
q1 = training['Age'].quantile(0.25) # first quartile
q3 = training['Age'].quantile(0.75) # third quartile
iqr = q3-q1 # interquartile range
low  = q1-1.5*iqr # lower fence
high = q3+1.5*iqr # upper fence

# Build the in-range mask once (between() is inclusive on both ends) and use
# its complement for the outliers, so every row lands in exactly one frame;
# a row with a missing Age would otherwise match neither comparison and be lost.
# Explicit copies make the later assignment act on independent frames.
in_range = training['Age'].between(low, high)
training_include = training.loc[in_range].copy()
training_exclude = training.loc[~in_range].copy()

print(training_include.shape)
print(training_exclude.shape)

print(low)

Age_mean=int(training_include.Age.mean()) # mean of the in-range ages, truncated
print(Age_mean)

# Impute the outlier rows with the in-range mean
training_exclude['Age']=Age_mean

# Stack both parts back together: same number of rows as `training`, with the
# in-range rows first and the imputed rows after them
training_rev=pd.concat([training_include,training_exclude],axis=0)
print(training_rev.shape)

training_rev.boxplot(column=["Age"])
plt.show()

## Creating X and Y

In [None]:
# Every column except the last forms the feature matrix; the last column is the target.
X, Y = training_rev.values[:, :-1], training_rev.values[:, -1]

In [None]:
# Standardise the feature matrix with a StandardScaler fitted on X itself.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows contribute to the scaling statistics - confirm this is intended.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)
print(X)

## Split the data into test and train

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

## Logistic Regression

In [None]:
# Baseline logistic regression with scikit-learn's default settings
classifier=LogisticRegression()
classifier.fit(X_train,Y_train)

# Print (actual, predicted) pairs for the held-out rows
Y_pred=classifier.predict(X_test)
print([(actual,predicted) for actual,predicted in zip(Y_test,Y_pred)])

In [None]:
# Hold-out evaluation: confusion matrix, per-class report, overall accuracy
cfm=confusion_matrix(Y_test,Y_pred)
acc=accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(classification_report(Y_test,Y_pred))
print("Accuracy of the model: ",acc)

## Tune Logistic model

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Fit a second logistic regression (default settings) and predict the hold-out rows
classifier_tune=LogisticRegression()

classifier_tune.fit(X_train,Y_train)

Y_pred=classifier_tune.predict(X_test)

print(list(zip(Y_test,Y_pred)))
print()
# Pair each feature name with its coefficient from the model fitted in this
# cell (previously the coefficients of the earlier `classifier` were printed).
print(list(zip(training.columns[:-1],classifier_tune.coef_.ravel())))

In [None]:
# Hold-out evaluation: confusion matrix, per-class report, overall accuracy
cfm=confusion_matrix(Y_test,Y_pred)
acc=accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(classification_report(Y_test,Y_pred))
print("Accuracy of the model: ",acc)

In [None]:
# store the predicted probabilities
y_pred_prob = classifier_tune.predict_proba(X_test)
print(y_pred_prob)

In [None]:
# Sweep the decision threshold in steps of 0.01 starting at 0.4 and report,
# for each value, the total misclassifications and both error types.
for threshold in np.arange(0.4,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > threshold, 1, 0)
    cfm=confusion_matrix(Y_test, predict_mine)
    type1_err,type2_err=cfm[0,1],cfm[1,0]
    total_err=type1_err+type2_err
    print("Errors at threshold ", threshold, ":",total_err, " , type 2 error :",
          type2_err," , type 1 error:", type1_err)

## AUC

In [None]:
from sklearn import metrics

# ROC curve from the positive-class probabilities, then the area under it
roc_points = metrics.roc_curve(Y_test, y_pred_prob[:,1])
fpr, tpr, z = roc_points
auc = metrics.auc(fpr,tpr)
print(auc)

In [None]:
# Plot the ROC curve computed above against the diagonal reference line.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
# The AUC value is used as the legend label of the curve
plt.plot(fpr,tpr, 'b', label = auc)
plt.legend(loc = 'lower right')
# Dashed red diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.show()

## SGD Classifier

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Linear classifier trained by SGD on the log loss, with early stopping
# enabled and n_iter_no_change=3.
classifier_SGD = SGDClassifier(loss="log_loss", alpha=0.01, max_iter=1000, shuffle=True,
                               early_stopping=True, n_iter_no_change=3, random_state=10)
classifier_SGD.fit(X_train, Y_train)

# Predictions for the hold-out rows
Y_pred = classifier_SGD.predict(X_test)

# Coefficient of every feature, followed by the intercept
feature_weights = list(zip(training.columns[:-1], classifier_SGD.coef_.ravel()))
print(feature_weights)
print(classifier_SGD.intercept_)

In [None]:
# Hold-out evaluation: confusion matrix, per-class report, overall accuracy
cfm=confusion_matrix(Y_test,Y_pred)
acc=accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(classification_report(Y_test,Y_pred))
print("Accuracy of the model: ",acc)

## Tune SGD Classifier

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Same SGD configuration as the baseline, but with n_iter_no_change raised to 7
classifier_SGD_tune = SGDClassifier(loss="log_loss", alpha=0.01, max_iter=1000, shuffle=True,
                                    early_stopping=True, n_iter_no_change=7, random_state=10)
classifier_SGD_tune.fit(X_train, Y_train)

# Predictions for the hold-out rows
Y_pred = classifier_SGD_tune.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, overall accuracy, per-class report
cfm=confusion_matrix(Y_test,Y_pred)
acc=accuracy_score(Y_test, Y_pred)

print(cfm)
print("Accuracy of the model: ",acc)
print("Classification report: ")
print(classification_report(Y_test,Y_pred))

In [None]:
# store the predicted probabilities
y_pred_prob = classifier_SGD_tune.predict_proba(X_test)
print(y_pred_prob)

In [None]:
# Sweep the decision threshold in steps of 0.01 starting at 0.3 and report,
# for each value, the total misclassifications and both error types.
for threshold in np.arange(0.3,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > threshold, 1, 0)
    cfm=confusion_matrix(Y_test, predict_mine)
    type1_err,type2_err=cfm[0,1],cfm[1,0]
    total_err=type1_err+type2_err
    print("Errors at threshold ", threshold, ":",total_err, " , type 2 error :",
          type2_err," , type 1 error:", type1_err)

In [None]:
# Label a row 1 when its positive-class probability exceeds 0.45, else 0
y_pred_class=[1 if value > 0.45 else 0 for value in y_pred_prob[:,1]]
print(y_pred_class)

In [None]:
# Evaluate the thresholded labels: confusion matrix, accuracy, per-class report
cfm=confusion_matrix(Y_test,y_pred_class)
acc=accuracy_score(Y_test, y_pred_class)

print(cfm)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class))

## KNN

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# K is the square root of the number of training rows, truncated to an integer
neighbour_count=int(np.sqrt(len(X_train)))
model_KNN=KNeighborsClassifier(n_neighbors=neighbour_count,metric='euclidean')

model_KNN.fit(X_train,Y_train)
Y_pred=model_KNN.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

In [None]:
int(np.sqrt(len(X_train)))

## Tune KNN

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# KNN with a fixed K of 23 and Euclidean distance; fit() returns the estimator itself
model_KNN_tune=KNeighborsClassifier(n_neighbors=23,metric='euclidean').fit(X_train,Y_train)
Y_pred=model_KNN_tune.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

In [None]:
# Scan K = 1..59 and record the hold-out accuracy of each candidate in my_dict.
# The candidate estimator gets its own name so the fitted model_KNN_tune
# (the object later written to disk) is not overwritten by the last
# candidate of the scan.
my_dict={}
for K in range(1,60):
    knn_candidate = KNeighborsClassifier(n_neighbors=K,metric="minkowski")
    knn_candidate.fit(X_train, Y_train)
    Y_pred = knn_candidate.predict(X_test)
    # Score once and reuse the value for the log line and the dictionary
    knn_accuracy = accuracy_score(Y_test,Y_pred)
    print ("Accuracy is ", knn_accuracy, "for K-Value:",K)
    my_dict[K]=knn_accuracy

In [None]:
# Print every K that reaches the best recorded accuracy (ties are all shown).
# The maximum is computed once rather than on every iteration; the default
# keeps an empty dictionary from raising.
best_accuracy=max(my_dict.values(),default=None)
for k,k_accuracy in my_dict.items():
    if k_accuracy==best_accuracy:
        print(k,":",k_accuracy)

## Decision Tree

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Decision tree with Gini splits and no size limits set; fit() returns the estimator itself
model_DecisionTree=DecisionTreeClassifier(criterion="gini",splitter="best",random_state=10).fit(X_train,Y_train)
Y_pred=model_DecisionTree.predict(X_test)
print(Y_pred)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

## Tune Decision Tree

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Decision tree with explicit limits on depth, leaf size and leaf count
model_DecisionTree_tune=DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    random_state=10,
    max_depth=10,
    max_leaf_nodes=200,
    min_samples_leaf=15,
    min_samples_split=2,
)
model_DecisionTree_tune.fit(X_train,Y_train)

Y_pred=model_DecisionTree_tune.predict(X_test)
print(Y_pred)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

In [None]:
# store the predicted probabilities
y_pred_prob = model_DecisionTree_tune.predict_proba(X_test)
print(y_pred_prob)

In [None]:
# Sweep the decision threshold in steps of 0.01 starting at 0.3 and report,
# for each value, the total misclassifications and both error types.
for threshold in np.arange(0.3,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > threshold, 1, 0)
    cfm=confusion_matrix(Y_test, predict_mine)
    type1_err,type2_err=cfm[0,1],cfm[1,0]
    total_err=type1_err+type2_err
    print("Errors at threshold ", threshold, ":",total_err, " , type 2 error :",
          type2_err," , type 1 error:", type1_err)

In [None]:
# Label a row 1 when its positive-class probability exceeds 0.49, else 0
y_pred_class=[1 if value > 0.49 else 0 for value in y_pred_prob[:,1]]
print(y_pred_class)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# Evaluate the thresholded labels: confusion matrix, accuracy, per-class report
cfm=confusion_matrix(Y_test,y_pred_class)
acc=accuracy_score(Y_test, y_pred_class)

print(cfm)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class))

## Random Forest

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Random forest of 10 trees; fit() returns the estimator itself
model_RandomForest=RandomForestClassifier(n_estimators=10, random_state=10).fit(X_train,Y_train)
Y_pred=model_RandomForest.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

## Tune Random Forest

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Random forest of 12 trees; fit() returns the estimator itself
model_RandomForest_tune=RandomForestClassifier(n_estimators=12, random_state=10).fit(X_train,Y_train)
Y_pred=model_RandomForest_tune.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

In [None]:
# store the predicted probabilities
y_pred_prob = model_RandomForest_tune.predict_proba(X_test)
print(y_pred_prob)

In [None]:
# Sweep the decision threshold in steps of 0.01 starting at 0.3 and report,
# for each value, the total misclassifications and both error types.
for threshold in np.arange(0.3,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > threshold, 1, 0)
    cfm=confusion_matrix(Y_test, predict_mine)
    type1_err,type2_err=cfm[0,1],cfm[1,0]
    total_err=type1_err+type2_err
    print("Errors at threshold ", threshold, ":",total_err, " , type 2 error :",
          type2_err," , type 1 error:", type1_err)

## SVC

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# RBF-kernel support vector classifier; fit() returns the estimator itself
svc_model=svm.SVC(kernel='rbf',C=10.0,gamma=0.001).fit(X_train, Y_train)

Y_pred=svc_model.predict(X_test)
print(list(Y_pred))

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

## Tune SVC

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# RBF-kernel SVC with gamma=0.00001 and C=200; fit() returns the estimator itself
model_SVC_tune=svm.SVC(kernel="rbf", gamma=0.00001, C=200).fit(X_train,Y_train)
Y_pred=model_SVC_tune.predict(X_test)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

## K-Fold Validation

In [None]:

# 10-fold shuffled cross-validation of the size-limited decision tree on the
# training split, using the estimator's default score.
model_DecisionTree_tune = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=10,
                                                 max_depth=10, max_leaf_nodes=200,
                                                 min_samples_leaf=15, min_samples_split=2)
kfold_cv = KFold(n_splits=10, shuffle=True, random_state=10)
kfold_cv_result = cross_val_score(model_DecisionTree_tune, X_train, Y_train, cv=kfold_cv)

# Per-fold scores followed by their mean
print(kfold_cv_result)
print(kfold_cv_result.mean())

In [None]:

# 15-fold shuffled cross-validation of a logistic regression on the training split.
# The estimator gets a dedicated name: cross_val_score fits clones of the
# estimator it is given, so rebinding `classifier` to a fresh
# LogisticRegression() here would leave that name pointing at an unfitted
# model, which the later pickle step would then save as the logistic model.
classifier_cv = LogisticRegression()

# KFold with shuffling and a fixed seed for reproducible folds
kfold_cv = KFold(n_splits=15, shuffle=True, random_state=10)

# Score each fold with the estimator's default score
kfold_cv_result = cross_val_score(
    estimator=classifier_cv, X=X_train, y=Y_train, cv=kfold_cv
)

# Per-fold scores followed by their mean
print(kfold_cv_result)
print(kfold_cv_result.mean())

## Ensemble Model

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=10)

In [None]:
# Voting ensemble (VotingClassifier with its default voting setting) over a
# logistic regression, the size-limited decision tree and a 50-tree random
# forest. SGD, SVC and KNN members were tried earlier and are left out.
model1 = LogisticRegression()
model3 = DecisionTreeClassifier(criterion="gini", splitter="best", random_state=10,
                                max_depth=10, max_leaf_nodes=200,
                                min_samples_leaf=15, min_samples_split=2)
model6 = RandomForestClassifier(n_estimators=50, random_state=10)

estimators = [('log', model1), ('cart', model3), ('rt', model6)]

# Build and train the ensemble, then predict the hold-out rows
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train,Y_train)
Y_pred=ensemble.predict(X_test)
print(Y_pred)

In [None]:
# Hold-out evaluation: confusion matrix, accuracy, then the per-class report
for metric_fn in (confusion_matrix, accuracy_score, classification_report):
    print(metric_fn(Y_test,Y_pred))

# Applying all operations on the testing data

In [None]:
# Load the test set; the first CSV column becomes the index.
# Forward slashes keep the path valid on Windows, Linux and macOS alike
# (the previous backslash path only resolved on Windows).
test=pd.read_csv("data/test.csv",index_col=0,header=0)
print(test.shape)
test.head()

In [None]:
# Remove the same unused columns that were dropped from the training frame.
test.drop(columns=["Timestamp","Country","self_employed","state","comments"],inplace=True)

In [None]:
# Show the frequency of every value in each test column that is not int64.
for column in test.columns:
    if test[column].dtypes=="int64":
        continue
    print(column,":")
    print(test[column].value_counts())
    print()

In [None]:
# Collapse the free-text spellings of "male" into the single label "Male".
# The replacement is restricted to the Gender column so that short tokens such
# as "M" or "m" can never rewrite a matching value in another column.
test["Gender"]=test["Gender"].replace(
    ["Male","male","M","m","Make","Man","Cis Male","Malr","something kinda male?",
     "Guy (-ish) ^_^","maile","male leaning androgynous","Male (CIS)","Male-ish",
     "Mal","cis male","Mail","msle","Male ","Cis Man",
     "ostensibly male, unsure what that really means"], "Male")

In [None]:
# Collapse the free-text spellings grouped under "female" into the label "Female".
# Only the Gender column is touched, so tokens such as "F" or "f" cannot
# rewrite a matching value in another column.
test["Gender"]=test["Gender"].replace(
    ["Female","female","Trans-female","Cis Female","F","Woman","f","queer/she/they",
     "Femake","woman","Genderqueer","Female  leaning androgynous","cis-female/femme","Trans woman",
     "Female (trans)","queer","Female (cis)","Female ","femail"] ,"Female")

In [None]:
# Mark the remaining Gender answers as missing; they are imputed later.
# Only the Gender column is touched, so generic tokens such as "All" or "p"
# cannot blank out a matching value in another column.
test["Gender"]=test["Gender"].replace(
    ["non-binary","Nah","All","Enby","fluid","Androgyne","Agender","Neuter",
     "A little about you","p"] ,np.nan)

In [None]:
# List the distinct values of each test column that is not int64.
for column in test.columns:
    if test[column].dtypes=="int64":
        continue
    print(column,":")
    print(test[column].unique())
    print()

In [None]:
test.isnull().sum()

In [None]:
# Fill missing values with the most frequent value of the column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection acts on a temporary object under pandas Copy-on-Write
# and would leave the DataFrame unchanged.
test["Gender"]=test["Gender"].fillna(test["Gender"].mode()[0])
test["work_interfere"]=test["work_interfere"].fillna(test["work_interfere"].mode()[0])

In [None]:
test.isnull().sum()

## Pre-processing

In [None]:
# Label-encode the categorical feature columns of the test frame in place
# (same list as for training, minus the 'treatment' target).
colname1=['Gender', 'family_history', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence']

for x in colname1:
    # NOTE(review): the encoder is refitted on the test column here, so the
    # integer codes match the training codes only if the test column contains
    # exactly the same set of labels - verify against the printed mappings.
    test[x]=le.fit_transform(test[x])
    # classes_ holds the original labels; transform() yields their integer codes
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print('Feature', x)
    print('mapping', le_name_mapping)
    print()

## Data Visualization

In [None]:
test.boxplot(column="Age")

## Creating X

In [None]:
X_test_new = test.values[:,:]

In [None]:
X_test_new  = scaler.transform(X_test_new )
print(X_test_new)

In [None]:
# Rebuild the size-limited decision tree, train it on the training split and
# compute class probabilities for the scaled test rows.
tree_params = dict(
    criterion="gini",
    splitter="best",
    random_state=10,
    max_depth=10,
    max_leaf_nodes=200,
    min_samples_leaf=15,
    min_samples_split=2,
)
model_DecisionTree_tune = DecisionTreeClassifier(**tree_params)
model_DecisionTree_tune.fit(X_train, Y_train)

# Probabilities for the unseen test rows
Y_pred_prob = model_DecisionTree_tune.predict_proba(X_test_new)
print(Y_pred_prob)

In [None]:
# Label a row 1 when its positive-class probability exceeds 0.49, else 0
y_pred_class=[1 if value > 0.49 else 0 for value in Y_pred_prob[:,1]]
print(y_pred_class)

## Writing predictions to the submission file

In [None]:
# Load the submission template.
# Forward slashes keep the path valid on Windows, Linux and macOS alike
# (the previous backslash path only resolved on Windows).
df1=pd.read_csv("data/sample.csv",header=0)
df1.head(10)

In [None]:
df1.columns

In [None]:
df1["treatment"]=y_pred_class

In [None]:
df1.head()

In [None]:
# Translate the numeric predictions into the labels used by the submission file
df1['treatment']=df1['treatment'].replace({0: "No", 1: "Yes"})

In [None]:
df1.head()

In [None]:
df1.columns

In [None]:
# Write the filled-in submission back to the file it was read from.
# Forward slashes keep the path valid on Windows, Linux and macOS alike
# (the previous backslash path only resolved on Windows).
df1.to_csv("data/sample.csv",index=False,header=True)

## Dumping Models

In [None]:
## saving model:
# Each file is opened in a `with` block so its handle is flushed and closed
# right after the dump (the previous bare open() calls were never closed).
# Paths use forward slashes so they resolve on any operating system.
models_to_save = [
    ('models/log.pkl', classifier),
    ('models/sgd.pkl', classifier_SGD),
    ('models/sgd_tune.pkl', classifier_SGD_tune),
    ('models/knn.pkl', model_KNN),
    ('models/knn_tune.pkl', model_KNN_tune),
    ('models/svc.pkl', svc_model),
    ('models/svc_tune.pkl', model_SVC_tune),
    ('models/dt.pkl', model_DecisionTree),
    ('models/dt_tune.pkl', model_DecisionTree_tune),
    ('models/rt.pkl', model_RandomForest),
    ('models/rt_tune.pkl', model_RandomForest_tune),
    ('models/ensemble.pkl', ensemble),
]
for model_path, fitted_model in models_to_save:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)