# Data download, preprocessing (Cleaning & Encoding)



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used for file I/O in this notebook.
spark = (
    SparkSession.builder
    .appName("Large Dataset Handling")
    .getOrCreate()
)

# Read the survey CSV: first row is the header, column types are inferred.
csv_file_path = "survey.csv"
train_df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show the inferred schema and the first five rows.
train_df.printSchema()
train_df.show(5)

# Collect the Spark DataFrame into a pandas DataFrame.
train_pd_df = train_df.toPandas()


# The comments, state and Timestamp columns are not needed, so remove them.
unused_columns = ['comments', 'state', 'Timestamp']
train_pd_df = train_pd_df.drop(columns=unused_columns)


train_pd_df.head(5)


# Placeholder used to fill missing entries, one per data type.
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Column names grouped by the data type they are filled as.
intFeatures = ['Age']
stringFeatures = [
    'Gender', 'Country', 'self_employed', 'family_history', 'treatment',
    'work_interfere', 'no_employees', 'remote_work', 'tech_company',
    'anonymity', 'leave', 'mental_health_consequence',
    'phys_health_consequence', 'coworkers', 'supervisor',
    'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options',
    'wellness_program', 'seek_help',
]
floatFeatures = []

# Fill the missing values of every column with the placeholder of its group.
for feature in train_pd_df.columns:
    if feature in intFeatures:
        fill_value = defaultInt
    elif feature in stringFeatures:
        fill_value = defaultString
    elif feature in floatFeatures:
        fill_value = defaultFloat
    else:
        # Column is in none of the lists: report it and leave it untouched.
        print('Error: Feature %s not recognized.' % feature)
        continue
    train_pd_df[feature] = train_pd_df[feature].fillna(fill_value)
train_pd_df.head(5)

# Clean 'Gender': collapse the free-text answers into 'male', 'female' or 'trans'.

# Unique raw answers, kept for inspection.
gender = train_pd_df['Gender'].unique()

# Known spellings of each group. Answers are lower-cased before the lookup,
# so an entry containing capitals (e.g. "Cis Male") can never match by itself.
male_str = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "Cis Male", "cis male"]
trans_str = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", "female (trans)", "queer", "ostensibly male, unsure what that really means"]
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]

# One lookup from lower-cased spelling to group label. setdefault keeps the
# first group that lists a spelling (male, then female, then trans).
gender_lookup = {}
for group_label, spellings in (('male', male_str), ('female', female_str), ('trans', trans_str)):
    for spelling in spellings:
        gender_lookup.setdefault(spelling, group_label)

# Map the whole column in one pass instead of calling an in-place replace on a
# column selection for every row; that form is quadratic and does not update
# the frame when pandas Copy-on-Write is active. Answers without a known
# spelling keep their original text.
raw_gender = train_pd_df['Gender']
train_pd_df['Gender'] = raw_gender.str.lower().map(gender_lookup).fillna(raw_gender)

# Remove the rows whose answer is none of male, female, trans.
# The explicit copy keeps later column assignments from warning about writing
# to a slice of the unfiltered frame.
stk_list = ['A little about you', 'p']
train_pd_df = train_pd_df[~train_pd_df['Gender'].isin(stk_list)].copy()

print(train_pd_df['Gender'].unique())

# Replace missing or implausible ages with the median age.
# An age is valid when it lies in 18..120 (inclusive); missing values count as
# invalid. The median is taken over the valid ages only, so the outliers being
# replaced cannot influence the replacement value.
valid_age = train_pd_df['Age'].between(18, 120)
median_age = train_pd_df.loc[valid_age, 'Age'].median()
# where() builds a new column instead of writing through a column selection.
train_pd_df['Age'] = train_pd_df['Age'].where(valid_age, median_age)

# Ranges of Age
# NOTE(review): ages above 100 pass the validity check above but lie outside
# these bins, so pd.cut leaves age_range missing for them.
train_pd_df['age_range'] = pd.cut(train_pd_df['Age'], [0,20,30,65,100], labels=["0-20", "21-30", "31-65", "66-100"], include_lowest=True)

# self_employed: entries equal to the defaultString placeholder become 'No'.
train_pd_df['self_employed'] = train_pd_df['self_employed'].replace(defaultString, 'No')
print(train_pd_df['self_employed'].unique())

# work_interfere: entries equal to the defaultString placeholder become "Don't know".
train_pd_df['work_interfere'] = train_pd_df['work_interfere'].replace(defaultString, "Don't know")
print(train_pd_df['work_interfere'].unique())

# Encoding data
# Every non-numeric column is replaced by integer codes. labelDict maps
# 'label_<column>' to the list of original class names, ordered so that the
# list index equals the encoded integer.
# Numeric columns (Age) are skipped: label-encoding them would replace the
# real values by their rank among the unique values, which distorts the Age
# plots and the later scaling.

labelDict = {}
for feature in train_pd_df:
    if pd.api.types.is_numeric_dtype(train_pd_df[feature]):
        continue
    le = preprocessing.LabelEncoder()
    train_pd_df[feature] = le.fit_transform(train_pd_df[feature])
    labelDict['label_' + feature] = list(le.classes_)

for key, value in labelDict.items():
    print(key, value)

# Drop the encoded Country column; its class names remain in labelDict['label_Country'].

train_pd_df = train_pd_df.drop(['Country'], axis= 1)
train_pd_df.head()

# Per column: number of missing values and the fraction of rows they represent.

null_mask = train_pd_df.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print(missing_data)

# EDA and data visualization

In [None]:
# Save the cleaned, encoded data as CSV through Spark, replacing earlier output.
# header=True writes the column names; Spark's CSV writer omits the header row
# by default, which would leave the exported file without column names.
spark_df = spark.createDataFrame(train_pd_df)
clean_data_file_path = "cleaned_data.csv"
spark_df.write.csv(clean_data_file_path, mode='overwrite', header=True)

# Correlation matrix of all columns, drawn as a heatmap.
corrmat = train_pd_df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
plt.show()


# Second heatmap: only the k columns with the largest correlation to 'treatment'.
k = 10
cols = corrmat.nlargest(k, 'treatment')['treatment'].index
cm = np.corrcoef(train_pd_df[cols].values.T)
sns.set(font_scale=1.25)
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
hm = sns.heatmap(cm, **heatmap_options)
plt.show()


In [None]:
# Distribution and density of the Age column.
plt.figure(figsize=(12,8))
age_axes = sns.distplot(train_pd_df["Age"], bins=24)
age_axes.set_title("Distribuition and density by Age")
age_axes.set_xlabel("Age")

In [None]:
# Age distribution drawn in a separate panel for each value of 'treatment'.

g = sns.FacetGrid(train_pd_df, col='treatment', height=5).map(sns.distplot, "Age")

In [None]:
# Count how many respondents fall into each treatment class.

plt.figure(figsize=(12,8))
# The x axis holds the encoded 'treatment' values, so the ticks are labelled
# with the treatment class names (the gender names used before were wrong).
new_labels = labelDict['label_treatment']
g = sns.countplot(x="treatment", data=train_pd_df)
g.set_xticklabels(new_labels)

plt.title('Total Distribuition by treated or not')

In [None]:
# Bar plots of the mean of 'treatment' per category, split by gender.


def _plot_treatment_rate_by_gender(data, label_dict, feature, xlabel):
    """Plot the mean of 'treatment' for each value of *feature*, split by Gender.

    data: DataFrame holding the encoded columns *feature*, 'treatment' and 'Gender'.
    label_dict: maps 'label_<column>' to the original class names of that column.
    feature: name of the column shown on the x axis.
    xlabel: text used as the x axis label.
    """
    ax = sns.barplot(x=feature, y="treatment", hue="Gender", data=data, ci=None)
    # Show the original class names instead of the encoded integers.
    ax.set_xticklabels(label_dict['label_' + feature])
    plt.title('Probability of mental health condition')
    plt.ylabel('Probability x 100')
    plt.xlabel(xlabel)

    # Replace the encoded gender values in the legend with their class names.
    for text, label in zip(ax.get_legend().texts, label_dict['label_Gender']):
        text.set_text(label)

    plt.show()


# Age range vs gender
_plot_treatment_rate_by_gender(train_pd_df, labelDict, 'age_range', 'Age')

# Family history vs gender
_plot_treatment_rate_by_gender(train_pd_df, labelDict, 'family_history', 'Family History')

# Care options vs gender
_plot_treatment_rate_by_gender(train_pd_df, labelDict, 'care_options', 'Care options')

# Benefits vs gender. This plot used to draw the care_options column again
# under the benefits tick labels; it now plots the benefits column.
_plot_treatment_rate_by_gender(train_pd_df, labelDict, 'benefits', 'Benefits')

# Work interference vs gender
_plot_treatment_rate_by_gender(train_pd_df, labelDict, 'work_interfere', 'Work interfere')


In [None]:
# Scaling Age
# Rescale the Age column with a MinMaxScaler fitted on all rows of train_pd_df.
# NOTE(review): the scaler is fitted before the train/test split further down,
# so test rows contribute to the min/max — confirm this is acceptable.

scaler = MinMaxScaler()
train_pd_df['Age'] = scaler.fit_transform(train_pd_df[['Age']])
train_pd_df.head()

In [None]:
# Save the processed data as Parquet through Spark, replacing earlier output.
parquet_file_path = "processed_data.parquet"
spark_df = spark.createDataFrame(train_pd_df)
spark_df.write.mode('overwrite').parquet(parquet_file_path)




In [None]:
# Reload the processed Parquet data and convert it to a pandas DataFrame.
# Note: this rebinds train_df, which earlier held the Spark DataFrame read from the CSV.
train_df = spark.read.parquet("processed_data.parquet").toPandas()

# AutoML with TPOT

In [None]:
# Splitting the dataset

# Feature columns (X) and target column (y).
feature_cols = ['Age', 'Gender', 'family_history', 'benefits', 'care_options', 'anonymity', 'leave', 'work_interfere']
X = train_df[feature_cols]
y = train_df.treatment

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Create dictionaries for final graph
# Use: methodDict['Stacking'] = accuracy_score
methodDict = {}
# Was `()`: an empty tuple, which cannot store per-method entries.
rmseDict = {}

In [None]:
# Computing feature importance

# Fit an extra-trees ensemble on all rows to rank the selected features.
forest = ExtraTreesClassifier(n_estimators=250,random_state=0)

forest.fit(X, y)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (error bars).
std = np.std([tree.feature_importances_ for tree in forest.estimators_],axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# The bars are drawn in descending-importance order, so the tick labels must
# follow the same order; labelling in feature_cols order names the wrong bars.
labels = [feature_cols[i] for i in indices]

# Plot the feature importances of the forest
plt.figure(figsize=(12,8))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), labels, rotation='vertical')
plt.xlim([-1, X.shape[1]])
plt.show()


In [None]:
# Install TPOT into the notebook environment.
# NOTE(review): a bare `pip install` line is not Python syntax; presumably this
# relies on IPython automagic — `%pip install tpot` is the explicit form. Confirm.
pip install TPOT

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Configure the TPOT search: 5 generations, population of 50, fixed seed.
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)

# Run the search on the training split.
tpot.fit(X_train, y_train)

# Export the fitted result to a Python file.
tpot.export('tpot_best_model.py')

# Predict on the held-out split, then print accuracy and the per-class report.
y_pred = tpot.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))