# Mental Health in the Tech Industry Data Gathering

Let's create a class that gathers the data, performs the statistical calculations, and processes the data for presentation and visualization.

In [1]:
import sqlite3
import numpy as np

In [42]:
class DataProcessing:
    ''' Class for gathering the data, statistical calculation and processing data.

        Every query method opens its own short-lived connection to the SQLite
        file ``{path}{db_name}.sqlite`` and closes it again, also when the
        query raises. Rows are returned as plain tuples. '''

    def __init__(self, path, db_name):
        ''' Store the location of the database file.
            path is used as a raw prefix (it is concatenated, not joined), so it
            has to end with a path separator; db_name is the file name without
            the ".sqlite" extension. '''
        self.path = path
        self.db_name = db_name

    # ------------------------------------------------------------------
    # Private database helpers
    # ------------------------------------------------------------------

    def _connect(self):
        ''' Open a new connection to the configured SQLite file. '''
        return sqlite3.connect(f'{self.path}{self.db_name}.sqlite')

    def _fetch_all(self, sql, params=()):
        ''' Run sql with params and return all rows as a list of tuples. '''
        conn = self._connect()
        try:
            c = conn.cursor()
            c.execute(sql, params)
            return c.fetchall()
        finally:
            # Release the connection even if the statement fails.
            conn.close()

    def _fetch_value(self, sql, params=()):
        ''' Run sql with params and return the first column of the first row. '''
        conn = self._connect()
        try:
            c = conn.cursor()
            c.execute(sql, params)
            return c.fetchone()[0]
        finally:
            conn.close()

    @staticmethod
    def _placeholders(values):
        ''' Build the "?, ?, ?" placeholder list matching the length of values. '''
        return ', '.join('?' * len(values))

    # ------------------------------------------------------------------
    # Generic table access
    # ------------------------------------------------------------------

    def get_table(self, table):
        ''' Get table content from the database.
            The table name is interpolated into the SQL text (identifiers cannot
            be bound as parameters), so it must come from a trusted source. '''
        return self._fetch_all(f"SELECT * FROM {table}")

    def get_table_based_value_from_column(self, table, column, value):
        ''' Get all rows of the provided table where column equals value.
            table and column are interpolated into the SQL text and must come
            from a trusted source; value is bound as a parameter. '''
        return self._fetch_all(f"SELECT * FROM {table} WHERE {column} = ?", (value,))

    # ------------------------------------------------------------------
    # Answers and counts for a single question
    # ------------------------------------------------------------------

    def get_all_answers_per_q(self, q_id):
        ''' Get all unique answers from Answer table for particular question
            represented by QuestionID number from the Question table.
            Returns a set of 1-element tuples (AnswerText,). '''
        return set(self._fetch_all(
            "SELECT AnswerText FROM Answer WHERE QuestionID = ?", (q_id,)))

    def get_users_no_per_q(self, q_id):
        ''' Get number of all answers for the question. '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ?", (q_id,))

    def get_users_no_for_q_and_answer(self, q_id, answer):
        ''' Get number of users from Answer table for particular question number (QuestionID) and answer (AnswerText). '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and AnswerText = ?",
            (q_id, answer))

    def get_users_no_per_answer(self, q_id):
        ''' Get frequency of the answers for particular question number (QuestionID) in Answer table.
            Returns a list of (AnswerText, count) tuples. '''
        return self._fetch_all(
            "SELECT AnswerText, count(UserID) FROM Answer WHERE QuestionID = ? GROUP BY AnswerText",
            (q_id,))

    def get_answers_for_q_less_occ(self, q_id, qty):
        ''' Get answers for provided question q_id where occurrence is less than qty.
            q_id and qty are converted with int(), so numeric strings are accepted.
            Returns a list of (AnswerText, count) tuples. '''
        q_id, qty = int(q_id), int(qty)
        return self._fetch_all(
            'SELECT AnswerText, count(UserID) as UNo FROM Answer WHERE QuestionID = ? '
            'GROUP BY AnswerText HAVING UNo < ?',
            (q_id, qty))

    def get_answers_for_q_greater_occ(self, q_id, qty):
        ''' Get answers for provided question q_id where occurrence is greater than qty.
            q_id and qty are converted with int(), so numeric strings are accepted.
            Returns a list of (AnswerText, count) tuples. '''
        q_id, qty = int(q_id), int(qty)
        return self._fetch_all(
            'SELECT AnswerText, count(UserID) as UNo FROM Answer WHERE QuestionID = ? '
            'GROUP BY AnswerText HAVING UNo > ?',
            (q_id, qty))

    # ------------------------------------------------------------------
    # Per-year (SurveyID) variants
    # ------------------------------------------------------------------

    def get_answers_for_q_in_year(self, q_id, year):
        ''' Get all unique answers from Answer table for particular question (QuestionID)
            in given year (SurveyID). Returns a set of 1-element tuples. '''
        return set(self._fetch_all(
            "SELECT AnswerText FROM Answer WHERE QuestionID = ? and SurveyID = ?",
            (q_id, year)))

    def get_users_no_for_q_and_answer_in_year(self, q_id, answer, year):
        ''' Get number of users from Answer table for particular question number (QuestionID) and answer (AnswerText)
            in given year (SurveyID). '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and AnswerText = ? and SurveyID = ?",
            (q_id, answer, year))

    def get_users_no_per_q_in_year(self, q_id, year):
        ''' Get number of all answers for the question in given year (SurveyID). '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and SurveyID = ?",
            (q_id, year))

    # ------------------------------------------------------------------
    # Answers for one or many questions
    # ------------------------------------------------------------------

    def get_answers_for_question(self, q_id):
        ''' Get answers content for given question (duplicates are kept). '''
        return self._fetch_all(
            "SELECT AnswerText FROM Answer WHERE QuestionID = ?", (q_id,))

    def get_answers_for_questions(self, q_ids):
        ''' Get answers content for given questions.
            q_ids is an iterable of QuestionID values; an empty one yields []. '''
        q_ids = tuple(q_ids)
        # One bound placeholder per id inside IN (...); "= ?, ?" is not valid SQL.
        return self._fetch_all(
            f"SELECT AnswerText FROM Answer WHERE QuestionID IN ({self._placeholders(q_ids)})",
            q_ids)

    def get_table_for_questions(self, q_ids):
        ''' Get full Answer rows for given questions.
            q_ids is an iterable of QuestionID values; an empty one yields []. '''
        q_ids = tuple(q_ids)
        return self._fetch_all(
            f"SELECT * FROM Answer WHERE QuestionID IN ({self._placeholders(q_ids)})",
            q_ids)

    # ------------------------------------------------------------------
    # Comparing two questions
    # ------------------------------------------------------------------

    def get_different_answers_for_2q(self, q_id1, q_id2):
        ''' Get different answers on two questions for the same user.
            Returns (AnswerText, UserID) rows of q_id1 whose text differs from
            the same user's answer to q_id2. Uses a correlated subquery, see
            get_different_answers_for_2q_v2 for the in-memory variant. '''
        return self._fetch_all(
            """SELECT AnswerText, UserID as uid FROM Answer 
        WHERE QuestionID = ? and AnswerText != (SELECT AnswerText FROM Answer WHERE UserID = uid and QuestionID = ?)""",
            (q_id1, q_id2))

    def get_different_answers_for_2q_v2(self, q_id1, q_id2):
        ''' Get different answers on two questions for the same user.
            Much faster than get_different_answers_for_2q.
            Returns a list of (answer to q_id1, answer to q_id2, UserID) tuples;
            the order follows set iteration and is therefore not defined. '''
        tab_q1 = self.get_answers_ordered_by_uid(q_id1)
        tab_q2 = self.get_answers_ordered_by_uid(q_id2)
        diff_table = self._get_diff_table(tab_q1, tab_q2)
        return self._get_combined_tab_uid_based(diff_table, tab_q1)

    def get_answers_ordered_by_uid(self, q_id):
        ''' Get the (AnswerText, UserID) rows ordered by UserID for given question. '''
        return self._fetch_all(
            "SELECT AnswerText, UserID FROM Answer WHERE QuestionID = ? Order by UserID",
            (q_id,))

    @staticmethod
    def _get_diff_table(tab1, tab2):
        ''' Returns the set of items that are in tab2 but not in tab1. '''
        return set(tab2).difference(tab1)

    @staticmethod
    def _get_combined_tab_uid_based(tab1, tab2):
        ''' Returns a list which combine first values from lists of tuples tab1 and tab2
            based on the second values.
            tab1 and tab2 need to have values of tuples of length 2.
            Each result is (first value from tab2, first value from tab1, shared second value). '''
        # Index tab2 by its second value once instead of rescanning it for every
        # tab1 row; the per-key lists keep the original tab2 order.
        firsts_by_key = {}
        for first, key in tab2:
            firsts_by_key.setdefault(key, []).append(first)
        result_list = []
        for wc, wuid in tab1:
            for lc in firsts_by_key.get(wuid, ()):
                result_list.append((lc, wc, wuid))
        return result_list

    def get_users_no_per_two_q(self, q_id1, q_id2):
        ''' Get number of users who answered for provided two questions. '''
        return self._fetch_value(
            """SELECT count(UserID), UserID as uid 
        FROM Answer WHERE QuestionID = ? and 
        EXISTS(SELECT AnswerText FROM Answer WHERE UserID = uid and QuestionID = ?)""",
            (q_id1, q_id2))

    # ------------------------------------------------------------------
    # Answers filtered by how users answered another question
    # ------------------------------------------------------------------

    def get_all_answers_based_on_answer_and_q(self, q_id, answer):
        ''' Get all answers for all users which answered on given question (q_id) in particular way (answer).
            Rows are ordered by UserID. '''
        return self._fetch_all(
            """SELECT * FROM Answer 
                  WHERE UserID in (SELECT UserID FROM Answer WHERE AnswerText = ? and QuestionID = ?) 
                  Order by UserID""",
            (answer, q_id))

    def get_some_answers_based_on_answer_and_q(self, q_ids, q_id, answer):
        ''' Get answers to the selected questions (q_ids) for all users which answered
            on given question (q_id) in particular way (answer). Rows are ordered by UserID. '''
        q_ids = tuple(q_ids)
        return self._fetch_all(
            f"""SELECT * FROM Answer 
                  WHERE QuestionID in ({self._placeholders(q_ids)}) 
                  and UserID in (SELECT UserID FROM Answer WHERE AnswerText = ? and QuestionID = ?) 
                  Order by UserID""",
            (*q_ids, answer, q_id))

    # ------------------------------------------------------------------
    # Statistics
    # ------------------------------------------------------------------

    def get_answers_distribution_for_q(self, q_id):
        ''' Get users percentage per answer for given question.
            Returns a list of (AnswerText, percentage rounded to 2 places). '''
        answers = self.get_users_no_per_answer(q_id)
        return self._change_total_to_pct(answers)

    def _change_total_to_pct(self, tab):
        ''' Change input table of answers with total occurrence to the percentage representation.
            Only table with (answer, count) is supported. '''
        sum_of_answers = sum(no for _, no in tab)
        return [(answer, self._round_2(100*value/sum_of_answers)) for answer, value in tab]

    @staticmethod
    def _round_2(x):
        ''' Round x to two decimal places. '''
        return round(x, 2)

    def get_avg_for_quantitative_q(self, q_id):
        ''' Get average for given quantitative question (q_id), rounded to 2 places.
            Returns None when the question has no answers to average. '''
        avg = self._fetch_value(
            'SELECT avg(AnswerText) FROM Answer WHERE QuestionID = ?', (q_id,))
        if avg is None:
            # avg() over zero rows yields NULL; round(None) would raise TypeError.
            return None
        return self._round_2(avg)

    def get_hist_for_quantitative_q(self, q_id):
        ''' Get the histogram (counts, bin edges) of the answers for given quantitative question. '''
        answers = self.get_answers_for_question(q_id)
        return self._get_hist(answers)

    @staticmethod
    def _get_hist(answers):
        ''' Convert the first item of every row to int and return np.histogram of the values. '''
        data = [int(answer[0]) for answer in answers]
        return np.histogram(data)
        

In [43]:
# Helper instance whose queries read the SQLite file 'DB/mental_health.sqlite'.
test = DataProcessing('DB/', 'mental_health')

In [76]:
test.get_users_no_for_q_and_answer(2, 'Male')

3044

In [77]:
test.get_users_no_per_answer(2)

[('Female', 1024),
 ('Male', 3044),
 ('Non-binary', 13),
 ('Other', 107),
 ('n/a', 24),
 ('non-binary', 6)]

In [78]:
test.get_users_no_per_q(2)

4218

In [79]:
test.get_table_based_value_from_column('Answer', 'AnswerText', 'Idaho')

[('Idaho', 2014, 420, 4),
 ('Idaho', 2016, 1549, 4),
 ('Idaho', 2016, 2364, 4),
 ('Idaho', 2016, 2663, 4),
 ('Idaho', 2016, 2364, 51),
 ('Idaho', 2016, 2663, 51),
 ('Idaho', 2017, 2838, 4),
 ('Idaho', 2017, 3419, 4),
 ('Idaho', 2017, 2838, 51),
 ('Idaho', 2017, 3419, 51),
 ('Idaho', 2018, 3612, 4),
 ('Idaho', 2018, 3612, 51),
 ('Idaho', 2019, 4197, 4),
 ('Idaho', 2019, 4197, 51)]

In [80]:
test.get_answers_for_q_less_occ(2, 10)

[('non-binary', 6)]

In [81]:
test.get_answers_for_q_greater_occ(2, '10')

[('Female', 1024),
 ('Male', 3044),
 ('Non-binary', 13),
 ('Other', 107),
 ('n/a', 24)]

In [82]:
%%time
test.get_different_answers_for_2q(3, 50)

Wall time: 41 s


[('United States of America', 1511),
 ('Spain', 1566),
 ('Canada', 1569),
 ('United Kingdom', 1595),
 ('United States of America', 1647),
 ('Canada', 1688),
 ('Canada', 1702),
 ('Netherlands', 1729),
 ('Lithuania', 1777),
 ('Algeria', 1858),
 ('Pakistan', 1983),
 ('Australia', 2045),
 ('United Kingdom', 2058),
 ('Other', 2081),
 ('Romania', 2095),
 ('Japan', 2152),
 ('France', 2179),
 ('Canada', 2288),
 ('France', 2335),
 ('Germany', 2359),
 ('Italy', 2440),
 ('Italy', 2449),
 ('Taiwan', 2626),
 ('Afghanistan', 2668),
 ('Canada', 2683),
 ('United States of America', 2690),
 ('Czech Republic', 2772),
 ('Belgium', 2789),
 ('Germany', 2880),
 ('Brazil', 2883),
 ('Hungary', 2884),
 ('Greece', 2912),
 ('Ireland', 2928),
 ('Switzerland', 2954),
 ('India', 3031),
 ('Argentina', 3074),
 ('Belarus', 3120),
 ('Netherlands', 3134),
 ('France', 3140),
 ('-1', 3447),
 ('-1', 3449),
 ('Poland', 3547),
 ('New Zealand', 3683),
 ('Belgium', 3705),
 ('Germany', 3736),
 ('France', 3760),
 ('Japan', 3807)

In [83]:
%%time
test.get_different_answers_for_2q_v2(3, 50)

Wall time: 89 ms


[('Netherlands', 'United Kingdom', 1729),
 ('Canada', 'United States of America', 1702),
 ('Poland', 'United States of America', 3547),
 ('United States of America', 'Mexico', 3847),
 ('Romania', 'United States of America', 2095),
 ('Japan', 'Canada', 2152),
 ('Greece', 'Netherlands', 2912),
 ('India', 'United States of America', 3031),
 ('-1', 'n/a', 3449),
 ('France', 'United Kingdom', 2179),
 ('Spain', 'United States of America', 1566),
 ('Germany', 'United Kingdom', 2359),
 ('Italy', 'United States of America', 2440),
 ('Other', 'Germany', 2081),
 ('United Kingdom', 'Other', 3911),
 ('Canada', 'United States of America', 2683),
 ('Belarus', 'United States of America', 3120),
 ('Germany', 'Eritrea', 2880),
 ('Switzerland', 'Swaziland', 2954),
 ('Lithuania', 'United Kingdom', 1777),
 ('France', 'Switzerland', 3760),
 ('Belgium', 'Portugal', 2789),
 ('Taiwan', 'United States of America', 2626),
 ('Hungary', 'Austria', 2884),
 ('Algeria', 'United States of America', 1858),
 ('Netherlan

In [20]:
test.get_all_answers_based_on_answer_and_q(2, 'Female')

[('37', 2014, 1, 1),
 ('Female', 2014, 1, 2),
 ('United States of America', 2014, 1, 3),
 ('Illinois', 2014, 1, 4),
 ('n/a', 2014, 1, 5),
 ('No', 2014, 1, 6),
 ('1', 2014, 1, 7),
 ('6-25', 2014, 1, 8),
 ('1', 2014, 1, 9),
 ('Yes', 2014, 1, 10),
 ('Yes', 2014, 1, 11),
 ('No', 2014, 1, 12),
 ('No', 2014, 1, 90),
 ('Yes', 2014, 1, 91),
 ('Often', 2014, 1, 92),
 ('No', 2014, 1, 93),
 ('Not sure', 2014, 1, 94),
 ('No', 2014, 1, 95),
 ('Yes', 2014, 1, 96),
 ('Somewhat easy', 2014, 1, 97),
 ('No', 2014, 1, 98),
 ('Some of them', 2014, 1, 99),
 ('Yes', 2014, 1, 100),
 ('Maybe', 2014, 1, 101),
 ('No', 2014, 1, 102),
 ('-1', 2014, 1, 103),
 ('35', 2014, 7, 1),
 ('Female', 2014, 7, 2),
 ('United States of America', 2014, 7, 3),
 ('Michigan', 2014, 7, 4),
 ('n/a', 2014, 7, 5),
 ('Yes', 2014, 7, 6),
 ('1', 2014, 7, 7),
 ('1-5', 2014, 7, 8),
 ('1', 2014, 7, 9),
 ('No', 2014, 7, 10),
 ('No', 2014, 7, 11),
 ('No', 2014, 7, 12),
 ('Maybe', 2014, 7, 90),
 ("Don't know", 2014, 7, 91),
 ('Sometimes', 2014

In [32]:
test.get_some_answers_based_on_answer_and_q([2, 3, 4, 5], 2, 'Female')

[('Female', 2014, 1, 2),
 ('United States of America', 2014, 1, 3),
 ('Illinois', 2014, 1, 4),
 ('n/a', 2014, 1, 5),
 ('Female', 2014, 7, 2),
 ('United States of America', 2014, 7, 3),
 ('Michigan', 2014, 7, 4),
 ('n/a', 2014, 7, 5),
 ('Female', 2014, 9, 2),
 ('United States of America', 2014, 9, 3),
 ('Illinois', 2014, 9, 4),
 ('n/a', 2014, 9, 5),
 ('Female', 2014, 13, 2),
 ('United States of America', 2014, 13, 3),
 ('California', 2014, 13, 4),
 ('n/a', 2014, 13, 5),
 ('Female', 2014, 16, 2),
 ('United States of America', 2014, 16, 3),
 ('Illinois', 2014, 16, 4),
 ('n/a', 2014, 16, 5),
 ('Female', 2014, 28, 2),
 ('United States of America', 2014, 28, 3),
 ('California', 2014, 28, 4),
 ('0', 2014, 28, 5),
 ('Female', 2014, 35, 2),
 ('United States of America', 2014, 35, 3),
 ('Wisconsin', 2014, 35, 4),
 ('0', 2014, 35, 5),
 ('Female', 2014, 46, 2),
 ('United States of America', 2014, 46, 3),
 ('Texas', 2014, 46, 4),
 ('0', 2014, 46, 5),
 ('Female', 2014, 62, 2),
 ('Poland', 2014, 62, 

In [73]:
test.get_answers_distribution_for_q(7)

[('0', 42.82), ('1', 57.18)]

In [4]:
test.get_answers_distribution_for_q(8)

[('1-5', 6.02),
 ('100-500', 18.68),
 ('26-100', 19.54),
 ('500-1000', 5.86),
 ('6-25', 16.33),
 ('More than 1000', 21.62),
 ('n/a', 11.95)]

In [5]:
test.get_answers_distribution_for_q(54)

[('No', 0.88),
 ('Not applicable to me', 1.72),
 ('Unsure', 2.03),
 ('Yes', 12.41),
 ('n/a', 82.96)]

In [14]:
test.get_avg_for_quantitative_q(1)

33.88

In [44]:
test.get_hist_for_quantitative_q(1)

(array([ 735, 1749, 1139,  415,  125,   34,    5,    0,    0,    1],
       dtype=int64),
 array([18. , 26.1, 34.2, 42.3, 50.4, 58.5, 66.6, 74.7, 82.8, 90.9, 99. ]))

In [57]:
# For each selected question print every answer with its percentage share
# and its absolute number of occurrences, followed by an empty line.
for q_id in (6, 7, 9, 28, 30, 32, 33, 54, 78):
    print(f'Question {q_id}')
    shares = test.get_answers_distribution_for_q(q_id)
    totals = test.get_users_no_per_answer(q_id)
    # Both lists come from the same grouped query, so they are paired row by row.
    for (label, pct), (_, total) in zip(shares, totals):
        print(f'{label} {pct}% sum({total})')
    print()

Question 6
I don't know 15.39% sum(649)
No 40.33% sum(1701)
Yes 44.29% sum(1868)

Question 7
0 42.82% sum(1806)
1 57.18% sum(2412)

Question 9
0 19.58% sum(826)
1 68.47% sum(2888)
n/a 11.95% sum(504)

Question 28
I don't know 6.42% sum(190)
No, at none of my previous employers 14.06% sum(416)
No, none of my previous supervisors 16.4% sum(485)
Some of my previous employers 22.11% sum(654)
Some of my previous supervisors 22.11% sum(654)
Yes, all of my previous supervisors 3.31% sum(98)
Yes, at all of my previous employers 3.14% sum(93)
n/a 12.44% sum(368)

Question 30
Neutral 11.12% sum(329)
Not applicable to me (I do not have a mental illness) 3.79% sum(112)
Not open at all 11.22% sum(332)
Somewhat not open 9.8% sum(290)
Somewhat open 26.74% sum(791)
Very open 37.32% sum(1104)

Question 32
Don't Know 3.68% sum(109)
Maybe 8.32% sum(246)
No 30.29% sum(896)
Possibly 9.3% sum(275)
Yes 47.9% sum(1417)
n/a 0.51% sum(15)

Question 33
Don't Know 4.19% sum(124)
Maybe 11.05% sum(327)
No 32.76% su