# Mental Health in the Tech Industry Data Gathering

Let's create a class that gathers the data, performs statistical calculations, and processes the data for presentation and visualization.

In [1]:
import sqlite3

In [70]:
class DataProcessing:
    ''' Class for gathering the data, statistical calculation and processing data.

        Every public method opens its own short-lived connection to the SQLite
        file ``{path}{db_name}.sqlite``, runs one read-only query and closes the
        connection again, also when the query raises. '''

    def __init__(self, path, db_name):
        ''' Remember where the database lives.

            path    -- prefix placed directly in front of the file name (no
                       separator is added, so pass e.g. 'DB/').
            db_name -- database file name without the '.sqlite' extension. '''
        self.path = path
        self.db_name = db_name

    def _fetch_all(self, sql, params=()):
        ''' Run a single SELECT and return all rows as a list of tuples. '''
        conn = sqlite3.connect(f'{self.path}{self.db_name}.sqlite')
        try:
            c = conn.cursor()
            c.execute(sql, params)
            return c.fetchall()
        finally:
            # Always release the file handle, even if the query failed.
            conn.close()

    def _fetch_value(self, sql, params=()):
        ''' Run an aggregate SELECT and return the first column of its first row. '''
        return self._fetch_all(sql, params)[0][0]

    def get_table(self, table):
        ''' Get table content from the database.

            NOTE: ``table`` is interpolated into the SQL text (identifiers cannot
            be bound as parameters), so pass only trusted table names. '''
        return self._fetch_all(f"SELECT * FROM {table}")

    def get_all_answers_per_q(self, q_id):
        ''' Get all distinct answers from Answer table for particular question
            represented by QuestionID number from the Question table.

            Returns a set of 1-tuples, e.g. {('Male',), ('Female',)}. '''
        return set(self._fetch_all(
            "SELECT AnswerText FROM Answer WHERE QuestionID = ?", (q_id,)))

    def get_users_no_per_q(self, q_id, year=None):
        ''' Get number of all answers for the question.

            When ``year`` is given, only answers from that survey year
            (SurveyID) are counted; otherwise all years are counted. '''
        if year is None:
            return self._fetch_value(
                "SELECT count(UserID) FROM Answer WHERE QuestionID = ?", (q_id,))
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and SurveyID = ?",
            (q_id, year))

    def get_users_no_for_q_and_answer(self, q_id, answer):
        ''' Get number of users from Answer table for particular question number
            (QuestionID) and answer (AnswerText). '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and AnswerText = ?",
            (q_id, answer))

    def get_users_no_per_answer(self, q_id):
        ''' Get frequency of the answers for particular question number
            (QuestionID) in Answer table.

            Returns a list of (AnswerText, count) tuples. '''
        return self._fetch_all(
            "SELECT AnswerText, count(UserID) FROM Answer WHERE QuestionID = ? GROUP BY AnswerText",
            (q_id,))

    def get_table_based_value_from_column(self, table, column, value):
        ''' Get all rows of the provided table whose ``column`` equals ``value``.

            NOTE: ``table`` and ``column`` are interpolated into the SQL text, so
            pass only trusted identifiers; ``value`` is bound as a parameter. '''
        return self._fetch_all(
            f"SELECT * FROM {table} WHERE {column} = ?", (value,))

    def get_answers_for_q_less_occ(self, q_id, qty):
        ''' Get answers for provided question q_id where occurrence is less than qty.

            Both arguments are converted with int(), so numeric strings are
            accepted. Returns a list of (AnswerText, count) tuples. '''
        q_id, qty = int(q_id), int(qty)
        return self._fetch_all(
            'SELECT AnswerText, count(UserID) as UNo FROM Answer WHERE QuestionID = ? GROUP BY AnswerText HAVING UNo < ?',
            (q_id, qty))

    def get_answers_for_q_greater_occ(self, q_id, qty):
        ''' Get answers for provided question q_id where occurrence is greater than qty.

            Both arguments are converted with int(), so numeric strings are
            accepted. Returns a list of (AnswerText, count) tuples. '''
        q_id, qty = int(q_id), int(qty)
        return self._fetch_all(
            'SELECT AnswerText, count(UserID) as UNo FROM Answer WHERE QuestionID = ? GROUP BY AnswerText HAVING UNo > ?',
            (q_id, qty))

    def get_answers_for_q_in_year(self, q_id, year):
        ''' Get all distinct answers from Answer table for particular question
            (QuestionID) in given year (SurveyID).

            Returns a set of 1-tuples. '''
        return set(self._fetch_all(
            "SELECT AnswerText FROM Answer WHERE QuestionID = ? and SurveyID = ?",
            (q_id, year)))

    def get_users_no_for_q_and_answer_in_year(self, q_id, answer, year):
        ''' Get number of users from Answer table for particular question number
            (QuestionID) and answer (AnswerText) in given year (SurveyID). '''
        return self._fetch_value(
            "SELECT count(UserID) FROM Answer WHERE QuestionID = ? and AnswerText = ? and SurveyID = ?",
            (q_id, answer, year))

    def get_answers_for_questions(self, q_ids):
        ''' Get all Answer rows that belong to any of the given questions.

            q_ids -- iterable of QuestionID values; an empty iterable yields
                     an empty list without touching the database. '''
        q_ids = tuple(q_ids)
        if not q_ids:
            # "IN ()" is not valid SQL, and no question means no rows anyway.
            return []
        placeholders = ', '.join('?' * len(q_ids))
        return self._fetch_all(
            f"SELECT * FROM Answer WHERE QuestionID IN ({placeholders})", q_ids)

    # TODO list:
    # [] data from particular year (SurveyID)
    # [] count particular answers in year
    # [] count all answers in year
    # [] table for two questions / or many questions
    # table for two questions and answers (one true, second false) <-- to get people who work in a different country than they live in (remote work counts or not? Not sure)
    # get all answers based on one particular answer (e.g. table only for women)
    #
    # some statistics
    # % of all, % distribution
    # mean, median for num answers
    # mean, median distribution (e.g. q_id 34 along the years)
    #

In [71]:
# Point the helper at the SQLite file DB/mental_health.sqlite (connections are opened per query).
test = DataProcessing('DB/', 'mental_health')

In [72]:
# Number of users who answered 'Male' to question 2.
test.get_users_no_for_q_and_answer(2, 'Male')

3044

In [73]:
# Frequency of each distinct answer to question 2.
test.get_users_no_per_answer(2)

[('Female', 1024),
 ('Male', 3044),
 ('Non-binary', 13),
 ('Other', 107),
 ('n/a', 24),
 ('non-binary', 6)]

In [74]:
# Number of all answers recorded for question 2.
test.get_users_no_per_q(2)

4218

In [75]:
# All rows of the Answer table whose AnswerText equals 'Idaho'.
test.get_table_based_value_from_column('Answer', 'AnswerText', 'Idaho')

[('Idaho', 2014, 420, 4),
 ('Idaho', 2016, 1549, 4),
 ('Idaho', 2016, 2364, 4),
 ('Idaho', 2016, 2663, 4),
 ('Idaho', 2016, 2364, 51),
 ('Idaho', 2016, 2663, 51),
 ('Idaho', 2017, 2838, 4),
 ('Idaho', 2017, 3419, 4),
 ('Idaho', 2017, 2838, 51),
 ('Idaho', 2017, 3419, 51),
 ('Idaho', 2018, 3612, 4),
 ('Idaho', 2018, 3612, 51),
 ('Idaho', 2019, 4197, 4),
 ('Idaho', 2019, 4197, 51)]

In [76]:
# Answers to question 2 that occur fewer than 10 times.
test.get_answers_for_q_less_occ(2, 10)

[('non-binary', 6)]

In [77]:
# Answers to question 2 that occur more than 10 times; the string '10' is accepted because qty is converted with int().
test.get_answers_for_q_greater_occ(2, '10')

[('Female', 1024),
 ('Male', 3044),
 ('Non-binary', 13),
 ('Other', 107),
 ('n/a', 24)]