In [1]:
# Picture

In [2]:
# TODO: describe why mental health needs to be monitored and taken care of

In [3]:
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [4]:
from src.sql.sql_all_data import sql_all_data

In [5]:
from src.func.eda_functions import question_df, age_bucket
from src.func.chart_functions import bar_chart

### **Connect to DB and get data**


In [6]:
# Open the SQLite survey database; the path is relative to the notebook's working directory.
conn = sqlite3.connect("src/db/mental_health.sqlite")
# Run the query imported from src.sql.sql_all_data and load the result set into a DataFrame.
df = pd.read_sql(sql=sql_all_data, con=conn)

### **Data Quality**


In [7]:
# Size of the loaded data as (rows, columns).
df.shape

(236898, 5)

In [8]:
# Column names of the loaded frame as a plain Python list.
list(df.columns)

['UserID', 'Year', 'QuestionID', 'questiontext', 'AnswerText']

In [9]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [10]:
# Missing values per column.
df.isna().sum()

UserID          0
Year            0
QuestionID      0
questiontext    0
AnswerText      0
dtype: int64

In [11]:
# Dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236898 entries, 0 to 236897
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   UserID        236898 non-null  int64 
 1   Year          236898 non-null  int64 
 2   QuestionID    236898 non-null  int64 
 3   questiontext  236898 non-null  object
 4   AnswerText    236898 non-null  object
dtypes: int64(3), object(2)
memory usage: 9.0+ MB


## **EDA**


### **Unique respondents**


In [12]:
# How many unique respondents are in the data.
# dropna=False keeps the count identical to len(unique()), which would also count a NaN id.
print("Unikaliu respondentu buvo", df["UserID"].nunique(dropna=False))

Unikaliu respondentu buvo 4218


In [13]:
# Distribution of respondents by year: one row per distinct (UserID, Year) pair,
# re-indexed from 0.
user_by_year_df = df[["UserID", "Year"]].drop_duplicates(ignore_index=True)

In [14]:
# Number of respondents per survey year, with Year kept as a regular column.
user_by_year_gr = user_by_year_df.groupby("Year", as_index=False)["UserID"].count()

In [15]:
# Chart the per-year respondent counts with the project helper from src.func.chart_functions.
bar_chart(
    df=user_by_year_gr,
    xaxis="Year",
    yaxis="UserID",
    title="Unique Users by Year",
)

We keep only the data from 2016 onwards: the survey may have changed between editions, and the gap in 2015 could influence the results. \
We create an additional DataFrame in which we clean the data and add the required columns.


### **Age**


In [16]:
# Answers to question 1 (age): rename the answer column to "Age" and cast it
# from text to integers.
age_df = (
    df[df["QuestionID"].eq(1)]
    .rename(columns={"AnswerText": "Age"})
    .astype({"Age": "int"})
)

In [17]:
# Distribution of the reported ages. The title lets the figure stand on its own,
# like the other charts in this notebook.
px.histogram(data_frame=age_df, x="Age", title="Age Distribution")

### **Gender**


In [None]:
# Pull the answers to question 2 into their own frame with the answer column named "Gender".
# NOTE(review): presumably question_df filters on QuestionID and renames AnswerText, like the
# manual Age extraction in this notebook; confirm against src/func/eda_functions.
gender_df = question_df(df=df, question_no=2, col_name="Gender")

In [None]:
# Normalise the free-text gender answers: trim surrounding whitespace and lower-case them
# so that variants such as "Male", "male " and "MALE" collapse into one category.
# assign() builds a new frame instead of writing into one that may be a slice.
gender_df = gender_df.assign(Gender=gender_df["Gender"].str.strip().str.lower())

# Respondents per gender. rename_axis()/reset_index(name=...) pin the resulting columns to
# ["Gender", "count"]; a bare reset_index() only produces a "count" column on pandas >= 2.0.
gender_gr = (
    gender_df["Gender"]
    .value_counts()
    .rename_axis("Gender")
    .reset_index(name="count")
)

In [None]:
# Chart the number of respondents per normalised gender value.
# Relies on gender_gr having the columns "Gender" and "count".
bar_chart(df=gender_gr, xaxis="Gender", yaxis="count", title="Genders")

We keep only the male and female answers and drop all other values.


In [None]:
# Keep only respondents whose normalised gender is "male" or "female".
gender_df = gender_df[gender_df["Gender"].isin(["male", "female"])]

### **Country**


In [None]:
# Answers to question 3 with the answer column named "Country".
country_df = question_df(df=df, question_no=3, col_name="Country")
# Merge the two spellings of the same country under a single label.
country_df = country_df.replace(
    to_replace="United States of America", value="United States"
)

In [None]:
# Respondents per country. rename_axis()/reset_index(name=...) pin the resulting columns to
# ["Country", "count"]; a bare reset_index() only produces a "count" column on pandas >= 2.0,
# and the chart below reads yaxis="count".
country_gr = (
    country_df["Country"]
    .value_counts()
    .rename_axis("Country")
    .reset_index(name="count")
)

In [None]:
# Chart the number of respondents per country (title spelling corrected from "Countrys").
bar_chart(df=country_gr, xaxis="Country", yaxis="count", title="Countries")

We keep only the United States, because most respondents come from there and this subset answers our question.


In [None]:
# Keep only respondents from the United States.
country_df = country_df[country_df["Country"].eq("United States")]

### **Are you self-employed?**


## **Main DF**


In [22]:
# Work on an independent copy so the raw frame `df` is left untouched by the filtering below.
main_df = df.copy()

In [23]:
# Keep only the data from 2016 and later.
main_df = main_df[main_df["Year"].ge(2016)]

In [24]:
# Reduce the frame to the age answers (question 1), cast them to integers and
# keep respondents aged 18 to 67, both bounds included.
main_df = (
    main_df[main_df["QuestionID"].eq(1)]
    .rename(columns={"AnswerText": "Age"})
    .astype({"Age": "int"})
)
main_df = main_df[main_df["Age"].between(18, 67)]

In [None]:
# Derive an age bucket for every respondent row with the project helper age_bucket.
# main_df is the result of boolean filtering, so writing a column into it in place raises
# SettingWithCopyWarning on pandas versions without copy-on-write; assign() returns a new
# frame and avoids that.
main_df = main_df.assign(Age_group=main_df.apply(age_bucket, axis=1))

In [None]:
# Attach each respondent's gender. The inner join drops respondents that were
# removed from gender_df by the gender filter.
main_df = main_df.merge(gender_df[["UserID", "Gender"]], on="UserID", how="inner")

In [None]:
# Attach each respondent's country. The inner join keeps only respondents
# still present in country_df.
main_df = main_df.merge(country_df[["UserID", "Country"]], on="UserID", how="inner")

In [45]:
# Show the assembled respondent-level frame (pandas truncates the rendering to the first and last rows).
main_df

Unnamed: 0,UserID,Year,QuestionID,questiontext,Age,Age_group,Gender,Country
0,1262,2016,1,What is your age?,29,25-34,male,United States
1,1265,2016,1,What is your age?,43,35-44,female,United States
2,1267,2016,1,What is your age?,30,25-34,male,United States
3,1268,2016,1,What is your age?,37,35-44,female,United States
4,1269,2016,1,What is your age?,44,35-44,female,United States
...,...,...,...,...,...,...,...,...
1770,4210,2019,1,What is your age?,27,25-34,male,United States
1771,4211,2019,1,What is your age?,42,35-44,female,United States
1772,4213,2019,1,What is your age?,31,25-34,female,United States
1773,4215,2019,1,What is your age?,48,45-54,male,United States
