In [None]:
import pandas as pd

# PRE-PROCESSING
# Columns read from the raw file: 14 questionnaire items followed by
# gender, country, education, age and marital status.
selected_columns = [
    'Q3A', 'Q5A', 'Q10A', 'Q13A', 'Q16A', 'Q17A', 'Q21A',
    'Q24A', 'Q26A', 'Q31A', 'Q34A', 'Q37A', 'Q38A', 'Q42A',
    'gender', 'country', 'education', 'age', 'married',
]

file_path = 'originalDataset.csv'

# Read only the selected columns from the tab-separated file.
df = pd.read_csv(file_path, delimiter='\t', usecols=selected_columns)

# Row count of the raw data, before any cleaning.
initial_row_count = len(df)
print(f"Número inicial de linhas no originalDataset: {initial_row_count}")

# Treat the literal string 'NONE' as missing, then discard every row that has
# a missing value in any column.
df = df.replace('NONE', pd.NA).dropna()

# Inclusive (low, high) bounds a row must satisfy on each column to be kept.
criteria = {
    'education': (1, 4),
    'gender': (1, 3),
    'married': (1, 3),
    'age': (18, 100),
}

# Combine all range checks into one boolean mask and filter once.
rows_within_bounds = pd.Series(True, index=df.index)
for column_name, (lower, upper) in criteria.items():
    rows_within_bounds &= df[column_name].between(lower, upper)
df = df[rows_within_bounds]

# Number of remaining participants per country.
country_counts = df['country'].value_counts()

# Countries that have 30 or more participants.
countries_with_min_participants = country_counts.loc[country_counts.ge(30)].index

# Keep only the rows belonging to those countries.
df_filtered = df.loc[df['country'].isin(countries_with_min_participants)]

# Row count after all filters (before the selection of the 24,655 rows).
filtered_row_count = len(df_filtered)
print(f"Número de linhas após todos os filtros: {filtered_row_count}")


# Draw 30 participants from every country (each country in df_filtered has at
# least 30 rows because of the earlier participant-count filter).
# Sampling each group directly, instead of going through groupby.apply, avoids
# the warning pandas emitted for that call and keeps each sampled row's
# original index label, which is needed below to exclude it from the top-up.
country_samples = [
    country_rows.sample(n=30, random_state=1)
    for _, country_rows in df_filtered.groupby('country')
]
# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when df_filtered has no rows.
per_country_sample = pd.concat(country_samples) if country_samples else df_filtered.iloc[0:0]

df_balanced = per_country_sample.reset_index(drop=True)

# If fewer than 24,655 rows were selected, top up with random extra rows.
desired_row_count = 24655
remaining_rows_balanced = len(df_balanced)

if remaining_rows_balanced < desired_row_count:
    remaining_needed = desired_row_count - remaining_rows_balanced

    # Top up only from rows that were NOT already drawn above. Sampling from
    # the whole of df_filtered could pick the same participant twice, leaving
    # duplicate rows in the final dataset.
    # Assumes df_filtered's index labels are unique (true for the default
    # index produced by read_csv).
    unused_rows = df_filtered.drop(index=per_country_sample.index)

    # Never request more rows than are available; the final row count is
    # printed later, so a shortfall stays visible.
    additional_samples = unused_rows.sample(
        n=min(remaining_needed, len(unused_rows)),
        random_state=1,
    )

    df_balanced = pd.concat([df_balanced, additional_samples], ignore_index=True)

# Shift every questionnaire item down by one.
item_columns = [
    'Q3A', 'Q5A', 'Q10A', 'Q13A', 'Q16A', 'Q17A', 'Q21A',
    'Q24A', 'Q26A', 'Q31A', 'Q34A', 'Q37A', 'Q38A', 'Q42A',
]
df_balanced[item_columns] = df_balanced[item_columns].sub(1)

# Total score: row-wise sum of the shifted items, stored as soma_depressao.
df_balanced['soma_depressao'] = df_balanced.loc[:, item_columns].sum(axis=1)

# índice de depressão
def mapear_indice_depressao(pontuacao):
    """Map a total score to an index between 0 and 4.

    The inclusive upper bounds for indices 0, 1, 2 and 3 are 9, 13, 20 and 27.
    Any score that is not less than or equal to 27 maps to 4.

    Parameters
    ----------
    pontuacao : number
        Total score to classify.

    Returns
    -------
    int
        Index in the range 0..4.
    """
    limites_superiores = (9, 13, 20, 27)
    # The first bound the score fits under decides the index.
    for indice, limite in enumerate(limites_superiores):
        if pontuacao <= limite:
            return indice
    return len(limites_superiores)

# Index derived from the total score, one value per row.
df_balanced['indice_depressao'] = df_balanced['soma_depressao'].map(mapear_indice_depressao)

# Convert country to a categorical so every distinct value gets an integer code.
df_balanced['country'] = df_balanced['country'].astype('category')

# Country legend: 1-based number -> country value, in category order.
country_legend = dict(enumerate(df_balanced['country'].cat.categories, start=1))
print("Legenda para a coluna 'country':")
for legend_number, country_value in country_legend.items():
    print(f"{legend_number}: {country_value}")

# Replace the country values by their 1-based category number (matches the legend).
df_balanced['country'] = df_balanced['country'].cat.codes + 1

# Write the final dataset without the index column.
df_balanced.to_csv('dataset.csv', index=False)

# Report the final size.
final_row_count = len(df_balanced)
print(f"Número de linhas finais no dataset balanceado: {final_row_count}")

# Report how many participants each (encoded) country ended up with.
country_counts_balanced = df_balanced['country'].value_counts()
print("Quantidade de participantes por país no balancedDataset final:")
print(country_counts_balanced)


Número inicial de linhas no originalDataset: 39775
Número de linhas após todos os filtros: 30893
Legenda para a coluna 'country':
1: AE
2: AR
3: AT
4: AU
5: BE
6: BN
7: BR
8: CA
9: CH
10: CZ
11: DE
12: DK
13: EG
14: ES
15: FI
16: FR
17: GB
18: GR
19: HK
20: HR
21: HU
22: ID
23: IE
24: IN
25: IT
26: JM
27: JP
28: MX
29: MY
30: NL
31: NO
32: NZ
33: PH
34: PK
35: PL
36: PT
37: RO
38: RS
39: RU
40: SA
41: SE
42: SG
43: TR
44: US
45: VN
46: ZA
Número de linhas finais no dataset balanceado: 24655
Quantidade de participantes por país no balancedDataset final:
country
29    14538
44     3974
17      664
22      559
8       545
33      456
4       372
24      307
11      205
42      179
16      153
32      148
7       132
6       126
35      123
25      117
37      104
28      102
30       94
27       90
46       88
43       83
34       81
15       77
13       77
23       75
14       72
18       71
41       68
38       68
19       65
36       64
39       62
40       60
9        59
1        59
2

  df_balanced = df_filtered.groupby('country').apply(lambda x: x.sample(n=30, random_state=1))


'Número inicial de linhas no originalDataset: 39775\nNúmero de linhas após todos os filtros: 30893\n\n1: AE\n2: AR\n3: AT\n4: AU\n5: BE\n6: BN\n7: BR\n8: CA\n9: CH\n10: CZ\n11: DE\n12: DK\n13: EG\n14: ES\n15: FI\n16: FR\n17: GB\n18: GR\n19: HK\n20: HR\n21: HU\n22: ID\n23: IE\n24: IN\n25: IT\n26: JM\n27: JP\n28: MX\n29: MY\n30: NL\n31: NO\n32: NZ\n33: PH\n34: PK\n35: PL\n36: PT\n37: RO\n38: RS\n39: RU\n40: SA\n41: SE\n42: SG\n43: TR\n44: US\n45: VN\n46: ZA'

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import json

data = pd.read_csv('dataset.csv')

# Independent variables: every column except the total score and the index
# derived from it.
target_related_columns = ['soma_depressao', 'indice_depressao']
X = data.drop(columns=target_related_columns)
# Dependent variable.
y = data['indice_depressao']

# 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Min-Max normalisation: fit on the training portion only, then apply the
# same transform to both portions. The scaled frames get a fresh positional
# index, so downstream code must pair them with y_train / y_test by position.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# Candidate classifiers, keyed by the display name used when reporting results.
# - random_state is fixed on the randomised scikit-learn estimators so that
#   repeated runs of the notebook give the same scores.
# - LogisticRegression gets a larger max_iter: with the default limit the
#   solver stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings.
# - CatBoost is silenced: its per-iteration training log overflowed the cell
#   output and buried the accuracy lines.
modelos = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0),
}

def _acuracias_por_fold(modelo, X, y, divisor):
    """Fit ``modelo`` on every training split and return the per-fold accuracies.

    Parameters
    ----------
    modelo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted on
        every fold.
    X : pandas.DataFrame
        Feature rows; selected positionally with ``.iloc``.
    y : pandas.Series
        Labels; selected positionally with ``.iloc``, so it must be in the
        same row order as ``X``.
    divisor : splitter
        Object exposing ``split(X, y)`` that yields ``(train_idx, test_idx)``
        pairs of positional indices.

    Returns
    -------
    list
        One accuracy value per fold, in the order the folds were produced.
    """
    acuracias = []
    for train_idx, test_idx in divisor.split(X, y):
        # Train on the fold's training rows, score on its held-out rows.
        modelo.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_pred = modelo.predict(X.iloc[test_idx])
        acuracias.append(accuracy_score(y.iloc[test_idx], y_pred))
    return acuracias


# Cross-validation: the same 10 stratified, shuffled folds are used for every model.
resultados = {}
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for nome, modelo in modelos.items():
    scores = _acuracias_por_fold(modelo, X_train_scaled, y_train, skf)

    # Store the mean accuracy rounded to 4 decimal places (as a plain float).
    resultados[nome] = round(float(np.mean(scores)), 4)
    print(f"{nome} - Acurácia média (Validação Cruzada 10 Folds): {resultados[nome]:.4f}")

# Cross-validation for XGBoost, through the same helper as the other models.
xgb_model = XGBClassifier()
scores_xgb = _acuracias_por_fold(xgb_model, X_train_scaled, y_train, skf)

# Store the XGBoost result rounded to 4 decimal places.
resultados['XGBoost'] = round(float(np.mean(scores_xgb)), 4)
print(f"XGBoost - Acurácia média (Validação Cruzada de 10 Folds): {resultados['XGBoost']:.4f}")

# Serialise the results dict and write it to the JSON file.
with open('metricas_modelos.json', 'w') as arquivo_metricas:
    arquivo_metricas.write(json.dumps(resultados))


Random Forest - Acurácia média (Validação Cruzada 10 Folds): 0.9267
SVM - Acurácia média (Validação Cruzada 10 Folds): 0.9944


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
6:	learn: 0.9316348	total: 256ms	remaining: 36.3s
7:	learn: 0.8826112	total: 298ms	remaining: 37s
8:	learn: 0.8394890	total: 341ms	remaining: 37.6s
9:	learn: 0.8006895	total: 387ms	remaining: 38.3s
10:	learn: 0.7660009	total: 425ms	remaining: 38.2s
11:	learn: 0.7326435	total: 468ms	remaining: 38.5s
12:	learn: 0.7025882	total: 514ms	remaining: 39s
13:	learn: 0.6749002	total: 554ms	remaining: 39s
14:	learn: 0.6503027	total: 586ms	remaining: 38.5s
15:	learn: 0.6267415	total: 628ms	remaining: 38.6s
16:	learn: 0.6066192	total: 664ms	remaining: 38.4s
17:	learn: 0.5877262	total: 707ms	remaining: 38.5s
18:	learn: 0.5680704	total: 744ms	remaining: 38.4s
19:	learn: 0.5507251	total: 779ms	remaining: 38.2s
20:	learn: 0.5338341	total: 809ms	remaining: 37.7s
21:	learn: 0.5192890	total: 826ms	remaining: 36.7s
22:	learn: 0.5048442	total: 843ms	remaining: 35.8s
23:	learn: 0.4919867	total: 860ms	remaining: 35s
24:	learn: 0.4796662	

In [None]:
# Collect the 'name' property of every feature in the GeoJSON and print the list.
# NOTE(review): geojson_data is not defined in this cell; it must have been
# loaded by a cell that ran earlier, otherwise this raises NameError on a fresh kernel.
geojson_features = geojson_data['features']
country_names_from_geojson = [item['properties']['name'] for item in geojson_features]
print(country_names_from_geojson)



['Afghanistan', 'Angola', 'Albania', 'United Arab Emirates', 'Argentina', 'Armenia', 'Antarctica', 'French Southern and Antarctic Lands', 'Australia', 'Austria', 'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh', 'Bulgaria', 'The Bahamas', 'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda', 'Bolivia', 'Brazil', 'Brunei', 'Bhutan', 'Botswana', 'Central African Republic', 'Canada', 'Switzerland', 'Chile', 'China', 'Ivory Coast', 'Cameroon', 'Democratic Republic of the Congo', 'Republic of the Congo', 'Colombia', 'Costa Rica', 'Cuba', 'Northern Cyprus', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti', 'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador', 'Egypt', 'Eritrea', 'Spain', 'Estonia', 'Ethiopia', 'Finland', 'Fiji', 'Falkland Islands', 'France', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana', 'Guinea', 'Gambia', 'Guinea Bissau', 'Equatorial Guinea', 'Greece', 'Greenland', 'Guatemala', 'French Guiana', 'Guyana', 'Honduras', 'Croatia', 'Haiti', 'Hunga

In [None]:
# Install Streamlit into the kernel's own environment (%pip rather than !pip),
# pinned to the version this notebook was last run with so re-runs are reproducible.
%pip install -q streamlit==1.41.1

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
# Print this runtime's public IPv4 address by fetching ipv4.icanhazip.com
# quietly and writing the response to stdout.
# NOTE(review): presumably needed as the access password that the localtunnel
# landing page asks for - confirm.
! wget -q -O - ipv4.icanhazip.com

34.23.200.150


In [None]:
# Install localtunnel globally through npm, pinned to version 2.0.2.
# It is invoked with npx in the cell that launches the Streamlit app.
!npm install -g localtunnel@2.0.2

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K
changed 22 packages in 2s
[1G[0K⠸[1G[0K
[1G[0K⠸[1G[0K3 packages are looking for funding
[1G[0K⠸[1G[0K  run `npm fund` for details
[1G[0K⠸[1G[0K

In [None]:
# Start the Streamlit app (app.py) as a background job, then open a localtunnel
# on port 8501 - the port Streamlit reports in its startup output - so the app
# is reachable through the printed "your url is" address.
# This cell keeps running for as long as the tunnel is open.
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.23.200.150:8501[0m
[0m
[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://shiny-otters-bake.loca.lt
2024-12-31 23:43:26.165 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2024-12-31 23:43:35.112 `label` got an empty value. This is discouraged for accessibility reasons and may be disallowed in the future by raising an exception. Please provide a non-empty label and hide it with label_visibility if needed.
2024-12-31 23:43:59.101 `label` got an empty value. This is discouraged for ac