#### Instalando as bibliotecas necessárias

In [93]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


#### Importando as bibliotecas

In [94]:
# Importando tudo que a análise econometrica precisa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pyreadstat
import seaborn as sns
import pandasql as ps

#### Importando os dados em formato sav

In [95]:
# Lê os arquivos .sav
df_general, meta1 = pyreadstat.read_sav('data/Deidentified_WPH003a Economy (General Population) (SPSS Version).sav')
df_economists, meta2 = pyreadstat.read_sav('data/Deidentified_WPH003b Economy (Economists) (SPSS Version).sav')

# Função para criar um DataFrame com labels
def create_label_df(df, meta):
    df_labels = df.copy()  # Copia o DataFrame original
    for column in df_labels.columns:
        if column in meta.variable_value_labels:  # Verifica se há rótulos para a coluna
            labels_dict = meta.variable_value_labels[column]  # Dicionário de rótulos {valor_numérico: label}
            df_labels[column] = df_labels[column].map(labels_dict)  # Mapeia os códigos para rótulos
    return df_labels

# Cria DataFrames com labels para ambos os arquivos
df_general_labels = create_label_df(df_general, meta1)
df_economists_labels = create_label_df(df_economists, meta2)

In [96]:
# criando um df de dicionário de labels

dicionario_general = pd.DataFrame(meta1.column_names_to_labels.items(), columns=['coluna', 'label'])
dicionario_economists = pd.DataFrame(meta2.column_names_to_labels.items(), columns=['coluna', 'label'])

In [97]:
# juntando os dois dicionários onde as labels são iguais, indentificando onde vem de general e onde vem de economists

dicionario = pd.merge(dicionario_general, dicionario_economists, on='label', how='outer', suffixes=('_general', '_economists'))

dicionario.to_clipboard(excel=True)

dicionario

Unnamed: 0,coluna_general,label,coluna_economists
0,q4,"% OF AMERICANS UNEMPLOYED, LOOKING FOR WORK?",
1,q6,% OF PROFIT AMERICAN CORPORATIONS?,
2,,2ND MOST IMPORTANT PROBLEM FACING COUNTRY?,q8
3,q7,5 YEARS AGO: FEDERAL BUDGET DEFICIT?,
4,q3b1,5 YEARS AGO: INFLATION RATE?,
...,...,...,...
262,wt1,,pri
263,wt1,,sic
264,wt1,,title
265,wt1,,effort


In [98]:
# Criando uma cópia de df_general com o prefixo "cod_"
df_general_renamed = df_general.copy()
df_general_renamed.columns = ['cod_' + col for col in df_general_renamed.columns]

# Unindo a descrição de resposta (df_general_labels) com a resposta codificada (df_general_renamed) usando pandasql
query = '''
SELECT 
    df_general_labels.*, df_general_renamed.*
FROM
    df_general_labels
JOIN
    df_general_renamed
ON
    df_general_labels.intv = df_general_renamed.cod_intv
'''

df_general_union = ps.sqldf(query, locals())

df_general_union.head()


Unnamed: 0,intv,q901,q905,q908,q910,q918,race,sex,state,region,...,cod_q46,cod_q47,cod_q48,cod_q49,cod_q50,cod_q51,cod_q55,cod_q56,cod_q919,cod_wt1
0,1.0,DEMOCRAT,YES,LIBERAL,,$25000-29999,WHITE,FEMALE,37.0,SOUTH,...,6.0,,2.0,,,2.0,5.0,6.0,2.0,0.89
1,2.0,REPUBLICAN,NO,VERY CONSERVATIVE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,1.0,1.0,1.0,1.0,1.0,1.0,6.0,7.0,2.0,1.0
2,3.0,DEMOCRAT,NO,MODERATE,,$10000-19999,WHITE,FEMALE,27.0,SOUTH,...,,,2.0,,,2.0,3.0,,2.0,1.23
3,4.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,1.0,1.0,2.0,,,2.0,5.0,5.0,2.0,0.88
4,5.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,MALE,35.0,SOUTH,...,1.0,1.0,2.0,,,1.0,5.0,5.0,2.0,0.94


In [99]:
# Criando uma cópia de df_economists com o prefixo "cod_"
df_economists_renamed = df_economists.copy()
df_economists_renamed.columns = ['cod_' + col for col in df_economists_renamed.columns]

# Unindo a descrição de resposta (df_economists_labels) com a resposta codificada (df_economists_renamed) usando pandasql
query = '''
SELECT 
    df_economists_labels.*, df_economists_renamed.*
FROM
    df_economists_labels
JOIN
    df_economists_renamed
ON
    df_economists_labels.intv = df_economists_renamed.cod_intv
'''

df_economists_union = ps.sqldf(query, locals())

df_economists_union


Unnamed: 0,intv,q901,q905,q908,q910,q918,race,sex,state,region,...,cod_q44,cod_q45,cod_q919,cod_q115,cod_seq,cod_pri,cod_sic,cod_title,cod_effort,cod_expdate
0,1.0,,,,,,,,24.0,SOUTH,...,,,,,1197.0,3.0,0.0,99.0,,9706.0
1,2.0,INDEPENDENT,YES,MODERATE,,$100000 OR MORE,WHITE,MALE,24.0,SOUTH,...,1.0,3.0,2.0,2.0,1028.0,0.0,0.0,0.0,N50301,9704.0
2,3.0,INDEPENDENT,YES,MODERATE,,$75000-99999,WHITE,MALE,24.0,SOUTH,...,1.0,3.0,2.0,2.0,845.0,0.0,0.0,99.0,,9706.0
3,4.0,DEMOCRAT,YES,MODERATE,,$50000-74999,WHITE,MALE,24.0,SOUTH,...,2.0,,2.0,2.0,836.0,0.0,0.0,99.0,,9612.0
4,5.0,,,,,,,,24.0,SOUTH,...,,,,,933.0,0.0,0.0,99.0,,9703.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,595.0,REPUBLICAN,YES,MODERATE,,$50000-74999,WHITE,MALE,10.0,MIDWEST,...,1.0,1.0,2.0,2.0,1966.0,0.0,5.0,99.0,,9709.0
595,596.0,,,,,,,,8.0,EAST,...,,,,,332.0,0.0,0.0,99.0,,9612.0
596,597.0,,,,,,,,23.0,SOUTH,...,,,,,1367.0,0.0,0.0,99.0,,9706.0
597,598.0,REPUBLICAN,YES,CONSERVATIVE,,$100000 OR MORE,WHITE,MALE,49.0,WEST,...,2.0,,2.0,2.0,2830.0,0.0,0.0,99.0,,9706.0


In [100]:

# criando as variáveis de controle

query = '''
SELECT 
    df_general_union.*,
    0 AS econ,
    
    CASE 
        WHEN cod_sex = 1 THEN 1 
        ELSE 0 
    END AS male,
    
    1996 - q910 AS age_in_1996,
    
    CASE 
        WHEN cod_race NOT IN (1, 2, 3) THEN 1 
        ELSE 0 
    END AS othrace,
    
    CASE 
        WHEN cod_race = 1 THEN 1 
        ELSE 0 
    END AS white,
    
    CASE 
        WHEN cod_race = 3 THEN 1 
        ELSE 0 
    END AS asian,
    
    CASE 
        WHEN cod_race = 2 THEN 1 
        ELSE 0 
    END AS black,
    
    CASE 
        WHEN cod_q36 = 4 THEN 3
        WHEN cod_q36 = 3 THEN 2
        WHEN cod_q36 = 2 THEN 1
        WHEN cod_q36 = 1 THEN 0
        ELSE NULL 
    END AS jobsecurity,
    
    CASE 
        WHEN cod_q15 = 3 THEN 0
        WHEN cod_q15 = 2 THEN 1
        WHEN cod_q15 = 1 THEN 2
        ELSE NULL 
    END AS yourlast5,
    
    CASE 
        WHEN cod_q17 = 2 THEN 0
        WHEN cod_q17 = 3 THEN 1
        WHEN cod_q17 = 1 THEN 2
        ELSE NULL 
    END AS yournext5,
    
    CASE 
        WHEN cod_q918 BETWEEN 1 AND 9 THEN cod_q918 
        ELSE NULL 
    END AS income,
    
    CASE 
        WHEN cod_q901 = 1 THEN 1 
        ELSE 0 
    END AS dem,
    
    CASE 
        WHEN cod_q901 = 2 THEN 1 
        ELSE 0 
    END AS rep,
    
    CASE 
        WHEN cod_q901 = 3 THEN 1 
        ELSE 0 
    END AS indep,
    
    CASE 
        WHEN cod_q901 = 4 THEN 1 
        ELSE 0 
    END AS othparty,
    
    CASE 
        WHEN cod_q908 = 1 THEN -2
        WHEN cod_q908 = 2 THEN -1
        WHEN cod_q908 = 3 THEN 0
        WHEN cod_q908 = 4 THEN 1
        WHEN cod_q908 = 5 THEN 2
        ELSE NULL 
    END AS ideology,
    
    CASE 
        WHEN cod_q908 = 6 THEN 1 
        ELSE 0 
    END AS othideol,
    
    CASE 
        WHEN q55 = 'None' THEN 1
        WHEN cod_q55 BETWEEN 1 AND 7 THEN cod_q55
        ELSE NULL 
    END AS education,
    
    df_general_union.q27a1 AS TAXESHIGH,
    df_general_union.q27b1 AS DEFICIT,
    df_general_union.q27c1 AS FORAID,
    df_general_union.q27d1 AS IMMIG,
    df_general_union.q27e1 AS TAXBREAK,
    df_general_union.q27f1 AS INADEDUC,
    df_general_union.q27g1 AS WELFARE,
    df_general_union.q27h1 AS AA,
    df_general_union.q27i1 AS HARDWORK,
    df_general_union.q27j1 AS REG,
    df_general_union.q27k1 AS SAVINGS,
    df_general_union.q29l1 AS PROFHIGH,
    df_general_union.q29m1 AS EXECPAY,
    df_general_union.q29n1 AS BUSPROD,
    df_general_union.q29o1 AS TECH,
    df_general_union.q29p1 AS OVERSEAS,
    df_general_union.q29q1 AS DOWNSIZE,
    df_general_union.q29r1 AS COMPEDUC,
    df_general_union.q22a1 AS TAXCUT,
    df_general_union.q22b1 AS WOMENWORK,
    df_general_union.q22c1 AS TECHGOOD,
    df_general_union.q22d1 AS TRADEAG,
    df_general_union.q22e1 AS DOWNGOOD,
    df_general_union.q13 AS CHANGE20,
    df_general_union.q24 AS TRADEJOB,
    df_general_union.q26 AS WHYGASSD,
    df_general_union.q25 AS GASPRICE,
    df_general_union.q21 AS PRES,
    df_general_union.q9 AS NEWJOB,
    df_general_union.q10 AS GAP20,
    df_general_union.q11 AS INCOME20,
    df_general_union.q12 AS WAGE20,
    df_general_union.q13 AS NEED2EARN,
    df_general_union.q14 AS STAN5,
    df_general_union.q18 AS CHILDGEN,
    df_general_union.q49 AS CHILDSTAN,
    df_general_union.q1 AS CURECON

FROM 
    df_general_union
'''

df_general_union = ps.sqldf(query, locals())

df_general_union

Unnamed: 0,intv,q901,q905,q908,q910,q918,race,sex,state,region,...,PRES,NEWJOB,GAP20,INCOME20,WAGE20,NEED2EARN,STAN5,CHILDGEN,CHILDSTAN,CURECON
0,1.0,DEMOCRAT,YES,LIBERAL,,$25000-29999,WHITE,FEMALE,37.0,SOUTH,...,DELINE IN MORAL VALUES,LOW-PAYING,LARGER,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,FALL,LOWER,,GROWING SLOWLY
1,2.0,REPUBLICAN,NO,VERY CONSERVATIVE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,NOT ENOUGH MONEY,PAY WELL,ABOUT THE SAME,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,HIGHER,GROWING SLOWLY
2,3.0,DEMOCRAT,NO,MODERATE,,$10000-19999,WHITE,FEMALE,27.0,SOUTH,...,NOT ENOUGH MONEY,LOW-PAYING,ABOUT THE SAME,STAYING EVEN,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,,GROWING SLOWLY
3,4.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,HEALTH,LOW-PAYING,LARGER,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,,IN DEPRESSION
4,5.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,MALE,35.0,SOUTH,...,HEALTH,LOW-PAYING,LARGER,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,STAYED SAME,HIGHER,,STAGNATING
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,1506.0,INDEPENDENT,NO,LIBERAL,,$30000-39999,WHITE,MALE,30.0,SOUTH,...,HEALTH,LOW-PAYING,LARGER,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,RISE,LOWER,,IN RECESSION
1506,1507.0,DEMOCRAT,YES,MODERATE,,$50000-74999,WHITE,FEMALE,25.0,SOUTH,...,HEALTH,LOW-PAYING,LARGER,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,FALL,LOWER,STAY SAME,IN RECESSION
1507,1508.0,INDEPENDENT,NO,MODERATE,,$10000-19999,,MALE,33.0,SOUTH,...,HEALTH,LOW-PAYING,LARGER,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,FALL,DK/NO OPINION,,STAGNATING
1508,1509.0,DEMOCRAT,YES,LIBERAL,,$40000-49999,BLACK,FEMALE,26.0,SOUTH,...,HEALTH,LOW-PAYING,ABOUT THE SAME,GOING UP,FALLING BEHIND,TWO WAGE EARNERS,STAYED SAME,HIGHER,HIGHER,GROWING RAPIDLY


In [101]:
# criando as variáveis de controle

query = '''
SELECT
      df_economists_union.*
    , 1 as econ
    , CASE
          WHEN cod_sex = 1
          THEN 1
          ELSE 0
      END AS male
    , 1996 - q910 AS age_in_1996
    , CASE
          WHEN cod_race NOT IN (1, 2, 3)
          THEN 1
          ELSE 0
      END AS othrace
    , CASE
          WHEN cod_race = 1
          THEN 1
          ELSE 0
      END AS white
    , CASE
          WHEN cod_race = 3
          THEN 1
          ELSE 0
      END AS asian
    , CASE
          WHEN cod_race = 2
          THEN 1
          ELSE 0
      END AS black
    , CASE
          WHEN cod_q39 = 4 THEN 3
          WHEN cod_q39 = 3 THEN 2
          WHEN cod_q39 = 2 THEN 1
          WHEN cod_q39 = 1 THEN 0
          ELSE NULL
      END AS jobsecurity
    , CASE
          WHEN cod_q17 = 3 THEN 0
          WHEN cod_q17 = 2 THEN 1
          WHEN cod_q17 = 1 THEN 2
          ELSE NULL
      END AS yourlast5
    , CASE
          WHEN cod_q19 = 2 THEN 0
          WHEN cod_q19 = 3 THEN 1
          WHEN cod_q19 = 1 THEN 2
          ELSE NULL
      END AS yournext5
    , CASE
          WHEN cod_q918 = 1 THEN 1
          WHEN cod_q918 = 2 THEN 2
          WHEN cod_q918 = 3 THEN 3
          WHEN cod_q918 = 4 THEN 4
          WHEN cod_q918 = 5 THEN 5
          WHEN cod_q918 = 6 THEN 6
          WHEN cod_q918 = 7 THEN 7
          WHEN cod_q918 = 8 THEN 8
          WHEN cod_q918 = 9 THEN 9
          ELSE NULL
      END AS income
    , CASE
          WHEN cod_q901 = 1 THEN 1
          ELSE 0
      END AS dem
    , CASE
          WHEN cod_q901 = 2 THEN 1
          ELSE 0
      END AS rep
    , CASE
          WHEN cod_q901 = 3 THEN 1
          ELSE 0
      END AS indep
    , CASE
          WHEN cod_q901 = 4 THEN 1
          ELSE 0
      END AS othparty
    , CASE
          WHEN cod_q908 = 1 THEN -2
          WHEN cod_q908 = 2 THEN -1
          WHEN cod_q908 = 3 THEN 0
          WHEN cod_q908 = 4 THEN 1
          WHEN cod_q908 = 5 THEN 2
          ELSE NULL
      END AS ideology
    , CASE
          WHEN cod_q908 = 6 THEN 1
          ELSE 0
      END AS othideol
    , 7 as education
    , df_economists_union.q34a1  AS TAXESHIGH
    , df_economists_union.q34b1  AS DEFICIT
    , df_economists_union.q34c1  AS FORAID
    , df_economists_union.q34d1  AS IMMIG
    , df_economists_union.q34e1  AS TAXBREAK
    , df_economists_union.q34f1  AS INADEDUC
    , df_economists_union.q34g1  AS WELFARE
    , df_economists_union.q34h1  AS AA
    , df_economists_union.q34i1  AS HARDWORK
    , df_economists_union.q34j1  AS REG
    , df_economists_union.q34k1  AS SAVINGS
    , df_economists_union.q36a1  AS PROFHIGH
    , df_economists_union.q36b1  AS EXECPAY
    , df_economists_union.q36c1  AS BUSPROD
    , df_economists_union.q36d1  AS TECH
    , df_economists_union.q36e1  AS OVERSEAS
    , df_economists_union.q36f1  AS DOWNSIZE
    , df_economists_union.q36g1  AS COMPEDUC
    , df_economists_union.q26a1  AS TAXCUT
    , df_economists_union.q26b1  AS WOMENWORK
    , df_economists_union.q26c1  AS TECHGOOD
    , df_economists_union.q26d1  AS TRADEAG
    , df_economists_union.q26e1  AS DOWNGOOD
    , df_economists_union.q15    AS CHANGE20
    , df_economists_union.q28    AS TRADEJOB
    , df_economists_union.q33    AS WHYGASSD
    , df_economists_union.q32    AS GASPRICE
    , df_economists_union.q24    AS PRES
    , df_economists_union.q13    AS INCOME20
    , df_economists_union.q14    AS WAGE20
    , df_economists_union.q15    AS NEED2EARN
    , df_economists_union.q16    AS STAN5
    , df_economists_union.q20    AS CHILDGEN
    , df_economists_union.q45    AS CHILDSTAN
FROM
    df_economists_union
'''

df_economists_union = ps.sqldf(query, locals())

df_economists_union

Unnamed: 0,intv,q901,q905,q908,q910,q918,race,sex,state,region,...,TRADEJOB,WHYGASSD,GASPRICE,PRES,INCOME20,WAGE20,NEED2EARN,STAN5,CHILDGEN,CHILDSTAN
0,1.0,,,,,,,,24.0,SOUTH,...,,,,,,,,,,
1,2.0,INDEPENDENT,YES,MODERATE,,$100000 OR MORE,WHITE,MALE,24.0,SOUTH,...,CREATE JOBS,SUPPLY AND DEMAND,TOO LOW,DO LITTLE ABOUT,STAYING EVEN,STAYING EVEN,,STAYED SAME,LOWER,STAY SAME
2,3.0,INDEPENDENT,YES,MODERATE,,$75000-99999,WHITE,MALE,24.0,SOUTH,...,NO DIFFERENCE,SUPPLY AND DEMAND,ABOUT RIGHT,DO LITTLE ABOUT,GOING UP,STAYING EVEN,ONE WAGE EARNER,RISE,STAY SAME,STAY SAME
3,4.0,DEMOCRAT,YES,MODERATE,,$50000-74999,WHITE,MALE,24.0,SOUTH,...,DK/NO OPINION,SUPPLY AND DEMAND,DK/NO OPINION,DO LITTLE ABOUT,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,STAYED SAME,LOWER,
4,5.0,,,,,,,,24.0,SOUTH,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,595.0,REPUBLICAN,YES,MODERATE,,$50000-74999,WHITE,MALE,10.0,MIDWEST,...,CREATE JOBS,SUPPLY AND DEMAND,ABOUT RIGHT,BEYOND PRESIDENT CONTROL,GOING UP,GOING UP,ONE WAGE EARNER,RISE,HIGHER,HIGHER
595,596.0,,,,,,,,8.0,EAST,...,,,,,,,,,,
596,597.0,,,,,,,,23.0,SOUTH,...,,,,,,,,,,
597,598.0,REPUBLICAN,YES,CONSERVATIVE,,$100000 OR MORE,WHITE,MALE,49.0,WEST,...,NO DIFFERENCE,SUPPLY AND DEMAND,,BEYOND PRESIDENT CONTROL,GOING UP,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,


In [102]:
common_columns = df_general_union.columns.intersection(df_economists_union.columns)

common_columns

Index(['intv', 'q901', 'q905', 'q908', 'q910', 'q918', 'race', 'sex', 'state',
       'region',
       ...
       'TRADEJOB', 'WHYGASSD', 'GASPRICE', 'PRES', 'INCOME20', 'WAGE20',
       'NEED2EARN', 'STAN5', 'CHILDGEN', 'CHILDSTAN'],
      dtype='object', length=144)

In [103]:
common_columns = df_general_union.columns.intersection(df_economists_union.columns)

df_general_common = df_general_union[common_columns]
df_economists_common = df_economists_union[common_columns]

# Concatenar os DataFrames
df_union = pd.concat([df_general_common, df_economists_common], ignore_index=True)

df_union

  df_union = pd.concat([df_general_common, df_economists_common], ignore_index=True)


Unnamed: 0,intv,q901,q905,q908,q910,q918,race,sex,state,region,...,TRADEJOB,WHYGASSD,GASPRICE,PRES,INCOME20,WAGE20,NEED2EARN,STAN5,CHILDGEN,CHILDSTAN
0,1.0,DEMOCRAT,YES,LIBERAL,,$25000-29999,WHITE,FEMALE,37.0,SOUTH,...,COST JOBS,INCREASE PROFITS,PRESIDENT,DELINE IN MORAL VALUES,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,FALL,LOWER,
1,2.0,REPUBLICAN,NO,VERY CONSERVATIVE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,CREATE JOBS,NEITHER,ABOUT THE SAME,NOT ENOUGH MONEY,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,HIGHER
2,3.0,DEMOCRAT,NO,MODERATE,,$10000-19999,WHITE,FEMALE,27.0,SOUTH,...,COST JOBS,SUPPLY AND DEMAND,PRESIDENT,NOT ENOUGH MONEY,STAYING EVEN,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,
3,4.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,FEMALE,37.0,SOUTH,...,COST JOBS,INCREASE PROFITS,PRESIDENT,HEALTH,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,
4,5.0,INDEPENDENT,YES,MODERATE,,$50000-74999,WHITE,MALE,35.0,SOUTH,...,COST JOBS,INCREASE PROFITS,PRESIDENT,HEALTH,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,STAYED SAME,HIGHER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2104,595.0,REPUBLICAN,YES,MODERATE,,$50000-74999,WHITE,MALE,10.0,MIDWEST,...,CREATE JOBS,SUPPLY AND DEMAND,ABOUT RIGHT,BEYOND PRESIDENT CONTROL,GOING UP,GOING UP,ONE WAGE EARNER,RISE,HIGHER,HIGHER
2105,596.0,,,,,,,,8.0,EAST,...,,,,,,,,,,
2106,597.0,,,,,,,,23.0,SOUTH,...,,,,,,,,,,
2107,598.0,REPUBLICAN,YES,CONSERVATIVE,,$100000 OR MORE,WHITE,MALE,49.0,WEST,...,NO DIFFERENCE,SUPPLY AND DEMAND,,BEYOND PRESIDENT CONTROL,GOING UP,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,


# preparando o df para as regressões

## selecionando as variáveis de controle

In [104]:
## Selecionando somente as renomeadas e a variável de controle

df_filtered = df_union[['econ', 'male', 'othrace', 'white', 'asian', 'black', 'jobsecurity', 'yourlast5', 'yournext5', 'income', 'dem', 'rep', 'indep', 'othparty', 'ideology', 'othideol', 'education', 'TAXESHIGH', 'DEFICIT', 'FORAID', 'IMMIG', 'TAXBREAK', 'INADEDUC', 'WELFARE', 'AA', 'HARDWORK', 'REG', 'SAVINGS', 'PROFHIGH', 'EXECPAY', 'BUSPROD', 'TECH', 'OVERSEAS', 'DOWNSIZE', 'COMPEDUC', 'TAXCUT', 'WOMENWORK', 'TECHGOOD', 'TRADEAG', 'DOWNGOOD', 'CHANGE20', 'TRADEJOB', 'WHYGASSD', 'GASPRICE', 'PRES', 'INCOME20', 'WAGE20', 'NEED2EARN', 'STAN5', 'CHILDGEN', 'CHILDSTAN']].copy()

df_filtered

Unnamed: 0,econ,male,othrace,white,asian,black,jobsecurity,yourlast5,yournext5,income,...,TRADEJOB,WHYGASSD,GASPRICE,PRES,INCOME20,WAGE20,NEED2EARN,STAN5,CHILDGEN,CHILDSTAN
0,0,0,0,1,0,0,1.0,0.0,0.0,4.0,...,COST JOBS,INCREASE PROFITS,PRESIDENT,DELINE IN MORAL VALUES,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,FALL,LOWER,
1,0,0,0,1,0,0,2.0,2.0,2.0,7.0,...,CREATE JOBS,NEITHER,ABOUT THE SAME,NOT ENOUGH MONEY,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,HIGHER
2,0,0,0,1,0,0,3.0,2.0,1.0,2.0,...,COST JOBS,SUPPLY AND DEMAND,PRESIDENT,NOT ENOUGH MONEY,STAYING EVEN,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,
3,0,0,0,1,0,0,3.0,1.0,1.0,7.0,...,COST JOBS,INCREASE PROFITS,PRESIDENT,HEALTH,STAYING EVEN,STAYING EVEN,ONE WAGE EARNER,STAYED SAME,HIGHER,
4,0,1,0,1,0,0,3.0,2.0,1.0,7.0,...,COST JOBS,INCREASE PROFITS,PRESIDENT,HEALTH,FALLING BEHIND,FALLING BEHIND,TWO WAGE EARNERS,STAYED SAME,HIGHER,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2104,1,1,0,1,0,0,3.0,2.0,2.0,7.0,...,CREATE JOBS,SUPPLY AND DEMAND,ABOUT RIGHT,BEYOND PRESIDENT CONTROL,GOING UP,GOING UP,ONE WAGE EARNER,RISE,HIGHER,HIGHER
2105,1,0,0,0,0,0,,,,,...,,,,,,,,,,
2106,1,0,0,0,0,0,,,,,...,,,,,,,,,,
2107,1,1,0,1,0,0,3.0,1.0,1.0,9.0,...,NO DIFFERENCE,SUPPLY AND DEMAND,,BEYOND PRESIDENT CONTROL,GOING UP,STAYING EVEN,TWO WAGE EARNERS,RISE,HIGHER,


In [105]:
# Query para codificação das respostas das perguntas
query = '''
SELECT
    CASE 
        WHEN TAXESHIGH = 'NOT A REASON AT ALL' THEN 0
        WHEN TAXESHIGH = 'MINOR REASON' THEN 1
        WHEN TAXESHIGH = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS TAXESHIGH,
    
    CASE 
        WHEN DEFICIT = 'NOT A REASON AT ALL' THEN 0
        WHEN DEFICIT = 'MINOR REASON' THEN 1
        WHEN DEFICIT = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS DEFICIT,
    
    CASE 
        WHEN FORAID = 'NOT A REASON AT ALL' THEN 0
        WHEN FORAID = 'MINOR REASON' THEN 1
        WHEN FORAID = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS FORAID,
    
    CASE 
        WHEN IMMIG = 'NOT A REASON AT ALL' THEN 0
        WHEN IMMIG = 'MINOR REASON' THEN 1
        WHEN IMMIG = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS IMMIG,
    
    CASE 
        WHEN TAXBREAK = 'NOT A REASON AT ALL' THEN 0
        WHEN TAXBREAK = 'MINOR REASON' THEN 1
        WHEN TAXBREAK = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS TAXBREAK,
    
    CASE 
        WHEN INADEDUC = 'NOT A REASON AT ALL' THEN 0
        WHEN INADEDUC = 'MINOR REASON' THEN 1
        WHEN INADEDUC = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS INADEDUC,
    
    CASE 
        WHEN WELFARE = 'NOT A REASON AT ALL' THEN 0
        WHEN WELFARE = 'MINOR REASON' THEN 1
        WHEN WELFARE = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS WELFARE,
    
    CASE 
        WHEN AA = 'NOT A REASON AT ALL' THEN 0
        WHEN AA = 'MINOR REASON' THEN 1
        WHEN AA = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS AA,
    
    CASE 
        WHEN HARDWORK = 'NOT A REASON AT ALL' THEN 0
        WHEN HARDWORK = 'MINOR REASON' THEN 1
        WHEN HARDWORK = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS HARDWORK,
    
    CASE 
        WHEN REG = 'NOT A REASON AT ALL' THEN 0
        WHEN REG = 'MINOR REASON' THEN 1
        WHEN REG = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS REG,
    
    CASE 
        WHEN SAVINGS = 'NOT A REASON AT ALL' THEN 0
        WHEN SAVINGS = 'MINOR REASON' THEN 1
        WHEN SAVINGS = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS SAVINGS,
    
    CASE 
        WHEN PROFHIGH = 'NOT A REASON AT ALL' THEN 0
        WHEN PROFHIGH = 'MINOR REASON' THEN 1
        WHEN PROFHIGH = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS PROFHIGH,
    
    CASE 
        WHEN EXECPAY = 'NOT A REASON AT ALL' THEN 0
        WHEN EXECPAY = 'MINOR REASON' THEN 1
        WHEN EXECPAY = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS EXECPAY,
    
    CASE 
        WHEN BUSPROD = 'NOT A REASON AT ALL' THEN 0
        WHEN BUSPROD = 'MINOR REASON' THEN 1
        WHEN BUSPROD = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS BUSPROD,
    
    CASE 
        WHEN TECH = 'NOT A REASON AT ALL' THEN 0
        WHEN TECH = 'MINOR REASON' THEN 1
        WHEN TECH = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS TECH,
    
    CASE 
        WHEN OVERSEAS = 'NOT A REASON AT ALL' THEN 0
        WHEN OVERSEAS = 'MINOR REASON' THEN 1
        WHEN OVERSEAS = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS OVERSEAS,
    
    CASE 
        WHEN DOWNSIZE = 'NOT A REASON AT ALL' THEN 0
        WHEN DOWNSIZE = 'MINOR REASON' THEN 1
        WHEN DOWNSIZE = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS DOWNSIZE,
    
    CASE 
        WHEN COMPEDUC = 'NOT A REASON AT ALL' THEN 0
        WHEN COMPEDUC = 'MINOR REASON' THEN 1
        WHEN COMPEDUC = 'MAJOR REASON' THEN 2
        ELSE NULL 
    END AS COMPEDUC,
    
    CASE 
        WHEN TAXCUT = 'BAD' THEN 0
        WHEN TAXCUT = 'NO DIFFERENCE' THEN 1
        WHEN TAXCUT = 'GOOD' THEN 2
        ELSE NULL 
    END AS TAXCUT,
    
    CASE 
        WHEN WOMENWORK = 'BAD' THEN 0
        WHEN WOMENWORK = 'NO DIFFERENCE' THEN 1
        WHEN WOMENWORK = 'GOOD' THEN 2
        ELSE NULL 
    END AS WOMENWORK,
    
    CASE 
        WHEN TECHGOOD = 'BAD' THEN 0
        WHEN TECHGOOD = 'NO DIFFERENCE' THEN 1
        WHEN TECHGOOD = 'GOOD' THEN 2
        ELSE NULL 
    END AS TECHGOOD,
    
    CASE 
        WHEN TRADEAG = 'BAD' THEN 0
        WHEN TRADEAG = 'NO DIFFERENCE' THEN 1
        WHEN TRADEAG = 'GOOD' THEN 2
        ELSE NULL 
    END AS TRADEAG,
    
    CASE 
        WHEN DOWNGOOD = 'BAD' THEN 0
        WHEN DOWNGOOD = 'NO DIFFERENCE' THEN 1
        WHEN DOWNGOOD = 'GOOD' THEN 2
        ELSE NULL 
    END AS DOWNGOOD,
    
    CASE 
        WHEN TRADEJOB = 'COST JOBS' THEN 0
        WHEN TRADEJOB = 'NO DIFFERENCE' THEN 1
        WHEN TRADEJOB = 'CREATE JOBS' THEN 2
        ELSE NULL 
    END AS TRADEJOB,
    
    CASE 
        WHEN WHYGASSD = 'INCREASE PROFITS' THEN 0
        WHEN WHYGASSD = 'SUPPLY AND DEMAND' THEN 1
        WHEN WHYGASSD = 'BOTH' THEN 1
        WHEN WHYGASSD = 'NEITHER' THEN 0
        ELSE NULL 
    END AS WHYGASSD,
    
    CASE 
        WHEN INCOME20 = 'FALLING BEHIND' THEN 0
        WHEN INCOME20 = 'STAYING EVEN' THEN 1
        WHEN INCOME20 = 'GOING UP' THEN 2
        ELSE NULL 
    END AS INCOME20,
    
    CASE 
        WHEN WAGE20 = 'FALLING BEHIND' THEN 0
        WHEN WAGE20 = 'STAYING EVEN' THEN 1
        WHEN WAGE20 = 'GOING UP' THEN 2
        ELSE NULL 
    END AS WAGE20,
    
    CASE 
        WHEN NEED2EARN = 'ONE WAGE EARNER' THEN 0
        WHEN NEED2EARN = 'TWO WAGE EARNERS' THEN 1
        ELSE NULL 
    END AS NEED2EARN,
    
    CASE 
        WHEN STAN5 = 'FALL' THEN 0
        WHEN STAN5 = 'STAYED SAME' THEN 1
        WHEN STAN5 = 'RISE' THEN 2
        ELSE NULL 
    END AS STAN5,
    
    CASE 
        WHEN CHILDGEN = 'LOWER' THEN 0
        WHEN CHILDGEN = 'STAY SAME' THEN 1
        WHEN CHILDGEN = 'HIGHER' THEN 2
        ELSE NULL 
    END AS CHILDGEN
    
    , case 
        when CHILDSTAN = 'LOWER' then 0
        when CHILDSTAN = 'STAY SAME' then 1
        when CHILDSTAN = 'HIGHER' then 2
        else NULL end as CHILDSTAN
        
    , econ
    , male
    , othrace
    , white
    , asian
    , black
    , jobsecurity
    , yourlast5
    , yournext5
    , income
    , dem
    , rep
    , indep
    , othparty
    , ideology
    , othideol
    , education
FROM
    df_filtered
'''

df_perfect = ps.sqldf(query, locals())

df_perfect

Unnamed: 0,TAXESHIGH,DEFICIT,FORAID,IMMIG,TAXBREAK,INADEDUC,WELFARE,AA,HARDWORK,REG,...,yourlast5,yournext5,income,dem,rep,indep,othparty,ideology,othideol,education
0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,0.0,0.0,2.0,...,0.0,0.0,4.0,1,0,0,0,-1.0,0,5.0
1,2.0,2.0,2.0,2.0,0.0,2.0,1.0,1.0,2.0,2.0,...,2.0,2.0,7.0,0,1,0,0,2.0,0,6.0
2,1.0,2.0,2.0,1.0,1.0,2.0,2.0,0.0,2.0,1.0,...,2.0,1.0,2.0,1,0,0,0,0.0,0,3.0
3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,...,1.0,1.0,7.0,0,0,1,0,0.0,0,5.0
4,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,...,2.0,1.0,7.0,0,0,1,0,0.0,0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2104,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,2.0,7.0,0,1,0,0,0.0,0,7.0
2105,,,,,,,,,,,...,,,,0,0,0,0,,0,7.0
2106,,,,,,,,,,,...,,,,0,0,0,0,,0,7.0
2107,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,...,1.0,1.0,9.0,0,1,0,0,1.0,0,7.0


## Fazendo as regressões 

In [130]:
### selecionando as variáveis de controle em uma lista

## fazendo uma lista das variaveis de controle para colocar no formula do modelo como 'x1 + x2 + x3 + ...'
### as variáveis serão 'econ', 'male', 'othrace', 'white', 'asian', 'black', 'jobsecurity', 'yourlast5', 'yournext5', 'income', 'dem', 'rep', 'indep', 'othparty', 'ideology', 'othideol', 'education'.

controle_formula = 'econ + male + othrace + white + asian + black + jobsecurity + yourlast5 + yournext5 + income + dem + rep + indep + othparty + ideology + othideol + education'

controle = ['econ', 'male', 'othrace', 'white', 'asian', 'black', 'jobsecurity', 'yourlast5', 'yournext5', 'income', 'dem', 'rep', 'indep', 'othparty', 'ideology', 'othideol', 'education']

In [131]:
df_taxeshigh = df_perfect[['TAXESHIGH'] + controle]
df_taxeshigh.dropna(inplace=True)

#convertendo tudo para numerico
df_taxeshigh = df_taxeshigh.apply(pd.to_numeric)
df_taxeshigh

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_taxeshigh.dropna(inplace=True)


Unnamed: 0,TAXESHIGH,econ,male,othrace,white,asian,black,jobsecurity,yourlast5,yournext5,income,dem,rep,indep,othparty,ideology,othideol,education
0,2.0,0,0,0,1,0,0,1.0,0.0,0.0,4.0,1,0,0,0,-1.0,0,5.0
1,2.0,0,0,0,1,0,0,2.0,2.0,2.0,7.0,0,1,0,0,2.0,0,6.0
2,1.0,0,0,0,1,0,0,3.0,2.0,1.0,2.0,1,0,0,0,0.0,0,3.0
3,2.0,0,0,0,1,0,0,3.0,1.0,1.0,7.0,0,0,1,0,0.0,0,5.0
4,2.0,0,1,0,1,0,0,3.0,2.0,1.0,7.0,0,0,1,0,0.0,0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2101,0.0,1,1,0,1,0,0,3.0,2.0,1.0,9.0,1,0,0,0,-1.0,0,7.0
2103,2.0,1,1,0,1,0,0,0.0,2.0,1.0,7.0,0,1,0,0,2.0,0,7.0
2104,1.0,1,1,0,1,0,0,3.0,2.0,2.0,7.0,0,1,0,0,0.0,0,7.0
2107,1.0,1,1,0,1,0,0,3.0,1.0,1.0,9.0,0,1,0,0,1.0,0,7.0


In [135]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Normalizar as variáveis independentes
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_encoded)

# Adicionar a coluna constante para o intercepto
X_normalized = sm.add_constant(X_normalized)

# Verificar multicolinearidade usando VIF
vif = pd.DataFrame()
vif['Variável'] = X_encoded.columns
vif['VIF'] = [variance_inflation_factor(X_normalized[:, 1:], i) for i in range(X_normalized.shape[1] - 1)]
print(vif)

# Ajustar o modelo novamente com variáveis normalizadas
model_normalized = OrderedModel(y_array, X_normalized, distr='logit')
result_normalized = model_normalized.fit(method='lbfgs')
print(result_normalized.summary())


       Variável        VIF
0          econ   1.582016
1          male   1.187916
2       othrace  22.345778
3         white  45.069141
4         asian   9.032009
5         black  21.529186
6   jobsecurity   1.126149
7     yourlast5   1.560029
8     yournext5   1.266762
9        income   1.740050
10          dem  32.898803
11          rep  30.223380
12        indep  32.594139
13     othparty   4.848688
14     ideology   1.165104
15     othideol        NaN
16    education   1.649298


  return 1 - self.ssr/self.uncentered_tss


ValueError: There should not be a constant in the model

In [141]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Suponha que X e y sejam seus dados e rótulos
X = df_taxeshigh[['econ', 'male', 'othrace', 'asian', 'black', 'jobsecurity', 'yourlast5', 'yournext5', 'income', 'othparty', 'ideology', 'education']]
y = df_taxeshigh['TAXESHIGH']

# Escalonar os dados
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Criar o modelo de regressão logística com regularização L2 (Ridge)
model = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='ovr')
model.fit(X_scaled, y)

# Coeficientes e interceptos
print("Coeficientes:", model.coef_)
print("Interceptos:", model.intercept_)


Coeficientes: [[ 0.33026677  0.01730805  0.03588499  0.08807689 -0.10815955  0.08675445
   0.32308602 -0.21426281  0.10322498 -0.04723854 -0.57759452  0.58945196]
 [-0.13810088  0.17467492 -0.14123617  0.0083797  -0.02112859  0.12249222
   0.06438545  0.10980648 -0.09712685  0.07821066 -0.20116044  0.29456265]
 [-0.25297447 -0.19070159  0.11707994 -0.06377452  0.03664515 -0.16304403
  -0.20966979 -0.01996926  0.07889167 -0.05309472  0.45431201 -0.4633948 ]]
Interceptos: [-2.44538721 -0.65585714  0.0930964 ]




In [137]:
# Remover variáveis com altos VIF
variables_to_remove = ['white', 'dem', 'rep', 'indep']  # Baseado nos VIF altos
X_reduced = X_encoded.drop(columns=variables_to_remove)

# Recalcular o VIF
X_reduced_with_const = sm.add_constant(X_reduced)
vif_reduced = pd.DataFrame()
vif_reduced['Variável'] = X_reduced.columns
vif_reduced['VIF'] = [variance_inflation_factor(X_reduced_with_const.values, i + 1) for i in range(X_reduced_with_const.shape[1] - 1)]
print(vif_reduced)

# Ajustar o modelo novamente com variáveis reduzidas
X_reduced_array = np.asarray(X_reduced_with_const, dtype=float)
model_reduced = OrderedModel(y_array, X_reduced_array, distr='logit')
result_reduced = model_reduced.fit(method='lbfgs')
print(result_reduced.summary())


       Variável       VIF
0          econ  1.554304
1          male  1.184156
2       othrace  1.027898
3         asian  1.016131
4         black  1.059234
5   jobsecurity  1.119285
6     yourlast5  1.558429
7     yournext5  1.259280
8        income  1.737069
9      othparty  1.014238
10     ideology  1.028606
11     othideol       NaN
12    education  1.641895


  return 1 - self.ssr/self.centered_tss


ValueError: There should not be a constant in the model

In [139]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Definir a fórmula e os dados
formula = 'TAXESHIGH ~ econ + male + othrace + asian + black + jobsecurity + yourlast5 + yournext5 + income + othparty + ideology + education'
X = df_taxeshigh[['econ', 'male', 'othrace', 'asian', 'black', 'jobsecurity', 'yourlast5', 'yournext5', 'income', 'othparty', 'ideology', 'education']]
y = df_taxeshigh['TAXESHIGH']

# Ajustar o modelo
model = OrderedModel(y, X, distr='logit')

# Ajustar o modelo usando diferentes métodos
try:
    result_lbfgs = model.fit(method='lbfgs')
    print("Resultado com 'lbfgs':")
    print(result_lbfgs.summary())
except Exception as e:
    print(f"Erro ao ajustar com 'lbfgs': {e}")

try:
    result_newton = model.fit(method='newton')
    print("Resultado com 'newton':")
    print(result_newton.summary())
except Exception as e:
    print(f"Erro ao ajustar com 'newton': {e}")

try:
    result_bfgs = model.fit(method='bfgs')
    print("Resultado com 'bfgs':")
    print(result_bfgs.summary())
except Exception as e:
    print(f"Erro ao ajustar com 'bfgs': {e}")


Resultado com 'lbfgs':
                             OrderedModel Results                             
Dep. Variable:              TAXESHIGH   Log-Likelihood:                -1313.1
Model:                   OrderedModel   AIC:                             2654.
Method:            Maximum Likelihood   BIC:                             2729.
Date:                Sun, 15 Sep 2024                                         
Time:                        13:24:23                                         
No. Observations:                1551                                         
Df Residuals:                    1537                                         
Df Model:                          12                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
econ           -0.9535      0.182     -5.240      0.000      -1.310      -0.597
male           -0.3151    

In [133]:
# As regressões serão de logit ordenado, usando formula = 'y ~ x1 + x2 + x3 + ... + xn'

# Função para rodar a regressão logit ordenada
model = sm.MNLogit.from_formula(formula='TAXESHIGH ~ econ + male + othrace + white + asian + black + jobsecurity + yourlast5 + yournext5 + income + dem + rep + indep + othparty + ideology + othideol + education', data=df_taxeshigh)

# Ajustar o modelo usando o método 'bfgs' (como exemplo adicional)
result_bfgs = model.fit(method='bfgs')
print("Resultado com 'bfgs':")
print(result_bfgs.summary())

# Ajustar o modelo usando o método 'lbfgs'
result_lbfgs = model.fit(method='lbfgs')
print("Resultado com 'lbfgs':")
print(result_lbfgs.summary())

# Ajustar o modelo usando o método 'newton'
result_newton = model.fit(method='newton')
print("Resultado com 'newton':")
print(result_newton.summary())




         Current function value: 0.829818
         Iterations: 35
         Function evaluations: 37
         Gradient evaluations: 37
Resultado com 'bfgs':


  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


ValueError: need covariance of parameters for computing (unnormalized) covariances

In [125]:
# Ajustar o modelo usando o método 'lbfgs'
result_lbfgs = model.fit(method='lbfgs')
print("Resultado com 'lbfgs':")
print(result_lbfgs.summary())

# Ajustar o modelo usando o método 'newton'
result_newton = model.fit(method='newton')
print("Resultado com 'newton':")
print(result_newton.summary())

# Ajustar o modelo usando o método 'bfgs' (como exemplo adicional)
result_bfgs = model.fit(method='bfgs')
print("Resultado com 'bfgs':")
print(result_bfgs.summary())


Resultado com 'lbfgs':




ValueError: need covariance of parameters for computing (unnormalized) covariances

In [127]:
# Ajustar o modelo usando um método diferente
result = model.fit(method='newton')  # Experimente 'newton', 'lbfgs', ou 'bfgs'

# Caso o ajuste continue falhando, considere reduzir a complexidade do modelo
result.summary()

Optimization terminated successfully.
         Current function value: 0.832040
         Iterations 99


LinAlgError: Singular matrix

In [126]:
df_taxeshigh.dropna(inplace=True)

# Separar a variável dependente e variáveis independentes
y = df_taxeshigh['TAXESHIGH']
X = df_taxeshigh.drop(columns=['TAXESHIGH'])

print(y)
print(X)

# Criar dummies para variáveis independentes categóricas
X_encoded = pd.get_dummies(X, drop_first=True)

# Garantir que os dados sejam convertidos para NumPy arrays de tipo float
X_array = np.asarray(X_encoded, dtype=float)
y_array = np.asarray(y, dtype=float)

# Criar o modelo logit ordenado sem fórmula
model = OrderedModel(y_array, X_array, distr='logit')

# Ajustar o modelo usando o método 'lbfgs'
result_lbfgs = model.fit(method='lbfgs')
print("Resultado com 'lbfgs':")
print(result_lbfgs.summary())

# Ajustar o modelo usando o método 'newton'
result_newton = model.fit(method='newton')
print("Resultado com 'newton':")
print(result_newton.summary())

# Ajustar o modelo usando o método 'bfgs' (como exemplo adicional)
result_bfgs = model.fit(method='bfgs')
print("Resultado com 'bfgs':")
print(result_bfgs.summary())



0       2.0
1       2.0
2       1.0
3       2.0
4       2.0
       ... 
2101    0.0
2103    2.0
2104    1.0
2107    1.0
2108    1.0
Name: TAXESHIGH, Length: 1551, dtype: float64
      econ  male  othrace  white  asian  black  jobsecurity  yourlast5  \
0        0     0        0      1      0      0          1.0        0.0   
1        0     0        0      1      0      0          2.0        2.0   
2        0     0        0      1      0      0          3.0        2.0   
3        0     0        0      1      0      0          3.0        1.0   
4        0     1        0      1      0      0          3.0        2.0   
...    ...   ...      ...    ...    ...    ...          ...        ...   
2101     1     1        0      1      0      0          3.0        2.0   
2103     1     1        0      1      0      0          0.0        2.0   
2104     1     1        0      1      0      0          3.0        2.0   
2107     1     1        0      1      0      0          3.0        1.0   
2108    

LinAlgError: Singular matrix