## Helper Functions

In [1]:
import subprocess
def c_import(library, elements=None, name=None, always_reimport= True, always_reinstall = True):
  if elements:
    import_str = f'from {library} import {", ".join(elements)}'
    tested_install_var = ", ".join(elements)
    any_not_installed = True if any(e not in globals() for e in elements) else False
  else:
    import_str = f'import {library}'
    tested_install_var = library
    any_not_installed = True if library not in globals() else False
  if name:
    import_str = f'{import_str} as {name}'
    tested_install_var = name
    any_not_installed = True if name not in globals() else False

  def sub_install():
    subprocess.run(f'pip install {library}', shell=True, check=True)
    print(f'Library {library} installed successfully.')

  def sub_import():
    exec(import_str, globals())
    print(f'Library {library} imported successfully. As: \n {import_str}')

  if always_reinstall:
    try:
      sub_install()
      sub_import()
    except subprocess.CalledProcessError:
      print(f'Failed to install {library}.')
    except ImportError as err:
      print(f'After Install. Import error: {err}')

  else:
    if always_reimport == True or any_not_installed == True:
        try:
          sub_import()
        except ImportError as err:
          print(f'Import error: {err}')
          if library in str(err):
            try:
                # Use subprocess to run the pip install command
                sub_install()
                sub_import()
            except subprocess.CalledProcessError:
                print(f'Failed to install {library}.')
    else:
      print(f'"{tested_install_var}" already installed and imported')

## Import Libraries

In [2]:
import_config = {'always_reimport': False, 'always_reinstall': False}
c_import('pandas',name='pd',**import_config)
c_import('csv',**import_config)
c_import('json',**import_config)
c_import('os',**import_config)
c_import('subprocess',**import_config)
c_import('tqdm.notebook',['tqdm'],**import_config)
c_import('concurrent.futures',['ProcessPoolExecutor'],**import_config)
c_import('datetime',**import_config)
c_import('argparse', **import_config)
c_import('codecs', **import_config)
c_import('os', **import_config)
c_import('sys', **import_config)
c_import('numpy',name='np', **import_config)

Library pandas imported successfully. As: 
 import pandas as pd
Library csv imported successfully. As: 
 import csv
Library json imported successfully. As: 
 import json
"os" already installed and imported
"subprocess" already installed and imported
Library tqdm.notebook imported successfully. As: 
 from tqdm.notebook import tqdm
Library concurrent.futures imported successfully. As: 
 from concurrent.futures import ProcessPoolExecutor
Library datetime imported successfully. As: 
 import datetime
Library argparse imported successfully. As: 
 import argparse
Library codecs imported successfully. As: 
 import codecs
"os" already installed and imported
"sys" already installed and imported
Library numpy imported successfully. As: 
 import numpy as np


## Directories

In [3]:
input_dir = "../input/reddit_input/"
output_dir = "../output/reddit_output/"
# split_files_dir = input_dir+'split_data/' #split JSONs output
liwcdic_file_dir = '../input/Spanish_LIWC2007_Dictionary.dic'
# Specify the folder path where you want to search for .zst files
zst_files_dir = '/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/input/reddit_input/'

## Model Functions

### LIWC Function Setup

In [4]:
class liwc:

    def load_liwc_dict(self, liwcdic_file):
        file_content = codecs.open(liwcdic_file, "r", "utf-8").read()
        cate_text = file_content[file_content.find("%")+1:file_content[1:].find("%")].strip()
        for line in cate_text.split("\n"):
            self.liwc_cate_name_by_number[int(line.strip().split("\t")[0])] = line.strip().split("\t")[1]

        dict_text = file_content[file_content[1:].find("%")+2:].strip()
        for line in dict_text.split("\n"):
            self.liwc_cate_number_by_word[line.strip().split("\t")[0]] = set([int(item) for item in line.strip().split("\t")[1:]])

    def __init__(self, liwcdic_file=liwcdic_file_dir):

        self.liwc_category_names = ["WC",'Funct', 'TotPron', 'PronPer', 'Yo', 'Nosotro', 'TuUtd', 'ElElla', 'Ellos', 'PronImp', 'Articulo', 'Verbos', 'VerbAux', 'Pasado', 'Present', 'Futuro', 'Adverb', 'Prepos', 'Conjunc', 'Negacio', 'Cuantif', 'Numeros', 'Maldec', 'verbYO', 'verbTU', 'verbNOS', 'verbosEL', 'verbELLOS', 'Subjuntiv', 'VosUtds', 'formal', 'informal', 'verbVos', 'Social', 'Familia', 'Amigos', 'Humanos', 'Afect', 'EmoPos', 'EmoNeg', 'Ansiedad', 'Enfado', 'Triste', 'MecCog', 'Insight', 'Causa', 'Discrep', 'Tentat', 'Certeza', 'Inhib', 'Incl', 'Excl', 'Percept', 'Ver', 'Oir', 'Sentir', 'Biolog', 'Cuerpo', 'Salud', 'Sexual', 'Ingerir', 'Relativ', 'Movim', 'Espacio', 'Tiempo', 'Trabajo', 'Logro', 'Placer', 'Hogar', 'Dinero', 'Relig', 'Muerte', 'Asentir', 'NoFluen', 'Relleno']
        self.liwc_cate_name_by_number = {}
        self.liwc_cate_number_by_word = {}

        if os.path.exists(liwcdic_file) == False:

            sys.exit()
        else:
            self.load_liwc_dict(liwcdic_file)

    def getLIWCCount(self, text):
        count_by_categories = {"WC":0,'Funct': 0, 'TotPron': 0, 'PronPer': 0, 'Yo': 0, 'Nosotro': 0, 'TuUtd': 0, 'ElElla': 0, 'Ellos': 0, 'PronImp': 0, 'Articulo': 0, 'Verbos': 0, 'VerbAux': 0, 'Pasado': 0, 'Present': 0, 'Futuro': 0, 'Adverb': 0, 'Prepos': 0, 'Conjunc': 0, 'Negacio': 0, 'Cuantif': 0, 'Numeros': 0, 'Maldec': 0, 'verbYO': 0, 'verbTU': 0, 'verbNOS': 0, 'verbosEL': 0, 'verbELLOS': 0, 'Subjuntiv': 0, 'VosUtds': 0, 'formal': 0, 'informal': 0, 'verbVos': 0, 'Social': 0, 'Familia': 0, 'Amigos': 0, 'Humanos': 0, 'Afect': 0, 'EmoPos': 0, 'EmoNeg': 0, 'Ansiedad': 0, 'Enfado': 0, 'Triste': 0, 'MecCog': 0, 'Insight': 0, 'Causa': 0, 'Discrep': 0, 'Tentat': 0, 'Certeza': 0, 'Inhib': 0, 'Incl': 0, 'Excl': 0, 'Percept': 0, 'Ver': 0, 'Oir': 0, 'Sentir': 0, 'Biolog': 0, 'Cuerpo': 0, 'Salud': 0, 'Sexual': 0, 'Ingerir': 0, 'Relativ': 0, 'Movim': 0, 'Espacio': 0, 'Tiempo': 0, 'Trabajo': 0, 'Logro': 0, 'Placer': 0, 'Hogar': 0, 'Dinero': 0, 'Relig': 0, 'Muerte': 0, 'Asentir': 0, 'NoFluen': 0, 'Relleno': 0}

        count_by_categories["WC"] = len(text.split())

        for word in text.split():

            cate_numbers_word_belongs = set([])
            if word in self.liwc_cate_number_by_word:
                cate_numbers_word_belongs = self.liwc_cate_number_by_word[word]

            else:

                #liwc words have *. eg: balcon*
                word = word[:-1]
                while len(word) > 0:
                    if (word+"*") in self.liwc_cate_number_by_word:
                        cate_numbers_word_belongs = self.liwc_cate_number_by_word[word+"*"]
                        break
                    else:
                        word = word[:-1]

            for num in cate_numbers_word_belongs:
                count_by_categories[self.liwc_cate_name_by_number[num]] += 1

        return count_by_categories

## Setup

##### Testing if .csv is stored correctly

In [9]:
# Specify the path to your CSV file
csv_file_path = '../input/yt_input/Ministerio_Turismo_Deportes-transcripts.csv'


# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path,sep=',', escapechar='\\')

with pd.option_context('display.max_columns', None, 'display.max_rows', 50):
  display(df.head(50))

Unnamed: 0.1,Unnamed: 0,video_id,published_date,channel_name,channel_category,channel_id,is_autogenerated,transcription
0,0,8xQb_cW66go,2023-10-27,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,[Música] la observación de aves es una activid...
1,1,EJVdlvuEnDc,2023-10-03,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,[Música] llegamos a trevelin hace 22 años y co...
2,2,yxyWA7s6hb4,2023-09-20,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,la naturaleza me dio todo es una manera de man...
3,3,0m5xNv3q4W4,2023-09-13,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,la puna significa libertad es un lugar donde u...
4,4,0soZLnZXHOA,2023-08-31,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,llegó la quinta edición de previaje armar las ...
5,5,2XNpB79fx9k,2023-08-28,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,sentir la fuerza del río es sentir que todos l...
6,6,PlMjn5upzto,2023-07-04,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,tenés ganas de conocer todo esto entra a la ru...
7,7,Ps0p8zvEEGs,2023-04-28,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,exitosa edición de previaje un millón de turis...
8,8,VkrghSMKDPQ,2023-04-14,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,6 millones de personas lo usaron viajando por ...
9,9,DE_jHNj4Yrw,2023-03-27,Ministerio de Turismo y deportes,Government Ministries,UC3m5QCp-mE8PpdaA1zUrTuA,True,este circuito es un circuito histórico cultura...


## LIWC

In [10]:
yt_files_titles = [
  'Ministerio_Ambiente_Desarrollo_Sostenible-transcripts',
  'Ministerio_Ciencia_Tecnología_Innovación-transcripts',
  'Ministerio_Cultura-transcripts',
  'Ministerio_Defensa-transcripts',
  'Ministerio_Desarrollo_Social-transcripts',
  'Ministerio_Economía-transcripts',
  'Ministerio_Educación-transcripts',
  'Ministerio_Interior-transcripts',
  'Ministerio_Justicia_Derechos_Humanos-transcripts',
  'Ministerio_Relaciones_Exteriores_Comercio_Internacional_Culto-transcripts',
  'Ministerio_Salud-transcripts',
  'Ministerio_Seguridad-transcripts',
  'Ministerio_Trabajo_Empleo_Seguridad_Social-transcripts',
  'Ministerio_Transporte-transcripts',
  'Ministerio_Turismo_Deportes-transcripts',
  'Ciudad_Autonoma_Buenos_Aires-transcripts',
  'Provincia_Buenos_Aires-transcripts',
  'Provincia_Catamarca-transcripts',
  'Provincia_Chubut-transcripts',
  'Provincia_Córdoba-transcripts',
  'Provincia_Corrientes-transcripts',
  'Diputados-transcripts',
  'Senado-transcripts',
]

In [12]:
def remove_invalids(col='', df=''):
  invalid_options = [np.nan, 'nan', None, 0, '0', 'NaN', '[deleted]', '[removed]']
  df = df.dropna(subset=[col], how='all')
  df = df[~df[col].isin(invalid_options)]
  return df

for source in yt_files_titles:

  source_dir = f'../input/yt_input/{source}.csv'
  source_output_dir = f'../output/reddit_output/filtered_tables_LIWC_count/{source}-liwc_output.csv'

  df_output_table = pd.read_csv(source_dir,sep=',')

  df_output_table = remove_invalids('transcription', df_output_table)

  # if 'title' in df_output_table.columns: #if its a submission (has title)
  #     #remove invalid/incomplete obs
  #     df_output_table = remove_invalids('title', df_output_table)
  #     #join title and text (yes fillna just in case something passed previous cleanup)
  #     df_output_table['text'] = df_output_table['title'].fillna('') + '\n' + df_output_table['text'].fillna('')
  #     #now drop it
  #     df_output_table = df_output_table.drop(columns=['title'])
      
  tqdm.pandas(desc = f'Computing {source} LIWC')
  # Assuming you have the LIWC counts as a Series

  '''
  liwc_counts = df_output_table['text'].progress_apply(liwc().getLIWCCount)

  # Create new DataFrame from the LIWC counts
  liwc_df = pd.DataFrame(liwc_counts.tolist())
  
  # Concatenate the new DataFrame with the original DataFrame
  df_output_table = pd.concat([df_output_table, liwc_df], axis=1)

  #Now your DataFrame will have columns for each LIWC category
  df_output_table.to_csv(source_output_dir, index=False, sep=',', quoting=csv.QUOTE_ALL, escapechar='\\')
  '''

In [14]:
import re

LIWC_ALL = []
LIWC_dummy = liwc().getLIWCCount('')
HEADERS = list(LIWC_dummy.keys())
HEADERS.sort()
#print(HEADERS)
liwc_vector = []
for head in HEADERS:
    liwc_vector.append(head)

LIWC_ALL.append(liwc_vector)    
for index, row in tqdm(df_output_table.iterrows()):
    #print('Row Index:', index)
    
    text = row['transcription'].lower().replace('\n',' ')
    #print(text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r' +',' ',text)
    LIWC_raw = liwc().getLIWCCount(text)
    LIWC_norm = {}
    for item in LIWC_raw:
        if item != 'WC':    #==
            LIWC_norm[item] = LIWC_raw[item]/LIWC_raw['WC']
        else:
            LIWC_norm['WC'] = LIWC_raw['WC']
    #print(LIWC_norm)
    liwc_vector = []    
    for head in HEADERS:
        liwc_vector.append(LIWC_norm[head])

    LIWC_ALL.append(liwc_vector)
    # You can access specific column values using row['Column_Name']
    #print()
  

0it [00:00, ?it/s]

In [16]:
LIWC_ALL = []

LIWC_dummy = liwc().getLIWCCount('')
HEADERS = list(LIWC_dummy.keys())
HEADERS.sort()


liwc_vector = []

for head in HEADERS:
    liwc_vector.append(head)

LIWC_ALL.append(liwc_vector)

data_list = []

for index, row in tqdm(df_output_table.iterrows()):
    text = row['transcription'].lower().replace('\n', ' ')
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r' +', ' ', text)
    #TIDphto +append    
    LIWC_raw = liwc().getLIWCCount(text)
    LIWC_norm = {}
    
    for item in LIWC_raw:
        if item != 'WC':
            LIWC_norm[item] = LIWC_raw[item] / LIWC_raw['WC']
        else:
            LIWC_norm['WC'] = LIWC_raw['WC']
    
    liwc_vector = {}
    
    for head in HEADERS:
        liwc_vector[head] = LIWC_norm[head]
    
    data_list.append(liwc_vector)

# Create a DataFrame from the list of dictionaries
liwc_df = pd.DataFrame(data_list)


0it [00:00, ?it/s]

In [17]:
# Print the resulting DataFrame
display(liwc_df)


Unnamed: 0,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,...,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
0,0.068657,0.014925,0.005970,0.000000,0.122388,0.000000,0.002985,0.011940,0.008955,0.074627,...,335,0.002985,0.000000,0.000000,0.008955,0.002985,0.008955,0.000000,0.002985,0.029851
1,0.076485,0.038242,0.003255,0.000814,0.100895,0.005696,0.008137,0.017087,0.012205,0.056143,...,1229,0.010578,0.001627,0.000814,0.008950,0.018714,0.000000,0.000000,0.015460,0.036615
2,0.080972,0.054656,0.000000,0.000000,0.117409,0.016194,0.008097,0.016194,0.024291,0.076923,...,494,0.002024,0.000000,0.000000,0.016194,0.018219,0.000000,0.000000,0.016194,0.038462
3,0.073171,0.039634,0.000000,0.000000,0.118902,0.000000,0.009146,0.024390,0.018293,0.057927,...,328,0.000000,0.003049,0.000000,0.006098,0.015244,0.000000,0.000000,0.039634,0.051829
4,0.059979,0.037229,0.001034,0.000000,0.113754,0.008273,0.001034,0.018614,0.009307,0.059979,...,967,0.007239,0.002068,0.008273,0.010341,0.013444,0.003102,0.001034,0.028956,0.058945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14074,0.064506,0.037734,0.002224,0.000874,0.112965,0.002939,0.015968,0.020019,0.018351,0.059263,...,12588,0.008818,0.001986,0.004369,0.010963,0.015253,0.004131,0.000000,0.016762,0.045917
14075,0.046782,0.025770,0.001322,0.001454,0.124356,0.002511,0.015462,0.014933,0.014669,0.050747,...,7567,0.007797,0.000132,0.005022,0.008722,0.011894,0.003568,0.000132,0.014272,0.036606
14076,0.046746,0.020893,0.000902,0.000752,0.128964,0.002255,0.008718,0.012325,0.014129,0.053510,...,6653,0.003457,0.000000,0.004058,0.014580,0.009920,0.003758,0.000301,0.009920,0.029160
14077,0.079562,0.037295,0.002984,0.000497,0.074590,0.007459,0.006962,0.014918,0.014421,0.076082,...,2011,0.013923,0.002486,0.002486,0.009945,0.015415,0.001989,0.000497,0.015415,0.033814


In [18]:
print(len(liwc_df))
print(len(df_output_table))
df_output_table = df_output_table.reset_index(drop=True)
# LIWC_df = pd.DataFrame(LIWC_ALL, columns=HEADERS)
merge_df = pd.concat([df_output_table, liwc_df], axis=1)


14079
14079


In [19]:
print(len(merge_df))

# df_output_table = df_output_table.reset_index(drop=True)
# df_output_table.tail(50)


14079


In [25]:
merge_df.drop(columns=['Unnamed: 0'], inplace=True)

with pd.option_context('display.max_columns', None):
    display(merge_df)

Unnamed: 0,video_id,published_date,channel_name,channel_category,channel_id,is_autogenerated,transcription,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,Cuantif,Cuerpo,Dinero,Discrep,ElElla,Ellos,EmoNeg,EmoPos,Enfado,Espacio,Excl,Familia,Funct,Futuro,Hogar,Humanos,Incl,Ingerir,Inhib,Insight,Logro,Maldec,MecCog,Movim,Muerte,Negacio,NoFluen,Nosotro,Numeros,Oir,Pasado,Percept,Placer,Prepos,Present,PronImp,PronPer,Relativ,Relig,Relleno,Salud,Sentir,Sexual,Social,Subjuntiv,Tentat,Tiempo,TotPron,Trabajo,Triste,TuUtd,Ver,VerbAux,Verbos,VosUtds,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
0,yZSC3X-KyOs,2023-11-03,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,[Música] el 19 de noviembre las y los argentin...,0.068657,0.014925,0.005970,0.000000,0.122388,0.000000,0.002985,0.011940,0.008955,0.074627,0.026866,0.000000,0.002985,0.002985,0.053731,0.017910,0.002985,0.011940,0.002985,0.065672,0.038806,0.000000,0.510448,0.002985,0.000000,0.000000,0.098507,0.002985,0.000000,0.020896,0.011940,0.000000,0.188060,0.017910,0.000000,0.026866,0.000000,0.002985,0.023881,0.017910,0.000000,0.023881,0.002985,0.143284,0.077612,0.050746,0.056716,0.107463,0.002985,0.0,0.000000,0.000000,0.000000,0.062687,0.014925,0.026866,0.026866,0.107463,0.008955,0.000000,0.000000,0.002985,0.005970,0.089552,0.000000,335,0.002985,0.000000,0.000000,0.008955,0.002985,0.008955,0.000000,0.002985,0.029851
1,Kt9cCOWVJRg,2023-11-02,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,aquí estamos chicas en el salón de los pasos p...,0.076485,0.038242,0.003255,0.000814,0.100895,0.005696,0.008137,0.017087,0.012205,0.056143,0.017087,0.000814,0.008137,0.009764,0.069976,0.015460,0.001627,0.034988,0.000000,0.059398,0.016273,0.001627,0.502034,0.000000,0.002441,0.005696,0.074044,0.007323,0.000814,0.020342,0.024410,0.000000,0.197722,0.017087,0.000000,0.017901,0.013019,0.019528,0.010578,0.008137,0.013019,0.024410,0.006509,0.157038,0.087063,0.056143,0.084622,0.100895,0.004068,0.0,0.000000,0.000000,0.000000,0.117982,0.010578,0.022783,0.028478,0.140765,0.023596,0.000814,0.004882,0.005696,0.019528,0.104150,0.001627,1229,0.010578,0.001627,0.000814,0.008950,0.018714,0.000000,0.000000,0.015460,0.036615
2,06Wo-DE_Yh8,2023-11-02,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,senadora por estar en resumen Bueno un día eh ...,0.080972,0.054656,0.000000,0.000000,0.117409,0.016194,0.008097,0.016194,0.024291,0.076923,0.020243,0.002024,0.002024,0.016194,0.070850,0.012146,0.004049,0.054656,0.004049,0.028340,0.026316,0.004049,0.528340,0.000000,0.006073,0.002024,0.066802,0.006073,0.002024,0.044534,0.018219,0.000000,0.275304,0.016194,0.004049,0.026316,0.012146,0.020243,0.004049,0.006073,0.014170,0.024291,0.004049,0.119433,0.099190,0.089069,0.072874,0.066802,0.000000,0.0,0.002024,0.006073,0.000000,0.089069,0.016194,0.026316,0.022267,0.161943,0.012146,0.000000,0.000000,0.004049,0.004049,0.115385,0.000000,494,0.002024,0.000000,0.000000,0.016194,0.018219,0.000000,0.000000,0.016194,0.038462
3,YRH1nTt0uQw,2023-11-02,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,un senador aquí estamos no sé si si nos ven pe...,0.073171,0.039634,0.000000,0.000000,0.118902,0.000000,0.009146,0.024390,0.018293,0.057927,0.030488,0.003049,0.006098,0.015244,0.079268,0.021341,0.000000,0.033537,0.000000,0.057927,0.006098,0.000000,0.530488,0.000000,0.009146,0.000000,0.070122,0.006098,0.000000,0.042683,0.033537,0.000000,0.243902,0.015244,0.000000,0.012195,0.000000,0.012195,0.012195,0.000000,0.021341,0.027439,0.009146,0.143293,0.097561,0.073171,0.082317,0.091463,0.003049,0.0,0.000000,0.003049,0.000000,0.091463,0.027439,0.042683,0.024390,0.155488,0.021341,0.000000,0.000000,0.012195,0.012195,0.125000,0.003049,328,0.000000,0.003049,0.000000,0.006098,0.015244,0.000000,0.000000,0.039634,0.051829
4,rdgxs_EXgv0,2023-11-02,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,guz que es del colectivo hipotecados uba autoc...,0.059979,0.037229,0.001034,0.000000,0.113754,0.008273,0.001034,0.018614,0.009307,0.059979,0.018614,0.000000,0.034126,0.012410,0.055843,0.011375,0.002068,0.037229,0.000000,0.036194,0.020683,0.002068,0.513961,0.000000,0.009307,0.002068,0.071355,0.001034,0.000000,0.011375,0.018614,0.001034,0.193382,0.023785,0.000000,0.020683,0.007239,0.020683,0.022751,0.012410,0.031024,0.015512,0.006205,0.129266,0.103413,0.074457,0.074457,0.095140,0.000000,0.0,0.000000,0.000000,0.000000,0.102378,0.029990,0.024819,0.037229,0.148914,0.020683,0.001034,0.010341,0.001034,0.021717,0.134436,0.003102,967,0.007239,0.002068,0.008273,0.010341,0.013444,0.003102,0.001034,0.028956,0.058945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14074,IzRYTwr9N5U,2013-08-15,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,[Música] o muy buenas tardes bienvenidos a un ...,0.064506,0.037734,0.002224,0.000874,0.112965,0.002939,0.015968,0.020019,0.018351,0.059263,0.020655,0.003257,0.003495,0.013028,0.060534,0.016206,0.006276,0.031538,0.004369,0.048379,0.015650,0.002145,0.499047,0.000477,0.002542,0.009056,0.078090,0.008024,0.001748,0.024865,0.021290,0.000318,0.215920,0.024468,0.000556,0.015570,0.001033,0.010248,0.013664,0.015729,0.019463,0.033206,0.011122,0.137909,0.114156,0.064188,0.077296,0.095329,0.003893,0.0,0.002224,0.002224,0.004528,0.093581,0.015729,0.020099,0.027486,0.141484,0.025500,0.000397,0.006991,0.006673,0.014697,0.137671,0.002224,12588,0.008818,0.001986,0.004369,0.010963,0.015253,0.004131,0.000000,0.016762,0.045917
14075,s1tGVEGQw_E,2013-08-14,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,gracias trabajando dando todo lo mismo es un h...,0.046782,0.025770,0.001322,0.001454,0.124356,0.002511,0.015462,0.014933,0.014669,0.050747,0.012290,0.004361,0.002775,0.011894,0.059337,0.014801,0.005947,0.019294,0.003832,0.055504,0.009779,0.002643,0.486190,0.000793,0.004097,0.006872,0.082463,0.007797,0.003436,0.028413,0.017180,0.000000,0.194265,0.027223,0.000529,0.012158,0.000793,0.010176,0.014272,0.007929,0.023523,0.026827,0.011762,0.157658,0.092771,0.055768,0.073609,0.100304,0.001454,0.0,0.001850,0.002511,0.003040,0.086824,0.013876,0.018501,0.024184,0.129378,0.026034,0.000793,0.005947,0.007004,0.012026,0.120655,0.000925,7567,0.007797,0.000132,0.005022,0.008722,0.011894,0.003568,0.000132,0.014272,0.036606
14076,0gop-6EyrV0,2013-08-07,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,no trabajando todos los habitantes y cuanto a ...,0.046746,0.020893,0.000902,0.000752,0.128964,0.002255,0.008718,0.012325,0.014129,0.053510,0.009770,0.002255,0.006012,0.008718,0.067639,0.021043,0.004810,0.014730,0.002706,0.055464,0.013678,0.002104,0.488952,0.003607,0.005561,0.004509,0.086427,0.004960,0.003457,0.031865,0.018037,0.000000,0.190741,0.021344,0.001202,0.014881,0.000601,0.004660,0.016233,0.009018,0.014730,0.025252,0.012325,0.156771,0.088231,0.053359,0.076507,0.097851,0.004960,0.0,0.000902,0.001804,0.001653,0.079363,0.013377,0.020442,0.028258,0.129866,0.027055,0.000752,0.006313,0.007966,0.014881,0.109875,0.000451,6653,0.003457,0.000000,0.004058,0.014580,0.009920,0.003758,0.000301,0.009920,0.029160
14077,1gApCLOYw_s,2013-08-07,Senado,Repblic Goverment Argentina,UCStm2uvQKowSXrnCzTWAn6Q,True,quiero presentar a dos músicos que han tocado ...,0.079562,0.037295,0.002984,0.000497,0.074590,0.007459,0.006962,0.014918,0.014421,0.076082,0.025858,0.002486,0.002486,0.008951,0.034809,0.011437,0.004973,0.032322,0.001989,0.047240,0.019891,0.000995,0.448036,0.000497,0.000497,0.001492,0.092989,0.001492,0.000000,0.019393,0.004475,0.000000,0.202387,0.015912,0.000497,0.030333,0.016907,0.004973,0.012929,0.140229,0.009448,0.154649,0.007956,0.094480,0.093486,0.055694,0.056688,0.088016,0.000000,0.0,0.001492,0.004475,0.002486,0.058180,0.006962,0.018399,0.026852,0.112382,0.004475,0.000497,0.002486,0.008951,0.017902,0.105420,0.004475,2011,0.013923,0.002486,0.002486,0.009945,0.015415,0.001989,0.000497,0.015415,0.033814


In [27]:
print_df = merge_df.copy()
print_df.drop(columns=['transcription','is_autogenerated','channel_id'], inplace=True)

print_df.to_csv('--yt-tmp_liwc.csv', index=False, sep=',')

In [36]:
# Assuming 'df_copy' is your DataFrame
count_non_zero_WC = (print_df['WC'] != 0).sum()
n_rows= len(print_df)

print(f'Number of rows with WC different from 0: {count_non_zero_WC} of {n_rows} ({round(count_non_zero_WC/n_rows, 4)*100}%)')

Number of rows with WC different from 0: 14079 of 14079 (100.0%)


In [150]:
# for source in tqdm(reddit_files_titles):
#   source_dir = f'../output/reddit_output/filtered_tables_LIWC_count/{source}-liwc_output.csv'
#   output_dir = f'../output/reddit_output/daily_LIWC_averages/{source}-liwc_output.csv'

df = print_df #pd.read_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/programs/---tmp_liwc.csv',sep=',')
LIWC_keys = list(
  df.columns[
    list(df.columns).index('day') + 1 #Gets the last column before LIWC count
    :  ]
  )

for key in LIWC_keys:
    df[key] = pd.to_numeric(df[key], errors='coerce')

df.head(20)

result_df = df.groupby(['year', 'month', 'day'])[LIWC_keys].mean().reset_index()

# Calculate the count of rows for each date
count_df = df.groupby(['year', 'month', 'day'])['id'].count().reset_index()
count_df.rename(columns={'id': 'Obs'}, inplace=True)

# Merge the average data with the count data
result_df = pd.merge(result_df, count_df, on=['year', 'month', 'day'])

# Save the results to a new DataFrame
# result_df.to_csv('averages_by_date_with_count.csv', index=False)
result_df.to_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/programs/---grouped-tmp_liwc.csv', index=False, sep=',', quoting=csv.QUOTE_ALL, escapechar='\\')

In [151]:
from IPython.display import display

with pd.option_context('display.max_columns', None):
    display(result_df.head(10))


Unnamed: 0,year,month,day,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,Cuantif,Cuerpo,Dinero,Discrep,ElElla,Ellos,EmoNeg,EmoPos,Enfado,Espacio,Excl,Familia,Funct,Futuro,Hogar,Humanos,Incl,Ingerir,Inhib,Insight,Logro,Maldec,MecCog,Movim,Muerte,Negacio,NoFluen,Nosotro,Numeros,Oir,Pasado,Percept,Placer,Prepos,Present,PronImp,PronPer,Relativ,Relig,Relleno,Salud,Sentir,Sexual,Social,Subjuntiv,Tentat,Tiempo,TotPron,Trabajo,Triste,TuUtd,Ver,VerbAux,Verbos,VosUtds,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL,Obs
0,2018,1,2,0.051348,0.021231,0.000797,0.000797,0.087747,0.003554,0.017085,0.016335,0.016516,0.063601,0.028136,0.001161,0.012617,0.018997,0.059911,0.016288,0.00925,0.014921,0.005126,0.033148,0.026563,0.001595,0.494184,0.0,0.007038,0.001595,0.056908,0.01057,0.008928,0.041031,0.022412,0.0,0.260774,0.016494,0.002392,0.024968,0.0,0.0,0.013031,0.008908,0.011161,0.033876,0.00868,0.139103,0.12224,0.08149,0.082896,0.064743,0.0,0.0,0.004351,0.0,0.004329,0.075294,0.015354,0.053148,0.022415,0.164385,0.038611,0.0,0.005946,0.007085,0.011437,0.142082,0.000797,286.333333,0.019864,0.0,0.001161,0.016335,0.001595,0.001161,0.0,0.023918,0.035127,3
1,2018,1,4,0.053924,0.024355,0.00576,0.004717,0.079868,0.002358,0.021724,0.019911,0.008118,0.082996,0.045036,0.002358,0.002358,0.00576,0.055513,0.039276,0.012835,0.009161,0.008118,0.046352,0.047122,0.0,0.500658,0.0,0.004717,0.002358,0.050298,0.019365,0.004717,0.03143,0.010477,0.0,0.289854,0.024355,0.0,0.028799,0.0,0.0,0.017007,0.01728,0.020954,0.049207,0.008118,0.101094,0.139279,0.115197,0.068348,0.086173,0.0,0.0,0.0,0.0,0.0,0.100821,0.021724,0.070434,0.024628,0.183545,0.004717,0.0,0.004717,0.00576,0.015194,0.17071,0.0,179.5,0.008118,0.0,0.004717,0.010204,0.002358,0.0,0.0,0.024082,0.063904,2
2,2018,1,6,0.020408,0.040816,0.0,0.0,0.040816,0.0,0.020408,0.0,0.0,0.061224,0.0,0.0,0.0,0.0,0.040816,0.020408,0.020408,0.020408,0.020408,0.020408,0.020408,0.0,0.469388,0.0,0.0,0.0,0.081633,0.0,0.0,0.061224,0.020408,0.0,0.244898,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.020408,0.040816,0.020408,0.142857,0.142857,0.122449,0.061224,0.040816,0.0,0.0,0.0,0.020408,0.020408,0.061224,0.020408,0.061224,0.020408,0.183673,0.0,0.0,0.0,0.0,0.020408,0.142857,0.0,49.0,0.020408,0.0,0.0,0.020408,0.0,0.020408,0.0,0.020408,0.0,1
3,2018,1,7,0.046875,0.05125,0.0,0.0,0.1025,0.0,0.015625,0.0,0.035625,0.046875,0.05125,0.0,0.0,0.0,0.1025,0.05125,0.0,0.05125,0.0,0.055625,0.015625,0.0,0.525625,0.0,0.0,0.0,0.055625,0.015625,0.0,0.086875,0.0,0.0,0.311875,0.0,0.0,0.015625,0.0,0.0,0.02,0.0,0.0,0.066875,0.0,0.106875,0.118125,0.11125,0.1025,0.07125,0.0,0.0,0.0,0.0,0.0,0.09125,0.0,0.035625,0.015625,0.21375,0.0,0.0,0.0,0.0,0.0,0.138125,0.0,28.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.055625,2
4,2018,1,8,0.029762,0.02381,0.0,0.0,0.136905,0.0,0.005952,0.005952,0.011905,0.053571,0.011905,0.0,0.0,0.011905,0.029762,0.011905,0.0,0.029762,0.0,0.017857,0.029762,0.0,0.375,0.0,0.0,0.005952,0.041667,0.005952,0.0,0.017857,0.017857,0.005952,0.14881,0.0,0.0,0.011905,0.0,0.017857,0.011905,0.0,0.011905,0.017857,0.0,0.071429,0.089286,0.047619,0.035714,0.041667,0.005952,0.0,0.0,0.005952,0.0,0.095238,0.005952,0.017857,0.02381,0.083333,0.011905,0.0,0.005952,0.005952,0.041667,0.107143,0.0,168.0,0.005952,0.0,0.0,0.005952,0.017857,0.0,0.0,0.017857,0.035714,1
5,2018,1,9,0.042553,0.042553,0.0,0.0,0.12766,0.0,0.0,0.021277,0.0,0.042553,0.021277,0.0,0.0,0.0,0.06383,0.021277,0.0,0.06383,0.0,0.042553,0.042553,0.0,0.489362,0.0,0.0,0.0,0.021277,0.0,0.0,0.06383,0.0,0.0,0.191489,0.021277,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.042553,0.021277,0.170213,0.085106,0.106383,0.085106,0.085106,0.0,0.0,0.0,0.021277,0.0,0.042553,0.0,0.06383,0.021277,0.191489,0.042553,0.0,0.0,0.021277,0.0,0.085106,0.021277,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,1
6,2018,1,11,0.1,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.3,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.1,0.15,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.1,0.0,0.05,0.0,0.2,0.0,0.0,0.05,0.0,0.0,0.1,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,2018,1,12,0.034975,0.077362,0.0,0.0,0.122744,0.01018,0.017992,0.019456,0.009513,0.036569,0.01031,0.0,0.0,0.001701,0.032371,0.017089,0.017886,0.059475,0.014485,0.057799,0.012808,0.002604,0.457541,0.0,0.0,0.004305,0.034975,0.0,0.001701,0.040744,0.003401,0.001701,0.138272,0.005102,0.0,0.01031,0.0,0.0,0.002604,0.006803,0.014722,0.044382,0.004305,0.132745,0.111694,0.039435,0.079193,0.105346,0.0,0.0,0.0,0.02036,0.020597,0.056393,0.012011,0.012808,0.044145,0.118627,0.001701,0.001701,0.010977,0.013818,0.0,0.138296,0.0,70.75,0.045121,0.0,0.001701,0.017089,0.0,0.0,0.0,0.019693,0.037816,4
8,2018,1,14,0.075269,0.096774,0.0,0.0,0.075269,0.0,0.032258,0.010753,0.032258,0.064516,0.021505,0.010753,0.0,0.010753,0.021505,0.021505,0.0,0.096774,0.0,0.021505,0.032258,0.0,0.408602,0.0,0.0,0.0,0.032258,0.0,0.0,0.010753,0.0,0.0,0.150538,0.064516,0.0,0.021505,0.0,0.010753,0.010753,0.0,0.032258,0.0,0.0,0.107527,0.139785,0.053763,0.032258,0.129032,0.0,0.0,0.0,0.0,0.032258,0.086022,0.010753,0.032258,0.064516,0.086022,0.032258,0.0,0.0,0.0,0.032258,0.182796,0.0,93.0,0.010753,0.0,0.0,0.0,0.010753,0.0,0.0,0.075269,0.0,1
9,2018,1,17,0.041096,0.020548,0.006849,0.0,0.082192,0.013699,0.0,0.020548,0.013699,0.082192,0.020548,0.0,0.0,0.0,0.041096,0.006849,0.0,0.027397,0.0,0.061644,0.027397,0.0,0.513699,0.0,0.0,0.0,0.054795,0.0,0.006849,0.006849,0.047945,0.0,0.205479,0.034247,0.0,0.006849,0.0,0.006849,0.020548,0.027397,0.013699,0.034247,0.013699,0.089041,0.109589,0.09589,0.109589,0.109589,0.0,0.0,0.0,0.006849,0.0,0.116438,0.020548,0.041096,0.041096,0.205479,0.061644,0.0,0.034247,0.0,0.0,0.123288,0.006849,146.0,0.034247,0.0,0.027397,0.020548,0.0,0.0,0.0,0.006849,0.027397,1


In [152]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a DataFrame
df_embi = pd.read_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/input/EMBI.csv')

In [153]:
import pandas as pd
import statsmodels.api as sm

# Load your data, assuming you have already loaded df_embi and df_liwc

# Set the date range
start_date = '2018-01-01'
end_date = '2022-12-31'

# Convert 'year', 'month', and 'day' columns to datetime
df_embi['date'] = pd.to_datetime(df_embi[['year', 'month', 'day']], format='%Y-%m-%d')
result_df['date'] = pd.to_datetime(result_df[['year', 'month', 'day']], format='%Y-%m-%d')

# Filter the data to the date range
df_embi = df_embi[(df_embi['date'] >= start_date) & (df_embi['date'] <= end_date)].dropna()#.drop(columns=['year', 'month', 'day'])
df_liwc = result_df[(result_df['date'] >= start_date) & (result_df['date'] <= end_date)].dropna()#.drop(columns=['year', 'month', 'day'])

df_embi = df_embi.drop(columns=['year', 'month', 'day'])
df_liwc = df_liwc.drop(columns=['year', 'month', 'day','Obs'])

# Set 'date' as the index for both DataFrames
df_embi.set_index('date', inplace=True)
df_liwc.set_index('date', inplace=True)

# Perform an inner merge on the 'date' index to ensure common dates
merged_data = df_embi.merge(df_liwc, left_index=True, right_index=True)



In [154]:
display(merged_data.head(3))

Unnamed: 0_level_0,EMBI,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,...,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,347.24,0.051348,0.021231,0.000797,0.000797,0.087747,0.003554,0.017085,0.016335,0.016516,...,286.333333,0.019864,0.0,0.001161,0.016335,0.001595,0.001161,0.0,0.023918,0.035127
2018-01-04,356.23,0.053924,0.024355,0.00576,0.004717,0.079868,0.002358,0.021724,0.019911,0.008118,...,179.5,0.008118,0.0,0.004717,0.010204,0.002358,0.0,0.0,0.024082,0.063904
2018-01-08,365.45,0.029762,0.02381,0.0,0.0,0.136905,0.0,0.005952,0.005952,0.011905,...,168.0,0.005952,0.0,0.0,0.005952,0.017857,0.0,0.0,0.017857,0.035714


In [155]:
# Separate the dependent variable (y) and independent variables (X)
y = list(merged_data['EMBI'])
X = merged_data.drop(columns=['EMBI']).values.tolist()

# Perform the regression analysis
X = sm.add_constant(X)


In [158]:
# Check the data types of y and X
#print("Data type of y:", type(y))
#print("Data type of X:", type(X))
#print(y)
#print(X)
print(len(y))
print(len(X))
Y_temp = []
for item in y:
    Y_temp.append(float(item.replace(',', '')))

y = list(Y_temp)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the linear regression model on the training data
reg = LinearRegression()
reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg.predict(X_test)


# Calculate statistics to evaluate the model
mse = mean_squared_error(np.array(y_test), y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)

936
936
Mean Squared Error: 845035.7954909843
R-squared (R2) Score: -0.10999058308769061


In [86]:
X[0]

array([ 1.        , 35.66666667, 15.66666667,  6.        ,  2.66666667,
        0.        ,  0.33333333,  0.        ,  2.66666667,  1.33333333,
        3.33333333,  2.33333333,  4.        ,  0.33333333,  0.        ,
        4.        ,  0.        ,  1.        ,  4.33333333,  2.        ,
        0.        ,  0.66666667,  0.33333333,  0.        ,  0.66666667,
        0.33333333,  0.        ,  1.        ,  0.33333333,  0.33333333,
        0.        ,  0.        ,  0.        ,  0.        ,  2.66666667,
        0.        ,  0.        ,  0.        ,  1.66666667,  1.33333333,
        0.33333333,  0.        ,  0.33333333,  0.        ,  8.66666667,
        2.33333333,  0.        ,  0.        ,  1.66666667,  0.66666667,
        0.        ,  2.33333333,  0.66666667,  2.        ,  0.        ,
        0.        ,  0.33333333,  0.66666667,  0.        ,  0.        ,
        0.33333333,  0.33333333,  1.66666667,  0.        ,  1.33333333,
        0.33333333,  0.        ,  0.33333333,  0.33333333,  0.  

In [60]:
model = sm.OLS(y, X.astype(float))
results = model.fit()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
model = sm.OLS(y,X)
results = model.fit()
results.params

# model = sm.OLS(y, X).fit()

# Print the regression results
# print(model.summary())


In [37]:
for x in X:
    print(x)
# X_2 = X.values.tolist()
# X_2[1][:2]

[ 1.         35.66666667 15.66666667  6.          2.66666667  0.
  0.33333333  0.          2.66666667  1.33333333  3.33333333  2.33333333
  4.          0.33333333  0.          4.          0.          1.
  4.33333333  2.          0.          0.66666667  0.33333333  0.
  0.66666667  0.33333333  0.          1.          0.33333333  0.33333333
  0.          0.          0.          0.          2.66666667  0.
  0.          0.          1.66666667  1.33333333  0.33333333  0.
  0.33333333  0.          8.66666667  2.33333333  0.          0.
  1.66666667  0.66666667  0.          2.33333333  0.66666667  2.
  0.          0.          0.33333333  0.66666667  0.          0.
  0.33333333  0.33333333  1.66666667  0.          1.33333333  0.33333333
  0.          0.33333333  0.33333333  0.          0.          0.
  0.          0.          0.          0.        ]
[ 1.  36.  16.5  5.5  2.   0.   0.   0.   2.   1.   3.5  3.   2.   0.
  0.   2.   0.   1.5  5.5  2.   1.   0.5  0.   0.   0.   0.   0.   1.
  0.  

In [20]:
print(y)#one col


['347.24', '356.23', '365.45', '360.94', '363.59', '360', '365.37', '366.58', '368.72', '376.11', '378.91', '374.5', '377.82', '398.27', '395.44', '384.38', '407.12', '428.01', '414.47', '423.85', '415.43', '392.41', '398.73', '394.81', '403.55', '405.44', '397.66', '405.11', '418.72', '413.28', '411.17', '409.11', '415.57', '407.53', '388.51', '403.9', '420.94', '412.63', '420.11', '427.19', '421.19', '419.5', '425.3', '417.38', '416.93', '419.34', '414.85', '405.57', '395.99', '398.23', '400.14', '402.62', '418.68', '419.15', '432.33', '469.33', '453.56', '488.62', '488.35', '467.46', '479.25', '480.21', '448.79', '459.53', '457.2', '453.72', '496.67', '500.59', '521.16', '511.8', '480.21', '472.92', '465.31', '478.71', '473.87', '485.66', '502.35', '549.91', '565.52', '542.57', '533.21', '546.32', '558.33', '583.91', '608.42', '563.98', '566.72', '578.1', '568.56', '575.42', '582.76', '574.46', '581.65', '571.87', '576.13', '574.35', '553.42', '548.98', '549.56', '555.77', '558.82',

In [None]:
print(X)#list of lists [35 15 6 2.6]

In [None]:
while pd.option_context('display.max_rows', None):
    display(X.dtypes)
    display(y.dtypes)


In [None]:
# Set the date range
start_date = '2018-01-01'
end_date = '2022-12-31'

# Convert 'year', 'month', and 'day' columns to datetime
df_embi['date'] = pd.to_datetime(df_embi[['year', 'month', 'day']], format='%Y-%m-%d')

# Create a boolean mask for the date range
mask = (df_embi['date'] >= start_date) & (df_embi['date'] <= end_date)

# Apply the mask to select rows within the date range
selected_data = df_embi[mask]
selected_data.set_index("date", inplace=True)

# Convert 'year', 'month', and 'day' columns to datetime
result_df['date'] = pd.to_datetime(result_df[['year', 'month', 'day']], format='%Y-%m-%d')

# Create a boolean mask for the date range
mask = (result_df['date'] >= start_date) & (result_df['date'] <= end_date)

# Apply the mask to select rows within the date range
df_liwc = result_df[mask]
df_liwc.set_index("date", inplace=True)

In [86]:
y = selected_data['EMBI']
X = df_liwc[['WC', 'Funct', 'TotPron', 'PronPer', 'Yo', 'Nosotro', 'TuUtd', 'ElElla', 'Ellos', 'PronImp',
          'Articulo', 'Verbos', 'VerbAux', 'Pasado', 'Present', 'Futuro', 'Adverb', 'Prepos', 'Conjunc',
          'Negacio', 'Cuantif', 'Numeros', 'Maldec', 'verbYO', 'verbTU', 'verbNOS', 'verbosEL',
          'verbELLOS', 'Subjuntiv', 'VosUtds', 'formal', 'informal', 'verbVos', 'Social', 'Familia',
          'Amigos', 'Humanos', 'Afect', 'EmoPos', 'EmoNeg', 'Ansiedad', 'Enfado', 'Triste', 'MecCog',
          'Insight', 'Causa', 'Discrep', 'Tentat', 'Certeza', 'Inhib', 'Incl', 'Excl', 'Percept', 'Ver',
          'Oir', 'Sentir', 'Biolog', 'Cuerpo', 'Salud', 'Sexual', 'Ingerir', 'Relativ', 'Movim', 'Espacio',
          'Tiempo', 'Trabajo', 'Logro', 'Placer', 'Hogar', 'Dinero', 'Relig', 'Muerte', 'Asentir', 'NoFluen',
          'Relleno']]

# Extract common dates
common_dates = selected_data.index.intersection(df_liwc.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')

# Filter X and y using the extracted date
X = df_liwc.loc[common_dates]
y = selected_data['EMBI'].loc[common_dates]

In [95]:

y = pd.to_numeric(y, errors='coerce')

for key in X:
    X[key] = pd.to_numeric(X[key], errors='coerce')
X = X.dropna()
y = y.dropna()
# y = y[X.index]  # Update y to match the new X
# Extract common dates
common_dates = selected_data.index.intersection(df_liwc.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')



# Extract common dates
common_dates = selected_data.index.intersection(X.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')
# Filter X and y using the extracted date
X = X.loc[common_dates]
y = y.loc[common_dates]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[key] = pd.to_numeric(X[key], errors='coerce')


KeyError: "[Timestamp('2018-01-01 00:00:00'), Timestamp('2018-01-11 00:00:00'), Timestamp('2018-01-15 00:00:00'), Timestamp('2018-02-19 00:00:00'), Timestamp('2018-03-30 00:00:00'), Timestamp('2018-05-28 00:00:00'), Timestamp('2018-07-04 00:00:00'), Timestamp('2018-10-08 00:00:00'), Timestamp('2018-11-12 00:00:00'), Timestamp('2018-11-22 00:00:00'), Timestamp('2018-12-05 00:00:00'), Timestamp('2018-12-25 00:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-21 00:00:00'), Timestamp('2019-02-18 00:00:00'), Timestamp('2019-04-19 00:00:00'), Timestamp('2019-05-27 00:00:00'), Timestamp('2019-06-03 00:00:00'), Timestamp('2019-07-04 00:00:00'), Timestamp('2019-08-12 00:00:00'), Timestamp('2019-08-13 00:00:00'), Timestamp('2019-08-14 00:00:00'), Timestamp('2019-08-15 00:00:00'), Timestamp('2019-08-16 00:00:00'), Timestamp('2019-08-19 00:00:00'), Timestamp('2019-08-20 00:00:00'), Timestamp('2019-08-21 00:00:00'), Timestamp('2019-08-22 00:00:00'), Timestamp('2019-08-23 00:00:00'), Timestamp('2019-08-26 00:00:00'), Timestamp('2019-08-27 00:00:00'), Timestamp('2019-08-28 00:00:00'), Timestamp('2019-08-29 00:00:00'), Timestamp('2019-08-30 00:00:00'), Timestamp('2019-09-02 00:00:00'), Timestamp('2019-09-03 00:00:00'), Timestamp('2019-09-04 00:00:00'), Timestamp('2019-09-05 00:00:00'), Timestamp('2019-09-06 00:00:00'), Timestamp('2019-09-09 00:00:00'), Timestamp('2019-09-10 00:00:00'), Timestamp('2019-09-11 00:00:00'), Timestamp('2019-09-12 00:00:00'), Timestamp('2019-09-13 00:00:00'), Timestamp('2019-09-16 00:00:00'), Timestamp('2019-09-17 00:00:00'), Timestamp('2019-09-18 00:00:00'), Timestamp('2019-09-19 00:00:00'), Timestamp('2019-09-20 00:00:00'), Timestamp('2019-09-23 00:00:00'), Timestamp('2019-09-24 00:00:00'), Timestamp('2019-09-25 00:00:00'), Timestamp('2019-09-26 00:00:00'), Timestamp('2019-09-27 00:00:00'), Timestamp('2019-09-30 00:00:00'), Timestamp('2019-10-01 00:00:00'), Timestamp('2019-10-02 00:00:00'), Timestamp('2019-10-03 00:00:00'), Timestamp('2019-10-04 00:00:00'), Timestamp('2019-10-07 00:00:00'), Timestamp('2019-10-08 00:00:00'), Timestamp('2019-10-09 00:00:00'), Timestamp('2019-10-10 00:00:00'), Timestamp('2019-10-11 00:00:00'), Timestamp('2019-10-14 00:00:00'), Timestamp('2019-10-15 00:00:00'), Timestamp('2019-10-16 00:00:00'), Timestamp('2019-10-17 00:00:00'), Timestamp('2019-10-18 00:00:00'), Timestamp('2019-10-21 00:00:00'), Timestamp('2019-10-22 00:00:00'), Timestamp('2019-10-23 00:00:00'), Timestamp('2019-10-24 00:00:00'), Timestamp('2019-10-25 00:00:00'), Timestamp('2019-10-28 00:00:00'), Timestamp('2019-10-29 00:00:00'), Timestamp('2019-10-30 00:00:00'), Timestamp('2019-10-31 00:00:00'), Timestamp('2019-11-01 00:00:00'), Timestamp('2019-11-04 00:00:00'), Timestamp('2019-11-05 00:00:00'), Timestamp('2019-11-06 00:00:00'), Timestamp('2019-11-07 00:00:00'), Timestamp('2019-11-08 00:00:00'), Timestamp('2019-11-11 00:00:00'), Timestamp('2019-11-12 00:00:00'), Timestamp('2019-11-13 00:00:00'), Timestamp('2019-11-14 00:00:00'), Timestamp('2019-11-15 00:00:00'), Timestamp('2019-11-18 00:00:00'), Timestamp('2019-11-19 00:00:00'), Timestamp('2019-11-20 00:00:00'), Timestamp('2019-11-21 00:00:00'), Timestamp('2019-11-22 00:00:00'), Timestamp('2019-11-25 00:00:00'), Timestamp('2019-11-26 00:00:00'), Timestamp('2019-11-27 00:00:00'), Timestamp('2019-11-28 00:00:00'), Timestamp('2019-11-29 00:00:00'), Timestamp('2019-12-02 00:00:00'), Timestamp('2019-12-03 00:00:00'), Timestamp('2019-12-04 00:00:00'), Timestamp('2019-12-05 00:00:00'), Timestamp('2019-12-06 00:00:00'), Timestamp('2019-12-09 00:00:00'), Timestamp('2019-12-10 00:00:00'), Timestamp('2019-12-11 00:00:00'), Timestamp('2019-12-12 00:00:00'), Timestamp('2019-12-13 00:00:00'), Timestamp('2019-12-16 00:00:00'), Timestamp('2019-12-17 00:00:00'), Timestamp('2019-12-18 00:00:00'), Timestamp('2019-12-19 00:00:00'), Timestamp('2019-12-20 00:00:00'), Timestamp('2019-12-23 00:00:00'), Timestamp('2019-12-24 00:00:00'), Timestamp('2019-12-25 00:00:00'), Timestamp('2019-12-26 00:00:00'), Timestamp('2019-12-27 00:00:00'), Timestamp('2019-12-30 00:00:00'), Timestamp('2019-12-31 00:00:00'), Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00'), Timestamp('2020-01-03 00:00:00'), Timestamp('2020-01-06 00:00:00'), Timestamp('2020-01-07 00:00:00'), Timestamp('2020-01-08 00:00:00'), Timestamp('2020-01-09 00:00:00'), Timestamp('2020-01-10 00:00:00'), Timestamp('2020-01-13 00:00:00'), Timestamp('2020-01-14 00:00:00'), Timestamp('2020-01-15 00:00:00'), Timestamp('2020-01-16 00:00:00'), Timestamp('2020-01-17 00:00:00'), Timestamp('2020-01-20 00:00:00'), Timestamp('2020-01-21 00:00:00'), Timestamp('2020-01-22 00:00:00'), Timestamp('2020-01-23 00:00:00'), Timestamp('2020-01-24 00:00:00'), Timestamp('2020-01-27 00:00:00'), Timestamp('2020-01-28 00:00:00'), Timestamp('2020-01-29 00:00:00'), Timestamp('2020-01-30 00:00:00'), Timestamp('2020-01-31 00:00:00'), Timestamp('2020-02-03 00:00:00'), Timestamp('2020-02-04 00:00:00'), Timestamp('2020-02-05 00:00:00'), Timestamp('2020-02-06 00:00:00'), Timestamp('2020-02-07 00:00:00'), Timestamp('2020-02-10 00:00:00'), Timestamp('2020-02-11 00:00:00'), Timestamp('2020-02-12 00:00:00'), Timestamp('2020-02-13 00:00:00'), Timestamp('2020-02-14 00:00:00'), Timestamp('2020-02-17 00:00:00'), Timestamp('2020-02-18 00:00:00'), Timestamp('2020-02-19 00:00:00'), Timestamp('2020-02-20 00:00:00'), Timestamp('2020-02-21 00:00:00'), Timestamp('2020-02-24 00:00:00'), Timestamp('2020-02-25 00:00:00'), Timestamp('2020-02-26 00:00:00'), Timestamp('2020-02-27 00:00:00'), Timestamp('2020-02-28 00:00:00'), Timestamp('2020-03-02 00:00:00'), Timestamp('2020-03-03 00:00:00'), Timestamp('2020-03-04 00:00:00'), Timestamp('2020-03-05 00:00:00'), Timestamp('2020-03-06 00:00:00'), Timestamp('2020-03-09 00:00:00'), Timestamp('2020-03-10 00:00:00'), Timestamp('2020-03-11 00:00:00'), Timestamp('2020-03-12 00:00:00'), Timestamp('2020-03-13 00:00:00'), Timestamp('2020-03-16 00:00:00'), Timestamp('2020-03-17 00:00:00'), Timestamp('2020-03-18 00:00:00'), Timestamp('2020-03-19 00:00:00'), Timestamp('2020-03-20 00:00:00'), Timestamp('2020-03-23 00:00:00'), Timestamp('2020-03-24 00:00:00'), Timestamp('2020-03-25 00:00:00'), Timestamp('2020-03-26 00:00:00'), Timestamp('2020-03-27 00:00:00'), Timestamp('2020-03-30 00:00:00'), Timestamp('2020-03-31 00:00:00'), Timestamp('2020-04-01 00:00:00'), Timestamp('2020-04-02 00:00:00'), Timestamp('2020-04-03 00:00:00'), Timestamp('2020-04-06 00:00:00'), Timestamp('2020-04-07 00:00:00'), Timestamp('2020-04-08 00:00:00'), Timestamp('2020-04-09 00:00:00'), Timestamp('2020-04-10 00:00:00'), Timestamp('2020-04-13 00:00:00'), Timestamp('2020-04-14 00:00:00'), Timestamp('2020-04-15 00:00:00'), Timestamp('2020-04-16 00:00:00'), Timestamp('2020-04-17 00:00:00'), Timestamp('2020-04-20 00:00:00'), Timestamp('2020-04-21 00:00:00'), Timestamp('2020-04-22 00:00:00'), Timestamp('2020-04-23 00:00:00'), Timestamp('2020-04-24 00:00:00'), Timestamp('2020-04-27 00:00:00'), Timestamp('2020-04-28 00:00:00'), Timestamp('2020-04-29 00:00:00'), Timestamp('2020-04-30 00:00:00'), Timestamp('2020-05-01 00:00:00'), Timestamp('2020-05-04 00:00:00'), Timestamp('2020-05-05 00:00:00'), Timestamp('2020-05-06 00:00:00'), Timestamp('2020-05-07 00:00:00'), Timestamp('2020-05-08 00:00:00'), Timestamp('2020-05-11 00:00:00'), Timestamp('2020-05-12 00:00:00'), Timestamp('2020-05-13 00:00:00'), Timestamp('2020-05-14 00:00:00'), Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-18 00:00:00'), Timestamp('2020-05-19 00:00:00'), Timestamp('2020-05-20 00:00:00'), Timestamp('2020-05-21 00:00:00'), Timestamp('2020-05-22 00:00:00'), Timestamp('2020-05-25 00:00:00'), Timestamp('2020-05-26 00:00:00'), Timestamp('2020-05-27 00:00:00'), Timestamp('2020-05-28 00:00:00'), Timestamp('2020-05-29 00:00:00'), Timestamp('2020-06-01 00:00:00'), Timestamp('2020-06-02 00:00:00'), Timestamp('2020-06-03 00:00:00'), Timestamp('2020-06-04 00:00:00'), Timestamp('2020-06-05 00:00:00'), Timestamp('2020-06-08 00:00:00'), Timestamp('2020-06-09 00:00:00'), Timestamp('2020-06-10 00:00:00'), Timestamp('2020-06-11 00:00:00'), Timestamp('2020-06-12 00:00:00'), Timestamp('2020-06-15 00:00:00'), Timestamp('2020-06-16 00:00:00'), Timestamp('2020-06-17 00:00:00'), Timestamp('2020-06-18 00:00:00'), Timestamp('2020-06-19 00:00:00'), Timestamp('2020-06-22 00:00:00'), Timestamp('2020-06-23 00:00:00'), Timestamp('2020-06-24 00:00:00'), Timestamp('2020-06-25 00:00:00'), Timestamp('2020-06-26 00:00:00'), Timestamp('2020-06-29 00:00:00'), Timestamp('2020-06-30 00:00:00'), Timestamp('2020-07-01 00:00:00'), Timestamp('2020-07-02 00:00:00'), Timestamp('2020-07-03 00:00:00'), Timestamp('2020-07-06 00:00:00'), Timestamp('2020-07-07 00:00:00'), Timestamp('2020-07-08 00:00:00'), Timestamp('2020-07-09 00:00:00'), Timestamp('2020-07-10 00:00:00'), Timestamp('2020-07-13 00:00:00'), Timestamp('2020-07-14 00:00:00'), Timestamp('2020-07-15 00:00:00'), Timestamp('2020-07-16 00:00:00'), Timestamp('2020-07-17 00:00:00'), Timestamp('2020-07-20 00:00:00'), Timestamp('2020-07-21 00:00:00'), Timestamp('2020-07-22 00:00:00'), Timestamp('2020-07-23 00:00:00'), Timestamp('2020-07-24 00:00:00'), Timestamp('2020-07-27 00:00:00'), Timestamp('2020-07-28 00:00:00'), Timestamp('2020-07-29 00:00:00'), Timestamp('2020-07-30 00:00:00'), Timestamp('2020-07-31 00:00:00'), Timestamp('2020-08-03 00:00:00'), Timestamp('2020-08-04 00:00:00'), Timestamp('2020-08-05 00:00:00'), Timestamp('2020-08-06 00:00:00'), Timestamp('2020-08-07 00:00:00'), Timestamp('2020-08-10 00:00:00'), Timestamp('2020-08-11 00:00:00'), Timestamp('2020-08-12 00:00:00'), Timestamp('2020-08-13 00:00:00'), Timestamp('2020-08-14 00:00:00'), Timestamp('2020-08-17 00:00:00'), Timestamp('2020-08-18 00:00:00'), Timestamp('2020-08-19 00:00:00'), Timestamp('2020-08-20 00:00:00'), Timestamp('2020-08-21 00:00:00'), Timestamp('2020-08-24 00:00:00'), Timestamp('2020-08-25 00:00:00'), Timestamp('2020-08-26 00:00:00'), Timestamp('2020-08-27 00:00:00'), Timestamp('2020-08-28 00:00:00'), Timestamp('2020-08-31 00:00:00'), Timestamp('2020-09-01 00:00:00'), Timestamp('2020-09-02 00:00:00'), Timestamp('2020-09-03 00:00:00'), Timestamp('2020-09-04 00:00:00'), Timestamp('2020-09-07 00:00:00'), Timestamp('2020-09-08 00:00:00'), Timestamp('2020-09-09 00:00:00'), Timestamp('2020-09-10 00:00:00'), Timestamp('2020-09-11 00:00:00'), Timestamp('2020-09-14 00:00:00'), Timestamp('2020-09-15 00:00:00'), Timestamp('2020-09-16 00:00:00'), Timestamp('2020-09-17 00:00:00'), Timestamp('2020-09-18 00:00:00'), Timestamp('2020-09-21 00:00:00'), Timestamp('2020-09-22 00:00:00'), Timestamp('2020-09-23 00:00:00'), Timestamp('2020-09-24 00:00:00'), Timestamp('2020-09-25 00:00:00'), Timestamp('2020-09-28 00:00:00'), Timestamp('2020-09-29 00:00:00'), Timestamp('2020-09-30 00:00:00'), Timestamp('2020-10-01 00:00:00'), Timestamp('2020-10-02 00:00:00'), Timestamp('2020-10-05 00:00:00'), Timestamp('2020-10-06 00:00:00'), Timestamp('2020-10-07 00:00:00'), Timestamp('2020-10-08 00:00:00'), Timestamp('2020-10-09 00:00:00'), Timestamp('2020-10-12 00:00:00'), Timestamp('2020-10-13 00:00:00'), Timestamp('2020-10-14 00:00:00'), Timestamp('2020-10-15 00:00:00'), Timestamp('2020-10-16 00:00:00'), Timestamp('2020-10-19 00:00:00'), Timestamp('2020-10-20 00:00:00'), Timestamp('2020-10-21 00:00:00'), Timestamp('2020-10-22 00:00:00'), Timestamp('2020-10-23 00:00:00'), Timestamp('2020-10-26 00:00:00'), Timestamp('2020-10-27 00:00:00'), Timestamp('2020-10-28 00:00:00'), Timestamp('2020-10-29 00:00:00'), Timestamp('2020-10-30 00:00:00'), Timestamp('2020-11-02 00:00:00'), Timestamp('2020-11-03 00:00:00'), Timestamp('2020-11-04 00:00:00'), Timestamp('2020-11-05 00:00:00'), Timestamp('2020-11-06 00:00:00'), Timestamp('2020-11-09 00:00:00'), Timestamp('2020-11-10 00:00:00'), Timestamp('2020-11-11 00:00:00'), Timestamp('2020-11-12 00:00:00'), Timestamp('2020-11-13 00:00:00'), Timestamp('2020-11-16 00:00:00'), Timestamp('2020-11-17 00:00:00'), Timestamp('2020-11-18 00:00:00'), Timestamp('2020-11-19 00:00:00'), Timestamp('2020-11-20 00:00:00'), Timestamp('2020-11-23 00:00:00'), Timestamp('2020-11-24 00:00:00'), Timestamp('2020-11-25 00:00:00'), Timestamp('2020-11-26 00:00:00'), Timestamp('2020-11-27 00:00:00'), Timestamp('2020-11-30 00:00:00'), Timestamp('2020-12-01 00:00:00'), Timestamp('2020-12-02 00:00:00'), Timestamp('2020-12-03 00:00:00'), Timestamp('2020-12-04 00:00:00'), Timestamp('2020-12-07 00:00:00'), Timestamp('2020-12-08 00:00:00'), Timestamp('2020-12-09 00:00:00'), Timestamp('2020-12-10 00:00:00'), Timestamp('2020-12-11 00:00:00'), Timestamp('2020-12-14 00:00:00'), Timestamp('2020-12-15 00:00:00'), Timestamp('2020-12-16 00:00:00'), Timestamp('2020-12-17 00:00:00'), Timestamp('2020-12-18 00:00:00'), Timestamp('2020-12-21 00:00:00'), Timestamp('2020-12-22 00:00:00'), Timestamp('2020-12-23 00:00:00'), Timestamp('2020-12-24 00:00:00'), Timestamp('2020-12-25 00:00:00'), Timestamp('2020-12-28 00:00:00'), Timestamp('2020-12-29 00:00:00'), Timestamp('2020-12-30 00:00:00'), Timestamp('2020-12-31 00:00:00'), Timestamp('2021-01-01 00:00:00'), Timestamp('2021-01-04 00:00:00'), Timestamp('2021-01-05 00:00:00'), Timestamp('2021-01-06 00:00:00'), Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00'), Timestamp('2021-01-11 00:00:00'), Timestamp('2021-01-12 00:00:00'), Timestamp('2021-01-13 00:00:00'), Timestamp('2021-01-14 00:00:00'), Timestamp('2021-01-15 00:00:00'), Timestamp('2021-01-18 00:00:00'), Timestamp('2021-01-19 00:00:00'), Timestamp('2021-01-20 00:00:00'), Timestamp('2021-01-21 00:00:00'), Timestamp('2021-01-22 00:00:00'), Timestamp('2021-01-25 00:00:00'), Timestamp('2021-01-26 00:00:00'), Timestamp('2021-01-27 00:00:00'), Timestamp('2021-01-28 00:00:00'), Timestamp('2021-01-29 00:00:00'), Timestamp('2021-02-01 00:00:00'), Timestamp('2021-02-02 00:00:00'), Timestamp('2021-02-03 00:00:00'), Timestamp('2021-02-04 00:00:00'), Timestamp('2021-02-05 00:00:00'), Timestamp('2021-02-08 00:00:00'), Timestamp('2021-02-09 00:00:00'), Timestamp('2021-02-10 00:00:00'), Timestamp('2021-02-11 00:00:00'), Timestamp('2021-02-12 00:00:00'), Timestamp('2021-02-15 00:00:00'), Timestamp('2021-02-16 00:00:00'), Timestamp('2021-02-17 00:00:00'), Timestamp('2021-02-18 00:00:00'), Timestamp('2021-02-19 00:00:00'), Timestamp('2021-02-22 00:00:00'), Timestamp('2021-02-23 00:00:00'), Timestamp('2021-02-24 00:00:00'), Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-03-02 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-05 00:00:00'), Timestamp('2021-03-08 00:00:00'), Timestamp('2021-03-09 00:00:00'), Timestamp('2021-03-10 00:00:00'), Timestamp('2021-03-11 00:00:00'), Timestamp('2021-03-12 00:00:00'), Timestamp('2021-03-15 00:00:00'), Timestamp('2021-03-16 00:00:00'), Timestamp('2021-03-17 00:00:00'), Timestamp('2021-03-18 00:00:00'), Timestamp('2021-03-19 00:00:00'), Timestamp('2021-03-22 00:00:00'), Timestamp('2021-03-23 00:00:00'), Timestamp('2021-03-24 00:00:00'), Timestamp('2021-03-25 00:00:00'), Timestamp('2021-03-26 00:00:00'), Timestamp('2021-03-29 00:00:00'), Timestamp('2021-03-30 00:00:00'), Timestamp('2021-03-31 00:00:00'), Timestamp('2021-04-01 00:00:00'), Timestamp('2021-04-02 00:00:00'), Timestamp('2021-04-05 00:00:00'), Timestamp('2021-04-06 00:00:00'), Timestamp('2021-04-07 00:00:00'), Timestamp('2021-04-08 00:00:00'), Timestamp('2021-04-09 00:00:00'), Timestamp('2021-04-12 00:00:00'), Timestamp('2021-04-13 00:00:00'), Timestamp('2021-04-14 00:00:00'), Timestamp('2021-04-15 00:00:00'), Timestamp('2021-04-16 00:00:00'), Timestamp('2021-04-19 00:00:00'), Timestamp('2021-04-20 00:00:00'), Timestamp('2021-04-21 00:00:00'), Timestamp('2021-04-22 00:00:00'), Timestamp('2021-04-23 00:00:00'), Timestamp('2021-04-26 00:00:00'), Timestamp('2021-04-27 00:00:00'), Timestamp('2021-04-28 00:00:00'), Timestamp('2021-04-29 00:00:00'), Timestamp('2021-04-30 00:00:00'), Timestamp('2021-05-03 00:00:00'), Timestamp('2021-05-04 00:00:00'), Timestamp('2021-05-05 00:00:00'), Timestamp('2021-05-06 00:00:00'), Timestamp('2021-05-07 00:00:00'), Timestamp('2021-05-10 00:00:00'), Timestamp('2021-05-11 00:00:00'), Timestamp('2021-05-12 00:00:00'), Timestamp('2021-05-13 00:00:00'), Timestamp('2021-05-14 00:00:00'), Timestamp('2021-05-17 00:00:00'), Timestamp('2021-05-18 00:00:00'), Timestamp('2021-05-19 00:00:00'), Timestamp('2021-05-20 00:00:00'), Timestamp('2021-05-21 00:00:00'), Timestamp('2021-05-24 00:00:00'), Timestamp('2021-05-25 00:00:00'), Timestamp('2021-05-26 00:00:00'), Timestamp('2021-05-27 00:00:00'), Timestamp('2021-05-28 00:00:00'), Timestamp('2021-05-31 00:00:00'), Timestamp('2021-06-01 00:00:00'), Timestamp('2021-06-02 00:00:00'), Timestamp('2021-06-03 00:00:00'), Timestamp('2021-06-04 00:00:00'), Timestamp('2021-06-07 00:00:00'), Timestamp('2021-06-08 00:00:00'), Timestamp('2021-06-09 00:00:00'), Timestamp('2021-06-10 00:00:00'), Timestamp('2021-06-11 00:00:00'), Timestamp('2021-06-14 00:00:00'), Timestamp('2021-06-15 00:00:00'), Timestamp('2021-06-16 00:00:00'), Timestamp('2021-06-17 00:00:00'), Timestamp('2021-06-18 00:00:00'), Timestamp('2021-06-21 00:00:00'), Timestamp('2021-06-22 00:00:00'), Timestamp('2021-06-23 00:00:00'), Timestamp('2021-06-24 00:00:00'), Timestamp('2021-06-25 00:00:00'), Timestamp('2021-06-28 00:00:00'), Timestamp('2021-06-29 00:00:00'), Timestamp('2021-06-30 00:00:00'), Timestamp('2021-07-01 00:00:00'), Timestamp('2021-07-02 00:00:00'), Timestamp('2021-07-05 00:00:00'), Timestamp('2021-07-06 00:00:00'), Timestamp('2021-07-07 00:00:00'), Timestamp('2021-07-08 00:00:00'), Timestamp('2021-07-09 00:00:00'), Timestamp('2021-07-12 00:00:00'), Timestamp('2021-07-13 00:00:00'), Timestamp('2021-07-14 00:00:00'), Timestamp('2021-07-15 00:00:00'), Timestamp('2021-07-16 00:00:00'), Timestamp('2021-07-19 00:00:00'), Timestamp('2021-07-20 00:00:00'), Timestamp('2021-07-21 00:00:00'), Timestamp('2021-07-22 00:00:00'), Timestamp('2021-07-23 00:00:00'), Timestamp('2021-07-26 00:00:00'), Timestamp('2021-07-27 00:00:00'), Timestamp('2021-07-28 00:00:00'), Timestamp('2021-07-29 00:00:00'), Timestamp('2021-07-30 00:00:00'), Timestamp('2021-08-02 00:00:00'), Timestamp('2021-08-03 00:00:00'), Timestamp('2021-08-04 00:00:00'), Timestamp('2021-08-05 00:00:00'), Timestamp('2021-08-06 00:00:00'), Timestamp('2021-08-09 00:00:00'), Timestamp('2021-08-10 00:00:00'), Timestamp('2021-08-11 00:00:00'), Timestamp('2021-08-12 00:00:00'), Timestamp('2021-08-13 00:00:00'), Timestamp('2021-08-16 00:00:00'), Timestamp('2021-08-17 00:00:00'), Timestamp('2021-08-18 00:00:00'), Timestamp('2021-08-19 00:00:00'), Timestamp('2021-08-20 00:00:00'), Timestamp('2021-08-23 00:00:00'), Timestamp('2021-08-24 00:00:00'), Timestamp('2021-08-25 00:00:00'), Timestamp('2021-08-26 00:00:00'), Timestamp('2021-08-27 00:00:00'), Timestamp('2021-08-30 00:00:00'), Timestamp('2021-08-31 00:00:00'), Timestamp('2021-09-01 00:00:00'), Timestamp('2021-09-02 00:00:00'), Timestamp('2021-09-03 00:00:00'), Timestamp('2021-09-06 00:00:00'), Timestamp('2021-09-07 00:00:00'), Timestamp('2021-09-08 00:00:00'), Timestamp('2021-09-09 00:00:00'), Timestamp('2021-09-10 00:00:00'), Timestamp('2021-09-13 00:00:00'), Timestamp('2021-09-14 00:00:00'), Timestamp('2021-09-15 00:00:00'), Timestamp('2021-09-16 00:00:00'), Timestamp('2021-09-17 00:00:00'), Timestamp('2021-09-20 00:00:00'), Timestamp('2021-09-21 00:00:00'), Timestamp('2021-09-22 00:00:00'), Timestamp('2021-09-23 00:00:00'), Timestamp('2021-09-24 00:00:00'), Timestamp('2021-09-27 00:00:00'), Timestamp('2021-09-28 00:00:00'), Timestamp('2021-09-29 00:00:00'), Timestamp('2021-09-30 00:00:00'), Timestamp('2021-10-01 00:00:00'), Timestamp('2021-10-04 00:00:00'), Timestamp('2021-10-05 00:00:00'), Timestamp('2021-10-06 00:00:00'), Timestamp('2021-10-07 00:00:00'), Timestamp('2021-10-08 00:00:00'), Timestamp('2021-10-11 00:00:00'), Timestamp('2021-10-12 00:00:00'), Timestamp('2021-10-13 00:00:00'), Timestamp('2021-10-14 00:00:00'), Timestamp('2021-10-15 00:00:00'), Timestamp('2021-10-18 00:00:00'), Timestamp('2021-10-19 00:00:00'), Timestamp('2021-10-20 00:00:00'), Timestamp('2021-10-21 00:00:00'), Timestamp('2021-10-22 00:00:00'), Timestamp('2021-10-25 00:00:00'), Timestamp('2021-10-26 00:00:00'), Timestamp('2021-10-27 00:00:00'), Timestamp('2021-10-28 00:00:00'), Timestamp('2021-10-29 00:00:00'), Timestamp('2021-11-01 00:00:00'), Timestamp('2021-11-02 00:00:00'), Timestamp('2021-11-03 00:00:00'), Timestamp('2021-11-04 00:00:00'), Timestamp('2021-11-05 00:00:00'), Timestamp('2021-11-08 00:00:00'), Timestamp('2021-11-09 00:00:00'), Timestamp('2021-11-10 00:00:00'), Timestamp('2021-11-11 00:00:00'), Timestamp('2021-11-12 00:00:00'), Timestamp('2021-11-15 00:00:00'), Timestamp('2021-11-16 00:00:00'), Timestamp('2021-11-17 00:00:00'), Timestamp('2021-11-18 00:00:00'), Timestamp('2021-11-19 00:00:00'), Timestamp('2021-11-22 00:00:00'), Timestamp('2021-11-23 00:00:00'), Timestamp('2021-11-24 00:00:00'), Timestamp('2021-11-25 00:00:00'), Timestamp('2021-11-26 00:00:00'), Timestamp('2021-11-29 00:00:00'), Timestamp('2021-11-30 00:00:00'), Timestamp('2021-12-01 00:00:00'), Timestamp('2021-12-02 00:00:00'), Timestamp('2021-12-03 00:00:00'), Timestamp('2021-12-06 00:00:00'), Timestamp('2021-12-07 00:00:00'), Timestamp('2021-12-08 00:00:00'), Timestamp('2021-12-09 00:00:00'), Timestamp('2021-12-10 00:00:00'), Timestamp('2021-12-13 00:00:00'), Timestamp('2021-12-14 00:00:00'), Timestamp('2021-12-15 00:00:00'), Timestamp('2021-12-16 00:00:00'), Timestamp('2021-12-17 00:00:00'), Timestamp('2021-12-20 00:00:00'), Timestamp('2021-12-21 00:00:00'), Timestamp('2021-12-22 00:00:00'), Timestamp('2021-12-23 00:00:00'), Timestamp('2021-12-24 00:00:00'), Timestamp('2021-12-27 00:00:00'), Timestamp('2021-12-28 00:00:00'), Timestamp('2021-12-29 00:00:00'), Timestamp('2021-12-30 00:00:00'), Timestamp('2021-12-31 00:00:00'), Timestamp('2022-01-03 00:00:00'), Timestamp('2022-01-04 00:00:00'), Timestamp('2022-01-05 00:00:00'), Timestamp('2022-01-06 00:00:00'), Timestamp('2022-01-07 00:00:00'), Timestamp('2022-01-10 00:00:00'), Timestamp('2022-01-11 00:00:00'), Timestamp('2022-01-12 00:00:00'), Timestamp('2022-01-13 00:00:00'), Timestamp('2022-01-14 00:00:00'), Timestamp('2022-01-17 00:00:00'), Timestamp('2022-01-18 00:00:00'), Timestamp('2022-01-19 00:00:00'), Timestamp('2022-01-20 00:00:00'), Timestamp('2022-01-21 00:00:00'), Timestamp('2022-01-24 00:00:00'), Timestamp('2022-01-25 00:00:00'), Timestamp('2022-01-26 00:00:00'), Timestamp('2022-01-27 00:00:00'), Timestamp('2022-01-28 00:00:00'), Timestamp('2022-01-31 00:00:00'), Timestamp('2022-02-01 00:00:00'), Timestamp('2022-02-02 00:00:00'), Timestamp('2022-02-03 00:00:00'), Timestamp('2022-02-04 00:00:00'), Timestamp('2022-02-07 00:00:00'), Timestamp('2022-02-08 00:00:00'), Timestamp('2022-02-09 00:00:00'), Timestamp('2022-02-10 00:00:00'), Timestamp('2022-02-11 00:00:00'), Timestamp('2022-02-14 00:00:00'), Timestamp('2022-02-15 00:00:00'), Timestamp('2022-02-16 00:00:00'), Timestamp('2022-02-17 00:00:00'), Timestamp('2022-02-18 00:00:00'), Timestamp('2022-02-21 00:00:00'), Timestamp('2022-02-22 00:00:00'), Timestamp('2022-02-23 00:00:00'), Timestamp('2022-02-24 00:00:00'), Timestamp('2022-02-25 00:00:00'), Timestamp('2022-02-28 00:00:00'), Timestamp('2022-03-01 00:00:00'), Timestamp('2022-03-02 00:00:00'), Timestamp('2022-03-03 00:00:00'), Timestamp('2022-03-04 00:00:00'), Timestamp('2022-03-07 00:00:00'), Timestamp('2022-03-08 00:00:00'), Timestamp('2022-03-09 00:00:00'), Timestamp('2022-03-10 00:00:00'), Timestamp('2022-03-11 00:00:00'), Timestamp('2022-03-14 00:00:00'), Timestamp('2022-03-15 00:00:00'), Timestamp('2022-03-16 00:00:00'), Timestamp('2022-03-17 00:00:00'), Timestamp('2022-03-18 00:00:00'), Timestamp('2022-03-21 00:00:00'), Timestamp('2022-03-22 00:00:00'), Timestamp('2022-03-23 00:00:00'), Timestamp('2022-03-24 00:00:00'), Timestamp('2022-03-25 00:00:00'), Timestamp('2022-03-28 00:00:00'), Timestamp('2022-03-29 00:00:00'), Timestamp('2022-03-30 00:00:00'), Timestamp('2022-03-31 00:00:00'), Timestamp('2022-04-01 00:00:00'), Timestamp('2022-04-04 00:00:00'), Timestamp('2022-04-05 00:00:00'), Timestamp('2022-04-06 00:00:00'), Timestamp('2022-04-07 00:00:00'), Timestamp('2022-04-08 00:00:00'), Timestamp('2022-04-11 00:00:00'), Timestamp('2022-04-12 00:00:00'), Timestamp('2022-04-13 00:00:00'), Timestamp('2022-04-14 00:00:00'), Timestamp('2022-04-15 00:00:00'), Timestamp('2022-04-18 00:00:00'), Timestamp('2022-04-19 00:00:00'), Timestamp('2022-04-20 00:00:00'), Timestamp('2022-04-21 00:00:00'), Timestamp('2022-04-22 00:00:00'), Timestamp('2022-04-25 00:00:00'), Timestamp('2022-04-26 00:00:00'), Timestamp('2022-04-27 00:00:00'), Timestamp('2022-04-28 00:00:00'), Timestamp('2022-04-29 00:00:00'), Timestamp('2022-05-02 00:00:00'), Timestamp('2022-05-03 00:00:00'), Timestamp('2022-05-04 00:00:00'), Timestamp('2022-05-05 00:00:00'), Timestamp('2022-05-06 00:00:00'), Timestamp('2022-05-09 00:00:00'), Timestamp('2022-05-10 00:00:00'), Timestamp('2022-05-11 00:00:00'), Timestamp('2022-05-12 00:00:00'), Timestamp('2022-05-13 00:00:00'), Timestamp('2022-05-16 00:00:00'), Timestamp('2022-05-17 00:00:00'), Timestamp('2022-05-18 00:00:00'), Timestamp('2022-05-19 00:00:00'), Timestamp('2022-05-20 00:00:00'), Timestamp('2022-05-23 00:00:00'), Timestamp('2022-05-24 00:00:00'), Timestamp('2022-05-25 00:00:00'), Timestamp('2022-05-26 00:00:00'), Timestamp('2022-05-27 00:00:00'), Timestamp('2022-05-30 00:00:00'), Timestamp('2022-05-31 00:00:00'), Timestamp('2022-06-01 00:00:00'), Timestamp('2022-06-02 00:00:00'), Timestamp('2022-06-03 00:00:00'), Timestamp('2022-06-06 00:00:00'), Timestamp('2022-06-07 00:00:00'), Timestamp('2022-06-08 00:00:00'), Timestamp('2022-06-09 00:00:00'), Timestamp('2022-06-10 00:00:00'), Timestamp('2022-06-13 00:00:00'), Timestamp('2022-06-14 00:00:00'), Timestamp('2022-06-15 00:00:00'), Timestamp('2022-06-16 00:00:00'), Timestamp('2022-06-17 00:00:00'), Timestamp('2022-06-20 00:00:00'), Timestamp('2022-06-21 00:00:00'), Timestamp('2022-06-22 00:00:00'), Timestamp('2022-06-23 00:00:00'), Timestamp('2022-06-24 00:00:00'), Timestamp('2022-06-27 00:00:00'), Timestamp('2022-06-28 00:00:00'), Timestamp('2022-06-29 00:00:00'), Timestamp('2022-06-30 00:00:00'), Timestamp('2022-07-01 00:00:00'), Timestamp('2022-07-04 00:00:00'), Timestamp('2022-07-05 00:00:00'), Timestamp('2022-07-06 00:00:00'), Timestamp('2022-07-07 00:00:00'), Timestamp('2022-07-08 00:00:00'), Timestamp('2022-07-11 00:00:00'), Timestamp('2022-07-12 00:00:00'), Timestamp('2022-07-13 00:00:00'), Timestamp('2022-07-14 00:00:00'), Timestamp('2022-07-15 00:00:00'), Timestamp('2022-07-18 00:00:00'), Timestamp('2022-07-19 00:00:00'), Timestamp('2022-07-20 00:00:00'), Timestamp('2022-07-21 00:00:00'), Timestamp('2022-07-22 00:00:00'), Timestamp('2022-07-25 00:00:00'), Timestamp('2022-07-26 00:00:00'), Timestamp('2022-07-27 00:00:00'), Timestamp('2022-07-28 00:00:00'), Timestamp('2022-07-29 00:00:00'), Timestamp('2022-08-01 00:00:00'), Timestamp('2022-08-02 00:00:00'), Timestamp('2022-08-03 00:00:00'), Timestamp('2022-08-04 00:00:00'), Timestamp('2022-08-05 00:00:00'), Timestamp('2022-08-08 00:00:00'), Timestamp('2022-08-09 00:00:00'), Timestamp('2022-08-10 00:00:00'), Timestamp('2022-08-11 00:00:00'), Timestamp('2022-08-12 00:00:00'), Timestamp('2022-08-15 00:00:00'), Timestamp('2022-08-16 00:00:00'), Timestamp('2022-08-17 00:00:00'), Timestamp('2022-08-18 00:00:00'), Timestamp('2022-08-19 00:00:00'), Timestamp('2022-08-22 00:00:00'), Timestamp('2022-08-23 00:00:00'), Timestamp('2022-08-24 00:00:00'), Timestamp('2022-08-25 00:00:00'), Timestamp('2022-08-26 00:00:00'), Timestamp('2022-08-29 00:00:00'), Timestamp('2022-08-30 00:00:00'), Timestamp('2022-08-31 00:00:00'), Timestamp('2022-09-01 00:00:00'), Timestamp('2022-09-02 00:00:00'), Timestamp('2022-09-05 00:00:00'), Timestamp('2022-09-06 00:00:00'), Timestamp('2022-09-07 00:00:00'), Timestamp('2022-09-08 00:00:00'), Timestamp('2022-09-09 00:00:00'), Timestamp('2022-09-12 00:00:00'), Timestamp('2022-09-13 00:00:00'), Timestamp('2022-09-14 00:00:00'), Timestamp('2022-09-15 00:00:00'), Timestamp('2022-09-16 00:00:00'), Timestamp('2022-09-19 00:00:00'), Timestamp('2022-09-20 00:00:00'), Timestamp('2022-09-21 00:00:00'), Timestamp('2022-09-22 00:00:00'), Timestamp('2022-09-23 00:00:00'), Timestamp('2022-09-26 00:00:00'), Timestamp('2022-09-27 00:00:00'), Timestamp('2022-09-28 00:00:00'), Timestamp('2022-09-29 00:00:00'), Timestamp('2022-09-30 00:00:00'), Timestamp('2022-10-03 00:00:00'), Timestamp('2022-10-04 00:00:00'), Timestamp('2022-10-05 00:00:00'), Timestamp('2022-10-06 00:00:00'), Timestamp('2022-10-07 00:00:00'), Timestamp('2022-10-10 00:00:00'), Timestamp('2022-10-11 00:00:00'), Timestamp('2022-10-12 00:00:00'), Timestamp('2022-10-13 00:00:00'), Timestamp('2022-10-14 00:00:00'), Timestamp('2022-10-17 00:00:00'), Timestamp('2022-10-18 00:00:00'), Timestamp('2022-10-19 00:00:00'), Timestamp('2022-10-20 00:00:00'), Timestamp('2022-10-21 00:00:00'), Timestamp('2022-10-24 00:00:00'), Timestamp('2022-10-25 00:00:00'), Timestamp('2022-10-26 00:00:00'), Timestamp('2022-10-27 00:00:00'), Timestamp('2022-10-28 00:00:00'), Timestamp('2022-10-31 00:00:00'), Timestamp('2022-11-01 00:00:00'), Timestamp('2022-11-02 00:00:00'), Timestamp('2022-11-03 00:00:00'), Timestamp('2022-11-04 00:00:00'), Timestamp('2022-11-07 00:00:00'), Timestamp('2022-11-08 00:00:00'), Timestamp('2022-11-09 00:00:00'), Timestamp('2022-11-10 00:00:00'), Timestamp('2022-11-11 00:00:00'), Timestamp('2022-11-14 00:00:00'), Timestamp('2022-11-15 00:00:00'), Timestamp('2022-11-16 00:00:00'), Timestamp('2022-11-17 00:00:00'), Timestamp('2022-11-18 00:00:00'), Timestamp('2022-11-21 00:00:00'), Timestamp('2022-11-22 00:00:00'), Timestamp('2022-11-23 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2022-11-25 00:00:00'), Timestamp('2022-11-28 00:00:00'), Timestamp('2022-11-29 00:00:00'), Timestamp('2022-11-30 00:00:00'), Timestamp('2022-12-01 00:00:00'), Timestamp('2022-12-02 00:00:00'), Timestamp('2022-12-05 00:00:00'), Timestamp('2022-12-06 00:00:00'), Timestamp('2022-12-07 00:00:00'), Timestamp('2022-12-08 00:00:00'), Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-12 00:00:00'), Timestamp('2022-12-13 00:00:00'), Timestamp('2022-12-14 00:00:00'), Timestamp('2022-12-15 00:00:00'), Timestamp('2022-12-16 00:00:00'), Timestamp('2022-12-19 00:00:00'), Timestamp('2022-12-20 00:00:00'), Timestamp('2022-12-21 00:00:00'), Timestamp('2022-12-22 00:00:00'), Timestamp('2022-12-23 00:00:00'), Timestamp('2022-12-26 00:00:00'), Timestamp('2022-12-27 00:00:00'), Timestamp('2022-12-28 00:00:00'), Timestamp('2022-12-29 00:00:00'), Timestamp('2022-12-30 00:00:00')] not in index"

In [94]:
display(y)
display(X)


date
2018-01-02    347.24
2018-01-03    355.10
2018-01-04    356.23
2018-01-05    362.22
2018-01-08    365.45
               ...  
2019-08-05    902.45
2019-08-06    895.73
2019-08-07    897.43
2019-08-08    895.06
2019-08-09    871.58
Name: EMBI, Length: 400, dtype: float64

Unnamed: 0_level_0,const,year,month,day,WC,Funct,TotPron,PronPer,Yo,Nosotro,...,Logro,Placer,Hogar,Dinero,Relig,Muerte,Asentir,NoFluen,Relleno,Obs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,1.0,2018.0,1.0,1.0,75.200000,32.133333,10.933333,5.800000,0.866667,0.133333,...,0.466667,0.333333,0.000000,0.533333,0.066667,0.000000,0.200000,0.000000,0.0,15
2018-01-02,1.0,2018.0,1.0,2.0,44.258621,20.517241,6.758621,3.534483,0.551724,0.068966,...,0.448276,0.155172,0.000000,0.137931,0.086207,0.068966,0.189655,0.000000,0.0,58
2018-01-03,1.0,2018.0,1.0,3.0,48.416667,21.236111,6.263889,3.569444,0.402778,0.263889,...,0.652778,0.625000,0.097222,0.250000,0.041667,0.069444,0.083333,0.000000,0.0,72
2018-01-04,1.0,2018.0,1.0,4.0,51.303571,21.607143,5.732143,3.285714,0.375000,0.071429,...,0.410714,0.500000,0.035714,0.232143,0.089286,0.017857,0.125000,0.000000,0.0,56
2018-01-05,1.0,2018.0,1.0,5.0,84.810811,35.770270,10.337838,5.148649,1.135135,0.081081,...,1.013514,0.243243,0.054054,0.324324,0.040541,0.067568,0.189189,0.000000,0.0,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,1.0,2022.0,12.0,26.0,61.473684,27.842105,8.368421,4.947368,0.421053,0.105263,...,0.894737,0.315789,0.000000,0.526316,0.105263,0.052632,0.105263,0.105263,0.0,19
2022-12-27,1.0,2022.0,12.0,27.0,67.119048,29.261905,8.666667,5.238095,0.833333,0.142857,...,0.738095,0.595238,0.428571,0.642857,0.023810,0.119048,0.285714,0.023810,0.0,42
2022-12-28,1.0,2022.0,12.0,28.0,120.071429,27.428571,9.428571,5.071429,0.571429,0.000000,...,0.785714,0.857143,0.071429,1.142857,0.000000,0.214286,0.142857,0.000000,0.0,14
2022-12-29,1.0,2022.0,12.0,29.0,14.454545,5.272727,1.954545,1.000000,0.090909,0.045455,...,0.181818,0.045455,0.000000,0.136364,0.000000,0.000000,0.045455,0.000000,0.0,22


In [90]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
print(model.summary())
