## Helper Functions

In [159]:
import subprocess
def c_import(library, elements=None, name=None, always_reimport= True, always_reinstall = True):
  if elements:
    import_str = f'from {library} import {", ".join(elements)}'
    tested_install_var = ", ".join(elements)
    any_not_installed = True if any(e not in globals() for e in elements) else False
  else:
    import_str = f'import {library}'
    tested_install_var = library
    any_not_installed = True if library not in globals() else False
  if name:
    import_str = f'{import_str} as {name}'
    tested_install_var = name
    any_not_installed = True if name not in globals() else False

  def sub_install():
    subprocess.run(f'pip install {library}', shell=True, check=True)
    print(f'Library {library} installed successfully.')

  def sub_import():
    exec(import_str, globals())
    print(f'Library {library} imported successfully. As: \n {import_str}')

  if always_reinstall:
    try:
      sub_install()
      sub_import()
    except subprocess.CalledProcessError:
      print(f'Failed to install {library}.')
    except ImportError as err:
      print(f'After Install. Import error: {err}')

  else:
    if always_reimport == True or any_not_installed == True:
        try:
          sub_import()
        except ImportError as err:
          print(f'Import error: {err}')
          if library in str(err):
            try:
                # Use subprocess to run the pip install command
                sub_install()
                sub_import()
            except subprocess.CalledProcessError:
                print(f'Failed to install {library}.')
    else:
      print(f'"{tested_install_var}" already installed and imported')

In [76]:
def zst_to_json(zst_file,output_extension='.json',remove_file=True):
  zst_name = zst_file.split('.zst')[0]
  output_file = zst_name+output_extension
  # Run the command
  subprocess.run(['zstd','-d','-o',output_file,zst_file], check=True)
  # Remove the original .zst file
  if remove_file:
    subprocess.run(['rm', zst_file])
  return print('Done!')

## Import Libraries

In [161]:
import_config = {'always_reimport': False, 'always_reinstall': False}
c_import('pandas',name='pd',**import_config)
c_import('csv',**import_config)
c_import('json',**import_config)
c_import('os',**import_config)
c_import('subprocess',**import_config)
c_import('tqdm.notebook',['tqdm'],**import_config)
c_import('concurrent.futures',['ProcessPoolExecutor'],**import_config)
c_import('datetime',**import_config)
c_import('argparse', **import_config)
c_import('codecs', **import_config)
c_import('os', **import_config)
c_import('sys', **import_config)
c_import('numpy',name='np', **import_config)
h

"pd" already installed and imported
"csv" already installed and imported
"json" already installed and imported
"os" already installed and imported
"subprocess" already installed and imported
"tqdm" already installed and imported
"ProcessPoolExecutor" already installed and imported
"datetime" already installed and imported
"argparse" already installed and imported
"codecs" already installed and imported
"os" already installed and imported
"sys" already installed and imported
"np" already installed and imported


## Directories

In [162]:
input_dir = "../input/reddit_input/"
output_dir = "../output/reddit_output/"
# split_files_dir = input_dir+'split_data/' #split JSONs output
liwcdic_file_dir = '../input/Spanish_LIWC2007_Dictionary.dic'
# Specify the folder path where you want to search for .zst files
zst_files_dir = '/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/input/reddit_input/'

## Model Functions

### Function: JSON Lines into Chunks
We do this to be able to handle the large files

In [79]:
def json_file_splitter(filename,lines_per_chunk = 1000):
    #large JSON file
    large_json_dir = os.path.join(input_dir, filename)# Save Large JSON path

    large_json_name = os.path.splitext(filename)[0]
    split_folder_name = f'{large_json_name}-split_data'
    split_files_dir = os.path.join(input_dir, split_folder_name) 

    # Create if output folder doesn't exist
    if not os.path.exists(split_files_dir):
        os.makedirs(split_files_dir)

    # Get the total number of lines in the input file to set up the progress bar
    total_lines = sum(1 for line in open(large_json_dir))
    print(total_lines)
    with open(large_json_dir, "r") as infile:
        data = []
        chunk_count = 1

        # Create a tqdm progress bar
        pbar = tqdm(total=total_lines, desc="Splitting JSON")

        for line in infile:
            data.append(json.loads(line))

            if len(data) == lines_per_chunk:
                with open(os.path.join(split_files_dir, f"chunk_{chunk_count}.json"), "w") as outfile:
                    for item in data:
                        json.dump(item, outfile)
                        outfile.write("\n")
                data = []
                chunk_count += 1

            pbar.update(1)  # Update the progress bar

        # Write the last chunk
        with open(os.path.join(split_files_dir, f"chunk_{chunk_count}.json"), "w") as outfile:
            for item in data:
                json.dump(item, outfile)
                outfile.write("\n")

        # Close the progress bar
        pbar.close()

### Function: Process JSON Chunks into .CSV 
| id | year | month | day | title | text | permalink | 
| --- | --- | --- | --- | --- | --- | --- |

in multithread

In [80]:
# @title JSON Key Values
json_keys = {
  'time': 'created_utc',
  'id': 'id',
  'link_path': 'permalink',
  'title':'title',
  'texts': {'text':'body'}
}

In [81]:
'''
def json_chunks_processing(large_filename, json_keys):
    file = os.path.splitext(large_filename)[0] #without the .json
    # Directory where the JSON files are located
    json_files_dir = f'../input/reddit_input/{file}-split_data/'

    # Directory where the CSV file is located
    csv_table_dir = output_dir + f'{file}-output_table.csv'

    # Define the column names for the CSV
    columns = ['id', 'year', 'month', 'day', 'title', 'text', 'permalink']
    counter=0
    # Open the CSV file for writing in APPEND mode
    with open(csv_table_dir, 'a') as csv_file:
        # Check if the CSV file is empty
        csv_empty = os.path.getsize(csv_table_dir) == 0

        if csv_empty:
            # If the file is empty, write the headers
            pd.DataFrame(columns=columns).to_csv(csv_file, index=False, header=True, sep=',')

        for filename in os.listdir(json_files_dir): # Loop through all files in the specified folder
            if filename.endswith('.json'):
                # Load a JSON file
                with open(os.path.join(json_files_dir, filename), 'r') as json_file:
                    # Initialize an empty list to hold the data for this file
                    data_chunk = []

                    for line in json_file:
                        # Parse each line as JSON
                        json_data = json.loads(line)
                        # Extract the relevant data
                        id = json_data.get(json_keys['id'],'')
                        time = json_data.get(json_keys['time'],'')
                        title = json_data.get(json_keys['title'],'')
                        text = json_data.get(json_keys['text'],'')
                        link_path = json_data.get(json_keys['link_path'],'')
                        
                        created_datetime = datetime.datetime.fromtimestamp(int(time))
                        year, month, day = created_datetime.strftime('%Y-%m-%d').split('-')
                        
                        # Check if the date falls within the desired range
                        current_date = datetime.date(int(year),int(month),int(day))
                        if datetime.date(2018,1,1) <= current_date <= datetime.date(2022,12,31):
                            row = [
                                id,
                                year,
                                month,
                                day,
                                title,
                                text,
                                link_path] 
                            data_chunk.append(row)
                            counter += 1
                    # Create a DataFrame for this file
                    chunk_df = pd.DataFrame(data_chunk, columns=columns)

                    #DROP TITLE if its a comment
                    # Check if the "title" column is empty (other than the header)
                    title_column = chunk_df['title']
                    if title_column.iloc[1:].str.strip().dropna().empty:
                        chunk_df = chunk_df.drop('title', axis=1)

                    # Append this file's data to the CSV file
                    chunk_df.to_csv(csv_file, header=False, index=False, sep=',', quoting=csv.QUOTE_ALL, escapechar='\\')
                    # chunk_df.to_csv(csv_file, header=False, index=False, sep=',', quoting=csv.QUOTE_NONNUMERIC, escapechar='\\')

                # Clear the DataFrame to release memory
                chunk_df = None
    print('Stored posts:',counter)
    # Close the CSV file
    csv_file.close()
'''

'\ndef json_chunks_processing(large_filename, json_keys):\n    file = os.path.splitext(large_filename)[0] #without the .json\n    # Directory where the JSON files are located\n    json_files_dir = f\'../input/reddit_input/{file}-split_data/\'\n\n    # Directory where the CSV file is located\n    csv_table_dir = output_dir + f\'{file}-output_table.csv\'\n\n    # Define the column names for the CSV\n    columns = [\'id\', \'year\', \'month\', \'day\', \'title\', \'text\', \'permalink\']\n    counter=0\n    # Open the CSV file for writing in APPEND mode\n    with open(csv_table_dir, \'a\') as csv_file:\n        # Check if the CSV file is empty\n        csv_empty = os.path.getsize(csv_table_dir) == 0\n\n        if csv_empty:\n            # If the file is empty, write the headers\n            pd.DataFrame(columns=columns).to_csv(csv_file, index=False, header=True, sep=\',\')\n\n        for filename in os.listdir(json_files_dir): # Loop through all files in the specified folder\n            

In [82]:
def json_chunks_processing(large_filename, json_keys):
  file_name = os.path.splitext(large_filename)[0] #without the .json
  # Directory where the JSON files are located
  json_files_dir = f'../input/reddit_input/{file_name}-split_data/'

  # Directory where the CSV file is located
  csv_table_dir = output_dir + f'{file_name}-output_table.csv'
  
  stored_post_counter = 0

  with open(csv_table_dir, 'w') as csv_file: # Check if the CSV file is empty
    csv_empty = os.path.getsize(csv_table_dir) == 0

    if csv_empty:
      # If the file is empty, write the headers
      columns = []
      for key in json_keys['texts']:    
        columns.append(key)
      columns = ['id', 'year', 'month', 'day']+columns+['permalink']
      pd.DataFrame(columns=columns).to_csv(csv_file, index=False, header=True, sep=',')

  for chunk_name in tqdm(os.listdir(json_files_dir),desc=file_name): # Loop through all files in the specified folder
    if chunk_name.endswith('.json'):
      chunk_dir = os.path.join(json_files_dir, chunk_name)
      data_chunk = [] # Initialize an empty list to hold the data for this file
      with open(chunk_dir, 'r') as json_file:
        # for line in tqdm(json_file,desc=f'Now processing: {os.path.splitext(chunk_name)[0]}::'): 
        for line in json_file: 
          # Process each line here
          json_data = json.loads(line)
          row = []

          for key in json_keys['texts']:
            value = json_data.get(json_keys['texts'][key],'')
            row.append(value)
          
          id = json_data.get(json_keys['id'],'')
          created_utc = json_data.get(json_keys['time'],'')
          link_path = json_data.get(json_keys['link_path'],'')
          timestamp = datetime.datetime.fromtimestamp(int(created_utc))
          year, month, day = timestamp.strftime('%Y-%m-%d').split('-')
          
          row = [id, year, month, day]+row+[link_path]
          # Format the datetime object as "YYYY MM DD"
          # formatted_date = timestamp.strftime("%Y %m %d")
          if 1514764802 <= int(created_utc) <= 1672531199 :       
            try:
              data_chunk.append(row)
              stored_post_counter += 1
            except:
              continue

      with open(csv_table_dir, mode='a', newline='') as _file:
        _writer = csv.writer(_file, delimiter=',')
        for data in data_chunk:
            _writer.writerow(data)

      data_chunk = []
  print('Stored posts:',stored_post_counter)

### LIWC Function Setup

In [6]:
class liwc:

    def load_liwc_dict(self, liwcdic_file):
        file_content = codecs.open(liwcdic_file, "r", "utf-8").read()
        cate_text = file_content[file_content.find("%")+1:file_content[1:].find("%")].strip()
        for line in cate_text.split("\n"):
            self.liwc_cate_name_by_number[int(line.strip().split("\t")[0])] = line.strip().split("\t")[1]

        dict_text = file_content[file_content[1:].find("%")+2:].strip()
        for line in dict_text.split("\n"):
            self.liwc_cate_number_by_word[line.strip().split("\t")[0]] = set([int(item) for item in line.strip().split("\t")[1:]])

    def __init__(self, liwcdic_file=liwcdic_file_dir):

        self.liwc_category_names = ["WC",'Funct', 'TotPron', 'PronPer', 'Yo', 'Nosotro', 'TuUtd', 'ElElla', 'Ellos', 'PronImp', 'Articulo', 'Verbos', 'VerbAux', 'Pasado', 'Present', 'Futuro', 'Adverb', 'Prepos', 'Conjunc', 'Negacio', 'Cuantif', 'Numeros', 'Maldec', 'verbYO', 'verbTU', 'verbNOS', 'verbosEL', 'verbELLOS', 'Subjuntiv', 'VosUtds', 'formal', 'informal', 'verbVos', 'Social', 'Familia', 'Amigos', 'Humanos', 'Afect', 'EmoPos', 'EmoNeg', 'Ansiedad', 'Enfado', 'Triste', 'MecCog', 'Insight', 'Causa', 'Discrep', 'Tentat', 'Certeza', 'Inhib', 'Incl', 'Excl', 'Percept', 'Ver', 'Oir', 'Sentir', 'Biolog', 'Cuerpo', 'Salud', 'Sexual', 'Ingerir', 'Relativ', 'Movim', 'Espacio', 'Tiempo', 'Trabajo', 'Logro', 'Placer', 'Hogar', 'Dinero', 'Relig', 'Muerte', 'Asentir', 'NoFluen', 'Relleno']
        self.liwc_cate_name_by_number = {}
        self.liwc_cate_number_by_word = {}

        if os.path.exists(liwcdic_file) == False:

            sys.exit()
        else:
            self.load_liwc_dict(liwcdic_file)

    def getLIWCCount(self, text):
        count_by_categories = {"WC":0,'Funct': 0, 'TotPron': 0, 'PronPer': 0, 'Yo': 0, 'Nosotro': 0, 'TuUtd': 0, 'ElElla': 0, 'Ellos': 0, 'PronImp': 0, 'Articulo': 0, 'Verbos': 0, 'VerbAux': 0, 'Pasado': 0, 'Present': 0, 'Futuro': 0, 'Adverb': 0, 'Prepos': 0, 'Conjunc': 0, 'Negacio': 0, 'Cuantif': 0, 'Numeros': 0, 'Maldec': 0, 'verbYO': 0, 'verbTU': 0, 'verbNOS': 0, 'verbosEL': 0, 'verbELLOS': 0, 'Subjuntiv': 0, 'VosUtds': 0, 'formal': 0, 'informal': 0, 'verbVos': 0, 'Social': 0, 'Familia': 0, 'Amigos': 0, 'Humanos': 0, 'Afect': 0, 'EmoPos': 0, 'EmoNeg': 0, 'Ansiedad': 0, 'Enfado': 0, 'Triste': 0, 'MecCog': 0, 'Insight': 0, 'Causa': 0, 'Discrep': 0, 'Tentat': 0, 'Certeza': 0, 'Inhib': 0, 'Incl': 0, 'Excl': 0, 'Percept': 0, 'Ver': 0, 'Oir': 0, 'Sentir': 0, 'Biolog': 0, 'Cuerpo': 0, 'Salud': 0, 'Sexual': 0, 'Ingerir': 0, 'Relativ': 0, 'Movim': 0, 'Espacio': 0, 'Tiempo': 0, 'Trabajo': 0, 'Logro': 0, 'Placer': 0, 'Hogar': 0, 'Dinero': 0, 'Relig': 0, 'Muerte': 0, 'Asentir': 0, 'NoFluen': 0, 'Relleno': 0}

        count_by_categories["WC"] = len(text.split())

        for word in text.split():

            cate_numbers_word_belongs = set([])
            if word in self.liwc_cate_number_by_word:
                cate_numbers_word_belongs = self.liwc_cate_number_by_word[word]

            else:

                #liwc words have *. eg: balcon*
                word = word[:-1]
                while len(word) > 0:
                    if (word+"*") in self.liwc_cate_number_by_word:
                        cate_numbers_word_belongs = self.liwc_cate_number_by_word[word+"*"]
                        break
                    else:
                        word = word[:-1]

            for num in cate_numbers_word_belongs:
                count_by_categories[self.liwc_cate_name_by_number[num]] += 1

        return count_by_categories

## Setup

#### Turn All ZST files into JSON in folder (for Reddir archive downloads)

In [84]:
# Initialize an empty list to store the .zst file names
zst_file_names = []

# Iterate over the files in the folder
for file_name in os.listdir(zst_files_dir):
    if file_name.endswith('.zst'):
        zst_file_names.append(file_name)

# Print the list of .zst file names
for file_name in zst_file_names:
    print(file_name)
for file_name in zst_file_names:
  zst_file = folder_path+file_name
  zst_to_json(zst_file,output_extension='.json',remove_file=True)

#### Preprocess JSON files (split into chunks & turn to .csv)

In [85]:
reddit_files = [
  # 'argentina_submissions.json',
  # 'argentina_comments.json',
  # 'Republica_Argentina_submissions.json',
  # 'Republica_Argentina_comments.json',
  # 'RepublicaArgentina_submissions.json',
  'RepublicaArgentina_comments.json',
]
for reddit_file in reddit_files:
  # Part 1 - Split JSON for low RAM usage
  # json_file_splitter(reddit_file, lines_per_chunk= 5000)
  
  # Part 2 - Tranform and filter to CSV
  # if 'comments' in reddit_file: 
  #   json_keys['text'] = {'text':'body'}
  # elif 'submissions' in reddit_file:
  #   json_keys['text'] = {'text':'selftext','title':'title'}
  json_chunks_processing(large_filename = reddit_file, json_keys=json_keys)

RepublicaArgentina_comments:   0%|          | 0/66 [00:00<?, ?it/s]

KeyboardInterrupt: 

##### Testing if .csv is stored correctly

In [None]:
# Specify the path to your CSV file
csv_file_path = '../output/reddit_output/argentina_submissions-output_table.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path,sep=',', escapechar='\\')

(df.loc[df['id'] == 'acavsj'])['text'].values[0]
# Now you can work with the DataFrame, for example, you can display the first few rows:
print(df.head())

for i in range(0,100):
  row = df.iloc[i]
  print(f'========== {i} ==========')
  print(row['id'])
  print(row['text'])
  print('=======================')

## LIWC

In [7]:
reddit_files_titles = [
  # 'argentina_submissions',
  # 'argentina_comments',
  # 'Republica_Argentina_submissions',
  # 'Republica_Argentina_comments',
  'RepublicaArgentina_submissions',
  # 'RepublicaArgentina_comments',
]

In [93]:
def remove_invalids(col='', df=''):
  invalid_options = [np.nan, 'nan', None, 0, '0', 'NaN', '[deleted]', '[removed]']
  df = df.dropna(subset=[col], how='all')
  df = df[~df[col].isin(invalid_options)]
  return df

for source in reddit_files_titles:

  source_dir = f'../output/reddit_output/{source}-output_table.csv'
  source_output_dir = f'../output/reddit_output/filtered_tables_LIWC_count/{source}-liwc_output.csv'

  df_output_table = pd.read_csv(source_dir,sep=',')

  df_output_table = remove_invalids('text', df_output_table)

  if 'title' in df_output_table.columns: #if its a submission (has title)
      #remove invalid/incomplete obs
      df_output_table = remove_invalids('title', df_output_table)
      #join title and text (yes fillna just in case something passed previous cleanup)
      df_output_table['text'] = df_output_table['title'].fillna('') + '\n' + df_output_table['text'].fillna('')
      #now drop it
      df_output_table = df_output_table.drop(columns=['title'])
      
  tqdm.pandas(desc = f'Computing {source} LIWC')
  # Assuming you have the LIWC counts as a Series

  '''
  liwc_counts = df_output_table['text'].progress_apply(liwc().getLIWCCount)

  # Create new DataFrame from the LIWC counts
  liwc_df = pd.DataFrame(liwc_counts.tolist())
  
  # Concatenate the new DataFrame with the original DataFrame
  df_output_table = pd.concat([df_output_table, liwc_df], axis=1)

  #Now your DataFrame will have columns for each LIWC category
  df_output_table.to_csv(source_output_dir, index=False, sep=',', quoting=csv.QUOTE_ALL, escapechar='\\')
  '''

In [123]:
import re

LIWC_ALL = []
LIWC_dummy = liwc().getLIWCCount('')
HEADERS = list(LIWC_dummy.keys())
HEADERS.sort()
#print(HEADERS)
liwc_vector = []
for head in HEADERS:
    liwc_vector.append(head)

LIWC_ALL.append(liwc_vector)    
for index, row in tqdm(df_output_table.iterrows()):
    #print('Row Index:', index)
    
    text = row['text'].lower().replace('\n',' ')
    #print(text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r' +',' ',text)
    LIWC_raw = liwc().getLIWCCount(text)
    LIWC_norm = {}
    for item in LIWC_raw:
        if item != 'WC':    #==
            LIWC_norm[item] = LIWC_raw[item]/LIWC_raw['WC']
        else:
            LIWC_norm['WC'] = LIWC_raw['WC']
    #print(LIWC_norm)
    liwc_vector = []    
    for head in HEADERS:
        liwc_vector.append(LIWC_norm[head])

    LIWC_ALL.append(liwc_vector)
    # You can access specific column values using row['Column_Name']
    #print()
  

0it [00:00, ?it/s]

In [133]:
LIWC_ALL = []

LIWC_dummy = liwc().getLIWCCount('')
HEADERS = list(LIWC_dummy.keys())
HEADERS.sort()


liwc_vector = []

for head in HEADERS:
    liwc_vector.append(head)

LIWC_ALL.append(liwc_vector)

data_list = []

for index, row in tqdm(df_output_table.iterrows()):
    text = row['text'].lower().replace('\n', ' ')
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r' +', ' ', text)
    #TIDphto +append    
    LIWC_raw = liwc().getLIWCCount(text)
    LIWC_norm = {}
    
    for item in LIWC_raw:
        if item != 'WC':
            LIWC_norm[item] = LIWC_raw[item] / LIWC_raw['WC']
        else:
            LIWC_norm['WC'] = LIWC_raw['WC']
    
    liwc_vector = {}
    
    for head in HEADERS:
        liwc_vector[head] = LIWC_norm[head]
    
    data_list.append(liwc_vector)

# Create a DataFrame from the list of dictionaries
liwc_df = pd.DataFrame(data_list)


2918it [00:55, 52.19it/s]


Unnamed: 0,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,...,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
0,0.048780,0.024390,0.000000,0.000000,0.111498,0.003484,0.017422,0.006969,0.017422,0.076655,...,287,0.024390,0.0,0.003484,0.006969,0.000000,0.003484,0.0,0.034843,0.041812
1,0.045455,0.012987,0.000000,0.000000,0.058442,0.000000,0.019481,0.032468,0.012987,0.051948,...,154,0.006494,0.0,0.000000,0.032468,0.000000,0.000000,0.0,0.012987,0.032468
2,0.059809,0.026316,0.002392,0.002392,0.093301,0.007177,0.014354,0.009569,0.019139,0.062201,...,418,0.028708,0.0,0.000000,0.009569,0.004785,0.000000,0.0,0.023923,0.031100
3,0.074830,0.020408,0.006803,0.000000,0.074830,0.000000,0.034014,0.006803,0.006803,0.095238,...,147,0.006803,0.0,0.000000,0.020408,0.000000,0.000000,0.0,0.034014,0.047619
4,0.033019,0.028302,0.004717,0.009434,0.084906,0.004717,0.009434,0.033019,0.009434,0.070755,...,212,0.009434,0.0,0.009434,0.000000,0.004717,0.000000,0.0,0.014151,0.080189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913,0.059028,0.024306,0.000000,0.006944,0.159722,0.000000,0.017361,0.031250,0.010417,0.093750,...,288,0.003472,0.0,0.000000,0.003472,0.000000,0.000000,0.0,0.013889,0.034722
2914,0.028571,0.021429,0.000000,0.000000,0.085714,0.000000,0.000000,0.014286,0.007143,0.042857,...,140,0.014286,0.0,0.007143,0.000000,0.000000,0.007143,0.0,0.042857,0.042857
2915,0.069930,0.006993,0.000000,0.000000,0.048951,0.000000,0.006993,0.013986,0.000000,0.062937,...,143,0.013986,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.048951,0.090909
2916,0.000000,0.000000,0.000000,0.000000,0.058824,0.000000,0.000000,0.000000,0.000000,0.000000,...,17,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [None]:
# Print the resulting DataFrame
display(liwc_df)


In [137]:
print(len(liwc_df))
print(len(df_output_table))
df_output_table = df_output_table.reset_index(drop=True)
# LIWC_df = pd.DataFrame(LIWC_ALL, columns=HEADERS)
merge_df = pd.concat([df_output_table, liwc_df], axis=1)


2918
2918


In [138]:
print(len(merge_df))

# df_output_table = df_output_table.reset_index(drop=True)
# df_output_table.tail(50)


2918


In [142]:
with pd.option_context('display.max_columns', None):
    display(merge_df)

Unnamed: 0,id,year,month,day,text,permalink,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,Cuantif,Cuerpo,Dinero,Discrep,ElElla,Ellos,EmoNeg,EmoPos,Enfado,Espacio,Excl,Familia,Funct,Futuro,Hogar,Humanos,Incl,Ingerir,Inhib,Insight,Logro,Maldec,MecCog,Movim,Muerte,Negacio,NoFluen,Nosotro,Numeros,Oir,Pasado,Percept,Placer,Prepos,Present,PronImp,PronPer,Relativ,Relig,Relleno,Salud,Sentir,Sexual,Social,Subjuntiv,Tentat,Tiempo,TotPron,Trabajo,Triste,TuUtd,Ver,VerbAux,Verbos,VosUtds,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
0,7np9h2,2018,1,2,[SERIO]Que opinas de dejar propina?\nQuiero sa...,/r/RepublicaArgentina/comments/7np9h2/serioque...,0.048780,0.024390,0.000000,0.000000,0.111498,0.003484,0.017422,0.006969,0.017422,0.076655,0.017422,0.003484,0.031359,0.031359,0.069686,0.017422,0.003484,0.017422,0.000000,0.017422,0.020906,0.000000,0.463415,0.0,0.013937,0.000000,0.045296,0.013937,0.024390,0.020906,0.048780,0.000000,0.233449,0.013937,0.000000,0.020906,0.0,0.000000,0.006969,0.003484,0.017422,0.024390,0.003484,0.108014,0.125436,0.059233,0.101045,0.059233,0.000000,0.0,0.003484,0.000000,0.000000,0.052265,0.013937,0.045296,0.031359,0.160279,0.052265,0.000000,0.003484,0.003484,0.006969,0.146341,0.000000,287,0.024390,0.0,0.003484,0.006969,0.000000,0.003484,0.0,0.034843,0.041812
1,7nq6jb,2018,1,2,"Si en Argentina hay neutralidad de red, ¿cómo ...",/r/RepublicaArgentina/comments/7nq6jb/si_en_ar...,0.045455,0.012987,0.000000,0.000000,0.058442,0.000000,0.019481,0.032468,0.012987,0.051948,0.045455,0.000000,0.006494,0.006494,0.045455,0.019481,0.019481,0.012987,0.012987,0.038961,0.032468,0.000000,0.500000,0.0,0.000000,0.000000,0.058442,0.012987,0.000000,0.051948,0.006494,0.000000,0.285714,0.025974,0.000000,0.032468,0.0,0.000000,0.012987,0.006494,0.006494,0.038961,0.012987,0.175325,0.116883,0.103896,0.051948,0.058442,0.000000,0.0,0.000000,0.000000,0.012987,0.077922,0.012987,0.051948,0.000000,0.155844,0.032468,0.000000,0.000000,0.012987,0.012987,0.136364,0.000000,154,0.006494,0.0,0.000000,0.032468,0.000000,0.000000,0.0,0.012987,0.032468
2,7nqghb,2018,1,2,[Consulta] ¿Comunicación médico-paciente en ca...,/r/RepublicaArgentina/comments/7nqghb/consulta...,0.059809,0.026316,0.002392,0.002392,0.093301,0.007177,0.014354,0.009569,0.019139,0.062201,0.021531,0.000000,0.000000,0.019139,0.064593,0.011962,0.004785,0.014354,0.002392,0.043062,0.026316,0.004785,0.519139,0.0,0.007177,0.004785,0.066986,0.004785,0.002392,0.050239,0.011962,0.000000,0.263158,0.009569,0.007177,0.021531,0.0,0.000000,0.019139,0.016746,0.009569,0.038278,0.009569,0.133971,0.124402,0.081340,0.095694,0.076555,0.000000,0.0,0.009569,0.000000,0.000000,0.095694,0.019139,0.062201,0.035885,0.177033,0.031100,0.000000,0.014354,0.004785,0.014354,0.143541,0.002392,418,0.028708,0.0,0.000000,0.009569,0.004785,0.000000,0.0,0.023923,0.031100
3,7o5gzl,2018,1,4,"Consulta a los mods, por banderitas (¿flair?)\...",/r/RepublicaArgentina/comments/7o5gzl/consulta...,0.074830,0.020408,0.006803,0.000000,0.074830,0.000000,0.034014,0.006803,0.006803,0.095238,0.047619,0.000000,0.000000,0.006803,0.054422,0.040816,0.006803,0.013605,0.006803,0.040816,0.061224,0.000000,0.496599,0.0,0.000000,0.000000,0.020408,0.034014,0.000000,0.020408,0.006803,0.000000,0.306122,0.020408,0.000000,0.034014,0.0,0.000000,0.034014,0.020408,0.013605,0.074830,0.006803,0.074830,0.170068,0.136054,0.061224,0.054422,0.000000,0.0,0.000000,0.000000,0.000000,0.088435,0.034014,0.074830,0.006803,0.197279,0.000000,0.000000,0.000000,0.006803,0.006803,0.190476,0.000000,147,0.006803,0.0,0.000000,0.020408,0.000000,0.000000,0.0,0.034014,0.047619
4,7o7cup,2018,1,4,Facundoaranagate\n**Resumen:**\n\nFacundo Aran...,/r/RepublicaArgentina/comments/7o7cup/facundoa...,0.033019,0.028302,0.004717,0.009434,0.084906,0.004717,0.009434,0.033019,0.009434,0.070755,0.042453,0.004717,0.004717,0.004717,0.056604,0.037736,0.018868,0.004717,0.009434,0.051887,0.033019,0.000000,0.504717,0.0,0.009434,0.004717,0.080189,0.004717,0.009434,0.042453,0.014151,0.000000,0.273585,0.028302,0.000000,0.023585,0.0,0.000000,0.000000,0.014151,0.028302,0.023585,0.009434,0.127358,0.108491,0.094340,0.075472,0.117925,0.000000,0.0,0.000000,0.000000,0.000000,0.113208,0.009434,0.066038,0.042453,0.169811,0.009434,0.000000,0.009434,0.004717,0.023585,0.150943,0.000000,212,0.009434,0.0,0.009434,0.000000,0.004717,0.000000,0.0,0.014151,0.080189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913,zu19gt,2022,12,24,Idea para solucionar el quilombo de la coparti...,/r/RepublicaArgentina/comments/zu19gt/idea_par...,0.059028,0.024306,0.000000,0.006944,0.159722,0.000000,0.017361,0.031250,0.010417,0.093750,0.027778,0.003472,0.017361,0.017361,0.079861,0.020833,0.010417,0.010417,0.010417,0.062500,0.038194,0.000000,0.534722,0.0,0.003472,0.000000,0.079861,0.013889,0.000000,0.024306,0.013889,0.003472,0.239583,0.024306,0.000000,0.017361,0.0,0.000000,0.017361,0.003472,0.006944,0.024306,0.017361,0.118056,0.090278,0.055556,0.083333,0.083333,0.000000,0.0,0.000000,0.000000,0.000000,0.052083,0.006944,0.024306,0.000000,0.138889,0.010417,0.000000,0.000000,0.010417,0.003472,0.121528,0.000000,288,0.003472,0.0,0.000000,0.003472,0.000000,0.000000,0.0,0.013889,0.034722
2914,zugt2e,2022,12,24,Impuestos de aduana para una figura articulada...,/r/RepublicaArgentina/comments/zugt2e/impuesto...,0.028571,0.021429,0.000000,0.000000,0.085714,0.000000,0.000000,0.014286,0.007143,0.042857,0.021429,0.000000,0.057143,0.014286,0.064286,0.021429,0.000000,0.021429,0.000000,0.050000,0.014286,0.000000,0.500000,0.0,0.000000,0.000000,0.071429,0.000000,0.000000,0.035714,0.007143,0.000000,0.228571,0.021429,0.000000,0.021429,0.0,0.000000,0.007143,0.014286,0.014286,0.021429,0.007143,0.142857,0.107143,0.100000,0.085714,0.085714,0.000000,0.0,0.000000,0.000000,0.000000,0.100000,0.000000,0.035714,0.014286,0.185714,0.035714,0.000000,0.014286,0.000000,0.007143,0.128571,0.000000,140,0.014286,0.0,0.007143,0.000000,0.000000,0.007143,0.0,0.042857,0.042857
2915,zvnumc,2022,12,26,duda sobre la reciente apertura de portabilida...,/r/RepublicaArgentina/comments/zvnumc/duda_sob...,0.069930,0.006993,0.000000,0.000000,0.048951,0.000000,0.006993,0.013986,0.000000,0.062937,0.027972,0.000000,0.000000,0.013986,0.041958,0.027972,0.000000,0.006993,0.000000,0.027972,0.027972,0.000000,0.489510,0.0,0.000000,0.000000,0.048951,0.006993,0.000000,0.027972,0.000000,0.000000,0.216783,0.020979,0.000000,0.041958,0.0,0.000000,0.006993,0.013986,0.006993,0.041958,0.000000,0.174825,0.132867,0.062937,0.055944,0.076923,0.000000,0.0,0.000000,0.006993,0.000000,0.055944,0.034965,0.090909,0.041958,0.118881,0.020979,0.000000,0.000000,0.006993,0.041958,0.146853,0.000000,143,0.013986,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.048951,0.090909
2916,zz0ksc,2022,12,30,Que le preguntarias\nQue le preguntarias a alg...,/r/RepublicaArgentina/comments/zz0ksc/que_le_p...,0.000000,0.000000,0.000000,0.000000,0.058824,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.117647,0.000000,0.058824,0.000000,0.000000,0.117647,0.000000,0.000000,0.588235,0.0,0.000000,0.000000,0.117647,0.000000,0.000000,0.000000,0.058824,0.000000,0.235294,0.000000,0.117647,0.000000,0.0,0.000000,0.058824,0.000000,0.000000,0.000000,0.000000,0.176471,0.058824,0.235294,0.117647,0.117647,0.000000,0.0,0.000000,0.000000,0.000000,0.176471,0.000000,0.058824,0.000000,0.352941,0.058824,0.058824,0.117647,0.000000,0.000000,0.058824,0.000000,17,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [147]:
print_df = merge_df.copy()
print_df.drop(columns=['text','permalink'], inplace=True)

print_df.to_csv('---tmp_liwc.csv', index=False, sep=',')

In [None]:
# Assuming 'df_copy' is your DataFrame
count_non_zero_WC = (df_copy['text_WC'] != 0).sum()

print("Number of rows with 'text_WC' different from 0:", count_non_zero_WC)

NameError: name 'df_copy' is not defined

In [150]:
# for source in tqdm(reddit_files_titles):
#   source_dir = f'../output/reddit_output/filtered_tables_LIWC_count/{source}-liwc_output.csv'
#   output_dir = f'../output/reddit_output/daily_LIWC_averages/{source}-liwc_output.csv'

df = pd.read_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/programs/---tmp_liwc.csv',sep=',')
LIWC_keys = list(
  df.columns[
    list(df.columns).index('day') + 1 #Gets the last column before LIWC count
    :  ]
  )

for key in LIWC_keys:
    df[key] = pd.to_numeric(df[key], errors='coerce')

df.head(20)

result_df = df.groupby(['year', 'month', 'day'])[LIWC_keys].mean().reset_index()

# Calculate the count of rows for each date
count_df = df.groupby(['year', 'month', 'day'])['id'].count().reset_index()
count_df.rename(columns={'id': 'Obs'}, inplace=True)

# Merge the average data with the count data
result_df = pd.merge(result_df, count_df, on=['year', 'month', 'day'])

# Save the results to a new DataFrame
# result_df.to_csv('averages_by_date_with_count.csv', index=False)
result_df.to_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/programs/---grouped-tmp_liwc.csv', index=False, sep=',', quoting=csv.QUOTE_ALL, escapechar='\\')

In [151]:
from IPython.display import display

with pd.option_context('display.max_columns', None):
    display(result_df.head(10))


Unnamed: 0,year,month,day,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,Conjunc,Cuantif,Cuerpo,Dinero,Discrep,ElElla,Ellos,EmoNeg,EmoPos,Enfado,Espacio,Excl,Familia,Funct,Futuro,Hogar,Humanos,Incl,Ingerir,Inhib,Insight,Logro,Maldec,MecCog,Movim,Muerte,Negacio,NoFluen,Nosotro,Numeros,Oir,Pasado,Percept,Placer,Prepos,Present,PronImp,PronPer,Relativ,Relig,Relleno,Salud,Sentir,Sexual,Social,Subjuntiv,Tentat,Tiempo,TotPron,Trabajo,Triste,TuUtd,Ver,VerbAux,Verbos,VosUtds,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL,Obs
0,2018,1,2,0.051348,0.021231,0.000797,0.000797,0.087747,0.003554,0.017085,0.016335,0.016516,0.063601,0.028136,0.001161,0.012617,0.018997,0.059911,0.016288,0.00925,0.014921,0.005126,0.033148,0.026563,0.001595,0.494184,0.0,0.007038,0.001595,0.056908,0.01057,0.008928,0.041031,0.022412,0.0,0.260774,0.016494,0.002392,0.024968,0.0,0.0,0.013031,0.008908,0.011161,0.033876,0.00868,0.139103,0.12224,0.08149,0.082896,0.064743,0.0,0.0,0.004351,0.0,0.004329,0.075294,0.015354,0.053148,0.022415,0.164385,0.038611,0.0,0.005946,0.007085,0.011437,0.142082,0.000797,286.333333,0.019864,0.0,0.001161,0.016335,0.001595,0.001161,0.0,0.023918,0.035127,3
1,2018,1,4,0.053924,0.024355,0.00576,0.004717,0.079868,0.002358,0.021724,0.019911,0.008118,0.082996,0.045036,0.002358,0.002358,0.00576,0.055513,0.039276,0.012835,0.009161,0.008118,0.046352,0.047122,0.0,0.500658,0.0,0.004717,0.002358,0.050298,0.019365,0.004717,0.03143,0.010477,0.0,0.289854,0.024355,0.0,0.028799,0.0,0.0,0.017007,0.01728,0.020954,0.049207,0.008118,0.101094,0.139279,0.115197,0.068348,0.086173,0.0,0.0,0.0,0.0,0.0,0.100821,0.021724,0.070434,0.024628,0.183545,0.004717,0.0,0.004717,0.00576,0.015194,0.17071,0.0,179.5,0.008118,0.0,0.004717,0.010204,0.002358,0.0,0.0,0.024082,0.063904,2
2,2018,1,6,0.020408,0.040816,0.0,0.0,0.040816,0.0,0.020408,0.0,0.0,0.061224,0.0,0.0,0.0,0.0,0.040816,0.020408,0.020408,0.020408,0.020408,0.020408,0.020408,0.0,0.469388,0.0,0.0,0.0,0.081633,0.0,0.0,0.061224,0.020408,0.0,0.244898,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.020408,0.040816,0.020408,0.142857,0.142857,0.122449,0.061224,0.040816,0.0,0.0,0.0,0.020408,0.020408,0.061224,0.020408,0.061224,0.020408,0.183673,0.0,0.0,0.0,0.0,0.020408,0.142857,0.0,49.0,0.020408,0.0,0.0,0.020408,0.0,0.020408,0.0,0.020408,0.0,1
3,2018,1,7,0.046875,0.05125,0.0,0.0,0.1025,0.0,0.015625,0.0,0.035625,0.046875,0.05125,0.0,0.0,0.0,0.1025,0.05125,0.0,0.05125,0.0,0.055625,0.015625,0.0,0.525625,0.0,0.0,0.0,0.055625,0.015625,0.0,0.086875,0.0,0.0,0.311875,0.0,0.0,0.015625,0.0,0.0,0.02,0.0,0.0,0.066875,0.0,0.106875,0.118125,0.11125,0.1025,0.07125,0.0,0.0,0.0,0.0,0.0,0.09125,0.0,0.035625,0.015625,0.21375,0.0,0.0,0.0,0.0,0.0,0.138125,0.0,28.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.055625,2
4,2018,1,8,0.029762,0.02381,0.0,0.0,0.136905,0.0,0.005952,0.005952,0.011905,0.053571,0.011905,0.0,0.0,0.011905,0.029762,0.011905,0.0,0.029762,0.0,0.017857,0.029762,0.0,0.375,0.0,0.0,0.005952,0.041667,0.005952,0.0,0.017857,0.017857,0.005952,0.14881,0.0,0.0,0.011905,0.0,0.017857,0.011905,0.0,0.011905,0.017857,0.0,0.071429,0.089286,0.047619,0.035714,0.041667,0.005952,0.0,0.0,0.005952,0.0,0.095238,0.005952,0.017857,0.02381,0.083333,0.011905,0.0,0.005952,0.005952,0.041667,0.107143,0.0,168.0,0.005952,0.0,0.0,0.005952,0.017857,0.0,0.0,0.017857,0.035714,1
5,2018,1,9,0.042553,0.042553,0.0,0.0,0.12766,0.0,0.0,0.021277,0.0,0.042553,0.021277,0.0,0.0,0.0,0.06383,0.021277,0.0,0.06383,0.0,0.042553,0.042553,0.0,0.489362,0.0,0.0,0.0,0.021277,0.0,0.0,0.06383,0.0,0.0,0.191489,0.021277,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.042553,0.021277,0.170213,0.085106,0.106383,0.085106,0.085106,0.0,0.0,0.0,0.021277,0.0,0.042553,0.0,0.06383,0.021277,0.191489,0.042553,0.0,0.0,0.021277,0.0,0.085106,0.021277,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,1
6,2018,1,11,0.1,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.3,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.05,0.1,0.15,0.05,0.0,0.05,0.0,0.0,0.0,0.0,0.1,0.0,0.05,0.0,0.2,0.0,0.0,0.05,0.0,0.0,0.1,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,2018,1,12,0.034975,0.077362,0.0,0.0,0.122744,0.01018,0.017992,0.019456,0.009513,0.036569,0.01031,0.0,0.0,0.001701,0.032371,0.017089,0.017886,0.059475,0.014485,0.057799,0.012808,0.002604,0.457541,0.0,0.0,0.004305,0.034975,0.0,0.001701,0.040744,0.003401,0.001701,0.138272,0.005102,0.0,0.01031,0.0,0.0,0.002604,0.006803,0.014722,0.044382,0.004305,0.132745,0.111694,0.039435,0.079193,0.105346,0.0,0.0,0.0,0.02036,0.020597,0.056393,0.012011,0.012808,0.044145,0.118627,0.001701,0.001701,0.010977,0.013818,0.0,0.138296,0.0,70.75,0.045121,0.0,0.001701,0.017089,0.0,0.0,0.0,0.019693,0.037816,4
8,2018,1,14,0.075269,0.096774,0.0,0.0,0.075269,0.0,0.032258,0.010753,0.032258,0.064516,0.021505,0.010753,0.0,0.010753,0.021505,0.021505,0.0,0.096774,0.0,0.021505,0.032258,0.0,0.408602,0.0,0.0,0.0,0.032258,0.0,0.0,0.010753,0.0,0.0,0.150538,0.064516,0.0,0.021505,0.0,0.010753,0.010753,0.0,0.032258,0.0,0.0,0.107527,0.139785,0.053763,0.032258,0.129032,0.0,0.0,0.0,0.0,0.032258,0.086022,0.010753,0.032258,0.064516,0.086022,0.032258,0.0,0.0,0.0,0.032258,0.182796,0.0,93.0,0.010753,0.0,0.0,0.0,0.010753,0.0,0.0,0.075269,0.0,1
9,2018,1,17,0.041096,0.020548,0.006849,0.0,0.082192,0.013699,0.0,0.020548,0.013699,0.082192,0.020548,0.0,0.0,0.0,0.041096,0.006849,0.0,0.027397,0.0,0.061644,0.027397,0.0,0.513699,0.0,0.0,0.0,0.054795,0.0,0.006849,0.006849,0.047945,0.0,0.205479,0.034247,0.0,0.006849,0.0,0.006849,0.020548,0.027397,0.013699,0.034247,0.013699,0.089041,0.109589,0.09589,0.109589,0.109589,0.0,0.0,0.0,0.006849,0.0,0.116438,0.020548,0.041096,0.041096,0.205479,0.061644,0.0,0.034247,0.0,0.0,0.123288,0.006849,146.0,0.034247,0.0,0.027397,0.020548,0.0,0.0,0.0,0.006849,0.027397,1


In [152]:
import pandas as pd
import statsmodels.api as sm

# Load your data into a DataFrame
df_embi = pd.read_csv('/Volumes/Drakôn Kholkikos - 2TB/Tesis-Grado/input/EMBI.csv')

In [153]:
import pandas as pd
import statsmodels.api as sm

# Load your data, assuming you have already loaded df_embi and df_liwc

# Set the date range
start_date = '2018-01-01'
end_date = '2022-12-31'

# Convert 'year', 'month', and 'day' columns to datetime
df_embi['date'] = pd.to_datetime(df_embi[['year', 'month', 'day']], format='%Y-%m-%d')
result_df['date'] = pd.to_datetime(result_df[['year', 'month', 'day']], format='%Y-%m-%d')

# Filter the data to the date range
df_embi = df_embi[(df_embi['date'] >= start_date) & (df_embi['date'] <= end_date)].dropna()#.drop(columns=['year', 'month', 'day'])
df_liwc = result_df[(result_df['date'] >= start_date) & (result_df['date'] <= end_date)].dropna()#.drop(columns=['year', 'month', 'day'])

df_embi = df_embi.drop(columns=['year', 'month', 'day'])
df_liwc = df_liwc.drop(columns=['year', 'month', 'day','Obs'])

# Set 'date' as the index for both DataFrames
df_embi.set_index('date', inplace=True)
df_liwc.set_index('date', inplace=True)

# Perform an inner merge on the 'date' index to ensure common dates
merged_data = df_embi.merge(df_liwc, left_index=True, right_index=True)



In [154]:
display(merged_data.head(3))

Unnamed: 0_level_0,EMBI,Adverb,Afect,Amigos,Ansiedad,Articulo,Asentir,Biolog,Causa,Certeza,...,WC,Yo,formal,informal,verbELLOS,verbNOS,verbTU,verbVos,verbYO,verbosEL
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,347.24,0.051348,0.021231,0.000797,0.000797,0.087747,0.003554,0.017085,0.016335,0.016516,...,286.333333,0.019864,0.0,0.001161,0.016335,0.001595,0.001161,0.0,0.023918,0.035127
2018-01-04,356.23,0.053924,0.024355,0.00576,0.004717,0.079868,0.002358,0.021724,0.019911,0.008118,...,179.5,0.008118,0.0,0.004717,0.010204,0.002358,0.0,0.0,0.024082,0.063904
2018-01-08,365.45,0.029762,0.02381,0.0,0.0,0.136905,0.0,0.005952,0.005952,0.011905,...,168.0,0.005952,0.0,0.0,0.005952,0.017857,0.0,0.0,0.017857,0.035714


In [155]:
# Separate the dependent variable (y) and independent variables (X)
y = list(merged_data['EMBI'])
X = merged_data.drop(columns=['EMBI']).values.tolist()

# Perform the regression analysis
X = sm.add_constant(X)


In [158]:
# Check the data types of y and X
#print("Data type of y:", type(y))
#print("Data type of X:", type(X))
#print(y)
#print(X)
print(len(y))
print(len(X))
Y_temp = []
for item in y:
    Y_temp.append(float(item.replace(',', '')))

y = list(Y_temp)
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and fit the linear regression model on the training data
reg = LinearRegression()
reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg.predict(X_test)


# Calculate statistics to evaluate the model
mse = mean_squared_error(np.array(y_test), y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)

936
936
Mean Squared Error: 845035.7954909843
R-squared (R2) Score: -0.10999058308769061


In [86]:
X[0]

array([ 1.        , 35.66666667, 15.66666667,  6.        ,  2.66666667,
        0.        ,  0.33333333,  0.        ,  2.66666667,  1.33333333,
        3.33333333,  2.33333333,  4.        ,  0.33333333,  0.        ,
        4.        ,  0.        ,  1.        ,  4.33333333,  2.        ,
        0.        ,  0.66666667,  0.33333333,  0.        ,  0.66666667,
        0.33333333,  0.        ,  1.        ,  0.33333333,  0.33333333,
        0.        ,  0.        ,  0.        ,  0.        ,  2.66666667,
        0.        ,  0.        ,  0.        ,  1.66666667,  1.33333333,
        0.33333333,  0.        ,  0.33333333,  0.        ,  8.66666667,
        2.33333333,  0.        ,  0.        ,  1.66666667,  0.66666667,
        0.        ,  2.33333333,  0.66666667,  2.        ,  0.        ,
        0.        ,  0.33333333,  0.66666667,  0.        ,  0.        ,
        0.33333333,  0.33333333,  1.66666667,  0.        ,  1.33333333,
        0.33333333,  0.        ,  0.33333333,  0.33333333,  0.  

In [60]:
model = sm.OLS(y, X.astype(float))
results = model.fit()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
model = sm.OLS(y,X)
results = model.fit()
results.params

# model = sm.OLS(y, X).fit()

# Print the regression results
# print(model.summary())


In [37]:
for x in X:
    print(x)
# X_2 = X.values.tolist()
# X_2[1][:2]

[ 1.         35.66666667 15.66666667  6.          2.66666667  0.
  0.33333333  0.          2.66666667  1.33333333  3.33333333  2.33333333
  4.          0.33333333  0.          4.          0.          1.
  4.33333333  2.          0.          0.66666667  0.33333333  0.
  0.66666667  0.33333333  0.          1.          0.33333333  0.33333333
  0.          0.          0.          0.          2.66666667  0.
  0.          0.          1.66666667  1.33333333  0.33333333  0.
  0.33333333  0.          8.66666667  2.33333333  0.          0.
  1.66666667  0.66666667  0.          2.33333333  0.66666667  2.
  0.          0.          0.33333333  0.66666667  0.          0.
  0.33333333  0.33333333  1.66666667  0.          1.33333333  0.33333333
  0.          0.33333333  0.33333333  0.          0.          0.
  0.          0.          0.          0.        ]
[ 1.  36.  16.5  5.5  2.   0.   0.   0.   2.   1.   3.5  3.   2.   0.
  0.   2.   0.   1.5  5.5  2.   1.   0.5  0.   0.   0.   0.   0.   1.
  0.  

In [20]:
print(y)#one col


['347.24', '356.23', '365.45', '360.94', '363.59', '360', '365.37', '366.58', '368.72', '376.11', '378.91', '374.5', '377.82', '398.27', '395.44', '384.38', '407.12', '428.01', '414.47', '423.85', '415.43', '392.41', '398.73', '394.81', '403.55', '405.44', '397.66', '405.11', '418.72', '413.28', '411.17', '409.11', '415.57', '407.53', '388.51', '403.9', '420.94', '412.63', '420.11', '427.19', '421.19', '419.5', '425.3', '417.38', '416.93', '419.34', '414.85', '405.57', '395.99', '398.23', '400.14', '402.62', '418.68', '419.15', '432.33', '469.33', '453.56', '488.62', '488.35', '467.46', '479.25', '480.21', '448.79', '459.53', '457.2', '453.72', '496.67', '500.59', '521.16', '511.8', '480.21', '472.92', '465.31', '478.71', '473.87', '485.66', '502.35', '549.91', '565.52', '542.57', '533.21', '546.32', '558.33', '583.91', '608.42', '563.98', '566.72', '578.1', '568.56', '575.42', '582.76', '574.46', '581.65', '571.87', '576.13', '574.35', '553.42', '548.98', '549.56', '555.77', '558.82',

In [None]:
print(X)#list of lists [35 15 6 2.6]

In [None]:
while pd.option_context('display.max_rows', None):
    display(X.dtypes)
    display(y.dtypes)


In [None]:
# Set the date range
start_date = '2018-01-01'
end_date = '2022-12-31'

# Convert 'year', 'month', and 'day' columns to datetime
df_embi['date'] = pd.to_datetime(df_embi[['year', 'month', 'day']], format='%Y-%m-%d')

# Create a boolean mask for the date range
mask = (df_embi['date'] >= start_date) & (df_embi['date'] <= end_date)

# Apply the mask to select rows within the date range
selected_data = df_embi[mask]
selected_data.set_index("date", inplace=True)

# Convert 'year', 'month', and 'day' columns to datetime
result_df['date'] = pd.to_datetime(result_df[['year', 'month', 'day']], format='%Y-%m-%d')

# Create a boolean mask for the date range
mask = (result_df['date'] >= start_date) & (result_df['date'] <= end_date)

# Apply the mask to select rows within the date range
df_liwc = result_df[mask]
df_liwc.set_index("date", inplace=True)

In [86]:
y = selected_data['EMBI']
X = df_liwc[['WC', 'Funct', 'TotPron', 'PronPer', 'Yo', 'Nosotro', 'TuUtd', 'ElElla', 'Ellos', 'PronImp',
          'Articulo', 'Verbos', 'VerbAux', 'Pasado', 'Present', 'Futuro', 'Adverb', 'Prepos', 'Conjunc',
          'Negacio', 'Cuantif', 'Numeros', 'Maldec', 'verbYO', 'verbTU', 'verbNOS', 'verbosEL',
          'verbELLOS', 'Subjuntiv', 'VosUtds', 'formal', 'informal', 'verbVos', 'Social', 'Familia',
          'Amigos', 'Humanos', 'Afect', 'EmoPos', 'EmoNeg', 'Ansiedad', 'Enfado', 'Triste', 'MecCog',
          'Insight', 'Causa', 'Discrep', 'Tentat', 'Certeza', 'Inhib', 'Incl', 'Excl', 'Percept', 'Ver',
          'Oir', 'Sentir', 'Biolog', 'Cuerpo', 'Salud', 'Sexual', 'Ingerir', 'Relativ', 'Movim', 'Espacio',
          'Tiempo', 'Trabajo', 'Logro', 'Placer', 'Hogar', 'Dinero', 'Relig', 'Muerte', 'Asentir', 'NoFluen',
          'Relleno']]

# Extract common dates
common_dates = selected_data.index.intersection(df_liwc.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')

# Filter X and y using the extracted date
X = df_liwc.loc[common_dates]
y = selected_data['EMBI'].loc[common_dates]

In [95]:

y = pd.to_numeric(y, errors='coerce')

for key in X:
    X[key] = pd.to_numeric(X[key], errors='coerce')
X = X.dropna()
y = y.dropna()
# y = y[X.index]  # Update y to match the new X
# Extract common dates
common_dates = selected_data.index.intersection(df_liwc.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')



# Extract common dates
common_dates = selected_data.index.intersection(X.index)
common_dates_str = common_dates.strftime('%Y-%m-%d')
# Filter X and y using the extracted date
X = X.loc[common_dates]
y = y.loc[common_dates]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[key] = pd.to_numeric(X[key], errors='coerce')


KeyError: "[Timestamp('2018-01-01 00:00:00'), Timestamp('2018-01-11 00:00:00'), Timestamp('2018-01-15 00:00:00'), Timestamp('2018-02-19 00:00:00'), Timestamp('2018-03-30 00:00:00'), Timestamp('2018-05-28 00:00:00'), Timestamp('2018-07-04 00:00:00'), Timestamp('2018-10-08 00:00:00'), Timestamp('2018-11-12 00:00:00'), Timestamp('2018-11-22 00:00:00'), Timestamp('2018-12-05 00:00:00'), Timestamp('2018-12-25 00:00:00'), Timestamp('2019-01-01 00:00:00'), Timestamp('2019-01-21 00:00:00'), Timestamp('2019-02-18 00:00:00'), Timestamp('2019-04-19 00:00:00'), Timestamp('2019-05-27 00:00:00'), Timestamp('2019-06-03 00:00:00'), Timestamp('2019-07-04 00:00:00'), Timestamp('2019-08-12 00:00:00'), Timestamp('2019-08-13 00:00:00'), Timestamp('2019-08-14 00:00:00'), Timestamp('2019-08-15 00:00:00'), Timestamp('2019-08-16 00:00:00'), Timestamp('2019-08-19 00:00:00'), Timestamp('2019-08-20 00:00:00'), Timestamp('2019-08-21 00:00:00'), Timestamp('2019-08-22 00:00:00'), Timestamp('2019-08-23 00:00:00'), Timestamp('2019-08-26 00:00:00'), Timestamp('2019-08-27 00:00:00'), Timestamp('2019-08-28 00:00:00'), Timestamp('2019-08-29 00:00:00'), Timestamp('2019-08-30 00:00:00'), Timestamp('2019-09-02 00:00:00'), Timestamp('2019-09-03 00:00:00'), Timestamp('2019-09-04 00:00:00'), Timestamp('2019-09-05 00:00:00'), Timestamp('2019-09-06 00:00:00'), Timestamp('2019-09-09 00:00:00'), Timestamp('2019-09-10 00:00:00'), Timestamp('2019-09-11 00:00:00'), Timestamp('2019-09-12 00:00:00'), Timestamp('2019-09-13 00:00:00'), Timestamp('2019-09-16 00:00:00'), Timestamp('2019-09-17 00:00:00'), Timestamp('2019-09-18 00:00:00'), Timestamp('2019-09-19 00:00:00'), Timestamp('2019-09-20 00:00:00'), Timestamp('2019-09-23 00:00:00'), Timestamp('2019-09-24 00:00:00'), Timestamp('2019-09-25 00:00:00'), Timestamp('2019-09-26 00:00:00'), Timestamp('2019-09-27 00:00:00'), Timestamp('2019-09-30 00:00:00'), Timestamp('2019-10-01 00:00:00'), Timestamp('2019-10-02 00:00:00'), Timestamp('2019-10-03 00:00:00'), Timestamp('2019-10-04 00:00:00'), Timestamp('2019-10-07 00:00:00'), Timestamp('2019-10-08 00:00:00'), Timestamp('2019-10-09 00:00:00'), Timestamp('2019-10-10 00:00:00'), Timestamp('2019-10-11 00:00:00'), Timestamp('2019-10-14 00:00:00'), Timestamp('2019-10-15 00:00:00'), Timestamp('2019-10-16 00:00:00'), Timestamp('2019-10-17 00:00:00'), Timestamp('2019-10-18 00:00:00'), Timestamp('2019-10-21 00:00:00'), Timestamp('2019-10-22 00:00:00'), Timestamp('2019-10-23 00:00:00'), Timestamp('2019-10-24 00:00:00'), Timestamp('2019-10-25 00:00:00'), Timestamp('2019-10-28 00:00:00'), Timestamp('2019-10-29 00:00:00'), Timestamp('2019-10-30 00:00:00'), Timestamp('2019-10-31 00:00:00'), Timestamp('2019-11-01 00:00:00'), Timestamp('2019-11-04 00:00:00'), Timestamp('2019-11-05 00:00:00'), Timestamp('2019-11-06 00:00:00'), Timestamp('2019-11-07 00:00:00'), Timestamp('2019-11-08 00:00:00'), Timestamp('2019-11-11 00:00:00'), Timestamp('2019-11-12 00:00:00'), Timestamp('2019-11-13 00:00:00'), Timestamp('2019-11-14 00:00:00'), Timestamp('2019-11-15 00:00:00'), Timestamp('2019-11-18 00:00:00'), Timestamp('2019-11-19 00:00:00'), Timestamp('2019-11-20 00:00:00'), Timestamp('2019-11-21 00:00:00'), Timestamp('2019-11-22 00:00:00'), Timestamp('2019-11-25 00:00:00'), Timestamp('2019-11-26 00:00:00'), Timestamp('2019-11-27 00:00:00'), Timestamp('2019-11-28 00:00:00'), Timestamp('2019-11-29 00:00:00'), Timestamp('2019-12-02 00:00:00'), Timestamp('2019-12-03 00:00:00'), Timestamp('2019-12-04 00:00:00'), Timestamp('2019-12-05 00:00:00'), Timestamp('2019-12-06 00:00:00'), Timestamp('2019-12-09 00:00:00'), Timestamp('2019-12-10 00:00:00'), Timestamp('2019-12-11 00:00:00'), Timestamp('2019-12-12 00:00:00'), Timestamp('2019-12-13 00:00:00'), Timestamp('2019-12-16 00:00:00'), Timestamp('2019-12-17 00:00:00'), Timestamp('2019-12-18 00:00:00'), Timestamp('2019-12-19 00:00:00'), Timestamp('2019-12-20 00:00:00'), Timestamp('2019-12-23 00:00:00'), Timestamp('2019-12-24 00:00:00'), Timestamp('2019-12-25 00:00:00'), Timestamp('2019-12-26 00:00:00'), Timestamp('2019-12-27 00:00:00'), Timestamp('2019-12-30 00:00:00'), Timestamp('2019-12-31 00:00:00'), Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00'), Timestamp('2020-01-03 00:00:00'), Timestamp('2020-01-06 00:00:00'), Timestamp('2020-01-07 00:00:00'), Timestamp('2020-01-08 00:00:00'), Timestamp('2020-01-09 00:00:00'), Timestamp('2020-01-10 00:00:00'), Timestamp('2020-01-13 00:00:00'), Timestamp('2020-01-14 00:00:00'), Timestamp('2020-01-15 00:00:00'), Timestamp('2020-01-16 00:00:00'), Timestamp('2020-01-17 00:00:00'), Timestamp('2020-01-20 00:00:00'), Timestamp('2020-01-21 00:00:00'), Timestamp('2020-01-22 00:00:00'), Timestamp('2020-01-23 00:00:00'), Timestamp('2020-01-24 00:00:00'), Timestamp('2020-01-27 00:00:00'), Timestamp('2020-01-28 00:00:00'), Timestamp('2020-01-29 00:00:00'), Timestamp('2020-01-30 00:00:00'), Timestamp('2020-01-31 00:00:00'), Timestamp('2020-02-03 00:00:00'), Timestamp('2020-02-04 00:00:00'), Timestamp('2020-02-05 00:00:00'), Timestamp('2020-02-06 00:00:00'), Timestamp('2020-02-07 00:00:00'), Timestamp('2020-02-10 00:00:00'), Timestamp('2020-02-11 00:00:00'), Timestamp('2020-02-12 00:00:00'), Timestamp('2020-02-13 00:00:00'), Timestamp('2020-02-14 00:00:00'), Timestamp('2020-02-17 00:00:00'), Timestamp('2020-02-18 00:00:00'), Timestamp('2020-02-19 00:00:00'), Timestamp('2020-02-20 00:00:00'), Timestamp('2020-02-21 00:00:00'), Timestamp('2020-02-24 00:00:00'), Timestamp('2020-02-25 00:00:00'), Timestamp('2020-02-26 00:00:00'), Timestamp('2020-02-27 00:00:00'), Timestamp('2020-02-28 00:00:00'), Timestamp('2020-03-02 00:00:00'), Timestamp('2020-03-03 00:00:00'), Timestamp('2020-03-04 00:00:00'), Timestamp('2020-03-05 00:00:00'), Timestamp('2020-03-06 00:00:00'), Timestamp('2020-03-09 00:00:00'), Timestamp('2020-03-10 00:00:00'), Timestamp('2020-03-11 00:00:00'), Timestamp('2020-03-12 00:00:00'), Timestamp('2020-03-13 00:00:00'), Timestamp('2020-03-16 00:00:00'), Timestamp('2020-03-17 00:00:00'), Timestamp('2020-03-18 00:00:00'), Timestamp('2020-03-19 00:00:00'), Timestamp('2020-03-20 00:00:00'), Timestamp('2020-03-23 00:00:00'), Timestamp('2020-03-24 00:00:00'), Timestamp('2020-03-25 00:00:00'), Timestamp('2020-03-26 00:00:00'), Timestamp('2020-03-27 00:00:00'), Timestamp('2020-03-30 00:00:00'), Timestamp('2020-03-31 00:00:00'), Timestamp('2020-04-01 00:00:00'), Timestamp('2020-04-02 00:00:00'), Timestamp('2020-04-03 00:00:00'), Timestamp('2020-04-06 00:00:00'), Timestamp('2020-04-07 00:00:00'), Timestamp('2020-04-08 00:00:00'), Timestamp('2020-04-09 00:00:00'), Timestamp('2020-04-10 00:00:00'), Timestamp('2020-04-13 00:00:00'), Timestamp('2020-04-14 00:00:00'), Timestamp('2020-04-15 00:00:00'), Timestamp('2020-04-16 00:00:00'), Timestamp('2020-04-17 00:00:00'), Timestamp('2020-04-20 00:00:00'), Timestamp('2020-04-21 00:00:00'), Timestamp('2020-04-22 00:00:00'), Timestamp('2020-04-23 00:00:00'), Timestamp('2020-04-24 00:00:00'), Timestamp('2020-04-27 00:00:00'), Timestamp('2020-04-28 00:00:00'), Timestamp('2020-04-29 00:00:00'), Timestamp('2020-04-30 00:00:00'), Timestamp('2020-05-01 00:00:00'), Timestamp('2020-05-04 00:00:00'), Timestamp('2020-05-05 00:00:00'), Timestamp('2020-05-06 00:00:00'), Timestamp('2020-05-07 00:00:00'), Timestamp('2020-05-08 00:00:00'), Timestamp('2020-05-11 00:00:00'), Timestamp('2020-05-12 00:00:00'), Timestamp('2020-05-13 00:00:00'), Timestamp('2020-05-14 00:00:00'), Timestamp('2020-05-15 00:00:00'), Timestamp('2020-05-18 00:00:00'), Timestamp('2020-05-19 00:00:00'), Timestamp('2020-05-20 00:00:00'), Timestamp('2020-05-21 00:00:00'), Timestamp('2020-05-22 00:00:00'), Timestamp('2020-05-25 00:00:00'), Timestamp('2020-05-26 00:00:00'), Timestamp('2020-05-27 00:00:00'), Timestamp('2020-05-28 00:00:00'), Timestamp('2020-05-29 00:00:00'), Timestamp('2020-06-01 00:00:00'), Timestamp('2020-06-02 00:00:00'), Timestamp('2020-06-03 00:00:00'), Timestamp('2020-06-04 00:00:00'), Timestamp('2020-06-05 00:00:00'), Timestamp('2020-06-08 00:00:00'), Timestamp('2020-06-09 00:00:00'), Timestamp('2020-06-10 00:00:00'), Timestamp('2020-06-11 00:00:00'), Timestamp('2020-06-12 00:00:00'), Timestamp('2020-06-15 00:00:00'), Timestamp('2020-06-16 00:00:00'), Timestamp('2020-06-17 00:00:00'), Timestamp('2020-06-18 00:00:00'), Timestamp('2020-06-19 00:00:00'), Timestamp('2020-06-22 00:00:00'), Timestamp('2020-06-23 00:00:00'), Timestamp('2020-06-24 00:00:00'), Timestamp('2020-06-25 00:00:00'), Timestamp('2020-06-26 00:00:00'), Timestamp('2020-06-29 00:00:00'), Timestamp('2020-06-30 00:00:00'), Timestamp('2020-07-01 00:00:00'), Timestamp('2020-07-02 00:00:00'), Timestamp('2020-07-03 00:00:00'), Timestamp('2020-07-06 00:00:00'), Timestamp('2020-07-07 00:00:00'), Timestamp('2020-07-08 00:00:00'), Timestamp('2020-07-09 00:00:00'), Timestamp('2020-07-10 00:00:00'), Timestamp('2020-07-13 00:00:00'), Timestamp('2020-07-14 00:00:00'), Timestamp('2020-07-15 00:00:00'), Timestamp('2020-07-16 00:00:00'), Timestamp('2020-07-17 00:00:00'), Timestamp('2020-07-20 00:00:00'), Timestamp('2020-07-21 00:00:00'), Timestamp('2020-07-22 00:00:00'), Timestamp('2020-07-23 00:00:00'), Timestamp('2020-07-24 00:00:00'), Timestamp('2020-07-27 00:00:00'), Timestamp('2020-07-28 00:00:00'), Timestamp('2020-07-29 00:00:00'), Timestamp('2020-07-30 00:00:00'), Timestamp('2020-07-31 00:00:00'), Timestamp('2020-08-03 00:00:00'), Timestamp('2020-08-04 00:00:00'), Timestamp('2020-08-05 00:00:00'), Timestamp('2020-08-06 00:00:00'), Timestamp('2020-08-07 00:00:00'), Timestamp('2020-08-10 00:00:00'), Timestamp('2020-08-11 00:00:00'), Timestamp('2020-08-12 00:00:00'), Timestamp('2020-08-13 00:00:00'), Timestamp('2020-08-14 00:00:00'), Timestamp('2020-08-17 00:00:00'), Timestamp('2020-08-18 00:00:00'), Timestamp('2020-08-19 00:00:00'), Timestamp('2020-08-20 00:00:00'), Timestamp('2020-08-21 00:00:00'), Timestamp('2020-08-24 00:00:00'), Timestamp('2020-08-25 00:00:00'), Timestamp('2020-08-26 00:00:00'), Timestamp('2020-08-27 00:00:00'), Timestamp('2020-08-28 00:00:00'), Timestamp('2020-08-31 00:00:00'), Timestamp('2020-09-01 00:00:00'), Timestamp('2020-09-02 00:00:00'), Timestamp('2020-09-03 00:00:00'), Timestamp('2020-09-04 00:00:00'), Timestamp('2020-09-07 00:00:00'), Timestamp('2020-09-08 00:00:00'), Timestamp('2020-09-09 00:00:00'), Timestamp('2020-09-10 00:00:00'), Timestamp('2020-09-11 00:00:00'), Timestamp('2020-09-14 00:00:00'), Timestamp('2020-09-15 00:00:00'), Timestamp('2020-09-16 00:00:00'), Timestamp('2020-09-17 00:00:00'), Timestamp('2020-09-18 00:00:00'), Timestamp('2020-09-21 00:00:00'), Timestamp('2020-09-22 00:00:00'), Timestamp('2020-09-23 00:00:00'), Timestamp('2020-09-24 00:00:00'), Timestamp('2020-09-25 00:00:00'), Timestamp('2020-09-28 00:00:00'), Timestamp('2020-09-29 00:00:00'), Timestamp('2020-09-30 00:00:00'), Timestamp('2020-10-01 00:00:00'), Timestamp('2020-10-02 00:00:00'), Timestamp('2020-10-05 00:00:00'), Timestamp('2020-10-06 00:00:00'), Timestamp('2020-10-07 00:00:00'), Timestamp('2020-10-08 00:00:00'), Timestamp('2020-10-09 00:00:00'), Timestamp('2020-10-12 00:00:00'), Timestamp('2020-10-13 00:00:00'), Timestamp('2020-10-14 00:00:00'), Timestamp('2020-10-15 00:00:00'), Timestamp('2020-10-16 00:00:00'), Timestamp('2020-10-19 00:00:00'), Timestamp('2020-10-20 00:00:00'), Timestamp('2020-10-21 00:00:00'), Timestamp('2020-10-22 00:00:00'), Timestamp('2020-10-23 00:00:00'), Timestamp('2020-10-26 00:00:00'), Timestamp('2020-10-27 00:00:00'), Timestamp('2020-10-28 00:00:00'), Timestamp('2020-10-29 00:00:00'), Timestamp('2020-10-30 00:00:00'), Timestamp('2020-11-02 00:00:00'), Timestamp('2020-11-03 00:00:00'), Timestamp('2020-11-04 00:00:00'), Timestamp('2020-11-05 00:00:00'), Timestamp('2020-11-06 00:00:00'), Timestamp('2020-11-09 00:00:00'), Timestamp('2020-11-10 00:00:00'), Timestamp('2020-11-11 00:00:00'), Timestamp('2020-11-12 00:00:00'), Timestamp('2020-11-13 00:00:00'), Timestamp('2020-11-16 00:00:00'), Timestamp('2020-11-17 00:00:00'), Timestamp('2020-11-18 00:00:00'), Timestamp('2020-11-19 00:00:00'), Timestamp('2020-11-20 00:00:00'), Timestamp('2020-11-23 00:00:00'), Timestamp('2020-11-24 00:00:00'), Timestamp('2020-11-25 00:00:00'), Timestamp('2020-11-26 00:00:00'), Timestamp('2020-11-27 00:00:00'), Timestamp('2020-11-30 00:00:00'), Timestamp('2020-12-01 00:00:00'), Timestamp('2020-12-02 00:00:00'), Timestamp('2020-12-03 00:00:00'), Timestamp('2020-12-04 00:00:00'), Timestamp('2020-12-07 00:00:00'), Timestamp('2020-12-08 00:00:00'), Timestamp('2020-12-09 00:00:00'), Timestamp('2020-12-10 00:00:00'), Timestamp('2020-12-11 00:00:00'), Timestamp('2020-12-14 00:00:00'), Timestamp('2020-12-15 00:00:00'), Timestamp('2020-12-16 00:00:00'), Timestamp('2020-12-17 00:00:00'), Timestamp('2020-12-18 00:00:00'), Timestamp('2020-12-21 00:00:00'), Timestamp('2020-12-22 00:00:00'), Timestamp('2020-12-23 00:00:00'), Timestamp('2020-12-24 00:00:00'), Timestamp('2020-12-25 00:00:00'), Timestamp('2020-12-28 00:00:00'), Timestamp('2020-12-29 00:00:00'), Timestamp('2020-12-30 00:00:00'), Timestamp('2020-12-31 00:00:00'), Timestamp('2021-01-01 00:00:00'), Timestamp('2021-01-04 00:00:00'), Timestamp('2021-01-05 00:00:00'), Timestamp('2021-01-06 00:00:00'), Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00'), Timestamp('2021-01-11 00:00:00'), Timestamp('2021-01-12 00:00:00'), Timestamp('2021-01-13 00:00:00'), Timestamp('2021-01-14 00:00:00'), Timestamp('2021-01-15 00:00:00'), Timestamp('2021-01-18 00:00:00'), Timestamp('2021-01-19 00:00:00'), Timestamp('2021-01-20 00:00:00'), Timestamp('2021-01-21 00:00:00'), Timestamp('2021-01-22 00:00:00'), Timestamp('2021-01-25 00:00:00'), Timestamp('2021-01-26 00:00:00'), Timestamp('2021-01-27 00:00:00'), Timestamp('2021-01-28 00:00:00'), Timestamp('2021-01-29 00:00:00'), Timestamp('2021-02-01 00:00:00'), Timestamp('2021-02-02 00:00:00'), Timestamp('2021-02-03 00:00:00'), Timestamp('2021-02-04 00:00:00'), Timestamp('2021-02-05 00:00:00'), Timestamp('2021-02-08 00:00:00'), Timestamp('2021-02-09 00:00:00'), Timestamp('2021-02-10 00:00:00'), Timestamp('2021-02-11 00:00:00'), Timestamp('2021-02-12 00:00:00'), Timestamp('2021-02-15 00:00:00'), Timestamp('2021-02-16 00:00:00'), Timestamp('2021-02-17 00:00:00'), Timestamp('2021-02-18 00:00:00'), Timestamp('2021-02-19 00:00:00'), Timestamp('2021-02-22 00:00:00'), Timestamp('2021-02-23 00:00:00'), Timestamp('2021-02-24 00:00:00'), Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-03-01 00:00:00'), Timestamp('2021-03-02 00:00:00'), Timestamp('2021-03-03 00:00:00'), Timestamp('2021-03-04 00:00:00'), Timestamp('2021-03-05 00:00:00'), Timestamp('2021-03-08 00:00:00'), Timestamp('2021-03-09 00:00:00'), Timestamp('2021-03-10 00:00:00'), Timestamp('2021-03-11 00:00:00'), Timestamp('2021-03-12 00:00:00'), Timestamp('2021-03-15 00:00:00'), Timestamp('2021-03-16 00:00:00'), Timestamp('2021-03-17 00:00:00'), Timestamp('2021-03-18 00:00:00'), Timestamp('2021-03-19 00:00:00'), Timestamp('2021-03-22 00:00:00'), Timestamp('2021-03-23 00:00:00'), Timestamp('2021-03-24 00:00:00'), Timestamp('2021-03-25 00:00:00'), Timestamp('2021-03-26 00:00:00'), Timestamp('2021-03-29 00:00:00'), Timestamp('2021-03-30 00:00:00'), Timestamp('2021-03-31 00:00:00'), Timestamp('2021-04-01 00:00:00'), Timestamp('2021-04-02 00:00:00'), Timestamp('2021-04-05 00:00:00'), Timestamp('2021-04-06 00:00:00'), Timestamp('2021-04-07 00:00:00'), Timestamp('2021-04-08 00:00:00'), Timestamp('2021-04-09 00:00:00'), Timestamp('2021-04-12 00:00:00'), Timestamp('2021-04-13 00:00:00'), Timestamp('2021-04-14 00:00:00'), Timestamp('2021-04-15 00:00:00'), Timestamp('2021-04-16 00:00:00'), Timestamp('2021-04-19 00:00:00'), Timestamp('2021-04-20 00:00:00'), Timestamp('2021-04-21 00:00:00'), Timestamp('2021-04-22 00:00:00'), Timestamp('2021-04-23 00:00:00'), Timestamp('2021-04-26 00:00:00'), Timestamp('2021-04-27 00:00:00'), Timestamp('2021-04-28 00:00:00'), Timestamp('2021-04-29 00:00:00'), Timestamp('2021-04-30 00:00:00'), Timestamp('2021-05-03 00:00:00'), Timestamp('2021-05-04 00:00:00'), Timestamp('2021-05-05 00:00:00'), Timestamp('2021-05-06 00:00:00'), Timestamp('2021-05-07 00:00:00'), Timestamp('2021-05-10 00:00:00'), Timestamp('2021-05-11 00:00:00'), Timestamp('2021-05-12 00:00:00'), Timestamp('2021-05-13 00:00:00'), Timestamp('2021-05-14 00:00:00'), Timestamp('2021-05-17 00:00:00'), Timestamp('2021-05-18 00:00:00'), Timestamp('2021-05-19 00:00:00'), Timestamp('2021-05-20 00:00:00'), Timestamp('2021-05-21 00:00:00'), Timestamp('2021-05-24 00:00:00'), Timestamp('2021-05-25 00:00:00'), Timestamp('2021-05-26 00:00:00'), Timestamp('2021-05-27 00:00:00'), Timestamp('2021-05-28 00:00:00'), Timestamp('2021-05-31 00:00:00'), Timestamp('2021-06-01 00:00:00'), Timestamp('2021-06-02 00:00:00'), Timestamp('2021-06-03 00:00:00'), Timestamp('2021-06-04 00:00:00'), Timestamp('2021-06-07 00:00:00'), Timestamp('2021-06-08 00:00:00'), Timestamp('2021-06-09 00:00:00'), Timestamp('2021-06-10 00:00:00'), Timestamp('2021-06-11 00:00:00'), Timestamp('2021-06-14 00:00:00'), Timestamp('2021-06-15 00:00:00'), Timestamp('2021-06-16 00:00:00'), Timestamp('2021-06-17 00:00:00'), Timestamp('2021-06-18 00:00:00'), Timestamp('2021-06-21 00:00:00'), Timestamp('2021-06-22 00:00:00'), Timestamp('2021-06-23 00:00:00'), Timestamp('2021-06-24 00:00:00'), Timestamp('2021-06-25 00:00:00'), Timestamp('2021-06-28 00:00:00'), Timestamp('2021-06-29 00:00:00'), Timestamp('2021-06-30 00:00:00'), Timestamp('2021-07-01 00:00:00'), Timestamp('2021-07-02 00:00:00'), Timestamp('2021-07-05 00:00:00'), Timestamp('2021-07-06 00:00:00'), Timestamp('2021-07-07 00:00:00'), Timestamp('2021-07-08 00:00:00'), Timestamp('2021-07-09 00:00:00'), Timestamp('2021-07-12 00:00:00'), Timestamp('2021-07-13 00:00:00'), Timestamp('2021-07-14 00:00:00'), Timestamp('2021-07-15 00:00:00'), Timestamp('2021-07-16 00:00:00'), Timestamp('2021-07-19 00:00:00'), Timestamp('2021-07-20 00:00:00'), Timestamp('2021-07-21 00:00:00'), Timestamp('2021-07-22 00:00:00'), Timestamp('2021-07-23 00:00:00'), Timestamp('2021-07-26 00:00:00'), Timestamp('2021-07-27 00:00:00'), Timestamp('2021-07-28 00:00:00'), Timestamp('2021-07-29 00:00:00'), Timestamp('2021-07-30 00:00:00'), Timestamp('2021-08-02 00:00:00'), Timestamp('2021-08-03 00:00:00'), Timestamp('2021-08-04 00:00:00'), Timestamp('2021-08-05 00:00:00'), Timestamp('2021-08-06 00:00:00'), Timestamp('2021-08-09 00:00:00'), Timestamp('2021-08-10 00:00:00'), Timestamp('2021-08-11 00:00:00'), Timestamp('2021-08-12 00:00:00'), Timestamp('2021-08-13 00:00:00'), Timestamp('2021-08-16 00:00:00'), Timestamp('2021-08-17 00:00:00'), Timestamp('2021-08-18 00:00:00'), Timestamp('2021-08-19 00:00:00'), Timestamp('2021-08-20 00:00:00'), Timestamp('2021-08-23 00:00:00'), Timestamp('2021-08-24 00:00:00'), Timestamp('2021-08-25 00:00:00'), Timestamp('2021-08-26 00:00:00'), Timestamp('2021-08-27 00:00:00'), Timestamp('2021-08-30 00:00:00'), Timestamp('2021-08-31 00:00:00'), Timestamp('2021-09-01 00:00:00'), Timestamp('2021-09-02 00:00:00'), Timestamp('2021-09-03 00:00:00'), Timestamp('2021-09-06 00:00:00'), Timestamp('2021-09-07 00:00:00'), Timestamp('2021-09-08 00:00:00'), Timestamp('2021-09-09 00:00:00'), Timestamp('2021-09-10 00:00:00'), Timestamp('2021-09-13 00:00:00'), Timestamp('2021-09-14 00:00:00'), Timestamp('2021-09-15 00:00:00'), Timestamp('2021-09-16 00:00:00'), Timestamp('2021-09-17 00:00:00'), Timestamp('2021-09-20 00:00:00'), Timestamp('2021-09-21 00:00:00'), Timestamp('2021-09-22 00:00:00'), Timestamp('2021-09-23 00:00:00'), Timestamp('2021-09-24 00:00:00'), Timestamp('2021-09-27 00:00:00'), Timestamp('2021-09-28 00:00:00'), Timestamp('2021-09-29 00:00:00'), Timestamp('2021-09-30 00:00:00'), Timestamp('2021-10-01 00:00:00'), Timestamp('2021-10-04 00:00:00'), Timestamp('2021-10-05 00:00:00'), Timestamp('2021-10-06 00:00:00'), Timestamp('2021-10-07 00:00:00'), Timestamp('2021-10-08 00:00:00'), Timestamp('2021-10-11 00:00:00'), Timestamp('2021-10-12 00:00:00'), Timestamp('2021-10-13 00:00:00'), Timestamp('2021-10-14 00:00:00'), Timestamp('2021-10-15 00:00:00'), Timestamp('2021-10-18 00:00:00'), Timestamp('2021-10-19 00:00:00'), Timestamp('2021-10-20 00:00:00'), Timestamp('2021-10-21 00:00:00'), Timestamp('2021-10-22 00:00:00'), Timestamp('2021-10-25 00:00:00'), Timestamp('2021-10-26 00:00:00'), Timestamp('2021-10-27 00:00:00'), Timestamp('2021-10-28 00:00:00'), Timestamp('2021-10-29 00:00:00'), Timestamp('2021-11-01 00:00:00'), Timestamp('2021-11-02 00:00:00'), Timestamp('2021-11-03 00:00:00'), Timestamp('2021-11-04 00:00:00'), Timestamp('2021-11-05 00:00:00'), Timestamp('2021-11-08 00:00:00'), Timestamp('2021-11-09 00:00:00'), Timestamp('2021-11-10 00:00:00'), Timestamp('2021-11-11 00:00:00'), Timestamp('2021-11-12 00:00:00'), Timestamp('2021-11-15 00:00:00'), Timestamp('2021-11-16 00:00:00'), Timestamp('2021-11-17 00:00:00'), Timestamp('2021-11-18 00:00:00'), Timestamp('2021-11-19 00:00:00'), Timestamp('2021-11-22 00:00:00'), Timestamp('2021-11-23 00:00:00'), Timestamp('2021-11-24 00:00:00'), Timestamp('2021-11-25 00:00:00'), Timestamp('2021-11-26 00:00:00'), Timestamp('2021-11-29 00:00:00'), Timestamp('2021-11-30 00:00:00'), Timestamp('2021-12-01 00:00:00'), Timestamp('2021-12-02 00:00:00'), Timestamp('2021-12-03 00:00:00'), Timestamp('2021-12-06 00:00:00'), Timestamp('2021-12-07 00:00:00'), Timestamp('2021-12-08 00:00:00'), Timestamp('2021-12-09 00:00:00'), Timestamp('2021-12-10 00:00:00'), Timestamp('2021-12-13 00:00:00'), Timestamp('2021-12-14 00:00:00'), Timestamp('2021-12-15 00:00:00'), Timestamp('2021-12-16 00:00:00'), Timestamp('2021-12-17 00:00:00'), Timestamp('2021-12-20 00:00:00'), Timestamp('2021-12-21 00:00:00'), Timestamp('2021-12-22 00:00:00'), Timestamp('2021-12-23 00:00:00'), Timestamp('2021-12-24 00:00:00'), Timestamp('2021-12-27 00:00:00'), Timestamp('2021-12-28 00:00:00'), Timestamp('2021-12-29 00:00:00'), Timestamp('2021-12-30 00:00:00'), Timestamp('2021-12-31 00:00:00'), Timestamp('2022-01-03 00:00:00'), Timestamp('2022-01-04 00:00:00'), Timestamp('2022-01-05 00:00:00'), Timestamp('2022-01-06 00:00:00'), Timestamp('2022-01-07 00:00:00'), Timestamp('2022-01-10 00:00:00'), Timestamp('2022-01-11 00:00:00'), Timestamp('2022-01-12 00:00:00'), Timestamp('2022-01-13 00:00:00'), Timestamp('2022-01-14 00:00:00'), Timestamp('2022-01-17 00:00:00'), Timestamp('2022-01-18 00:00:00'), Timestamp('2022-01-19 00:00:00'), Timestamp('2022-01-20 00:00:00'), Timestamp('2022-01-21 00:00:00'), Timestamp('2022-01-24 00:00:00'), Timestamp('2022-01-25 00:00:00'), Timestamp('2022-01-26 00:00:00'), Timestamp('2022-01-27 00:00:00'), Timestamp('2022-01-28 00:00:00'), Timestamp('2022-01-31 00:00:00'), Timestamp('2022-02-01 00:00:00'), Timestamp('2022-02-02 00:00:00'), Timestamp('2022-02-03 00:00:00'), Timestamp('2022-02-04 00:00:00'), Timestamp('2022-02-07 00:00:00'), Timestamp('2022-02-08 00:00:00'), Timestamp('2022-02-09 00:00:00'), Timestamp('2022-02-10 00:00:00'), Timestamp('2022-02-11 00:00:00'), Timestamp('2022-02-14 00:00:00'), Timestamp('2022-02-15 00:00:00'), Timestamp('2022-02-16 00:00:00'), Timestamp('2022-02-17 00:00:00'), Timestamp('2022-02-18 00:00:00'), Timestamp('2022-02-21 00:00:00'), Timestamp('2022-02-22 00:00:00'), Timestamp('2022-02-23 00:00:00'), Timestamp('2022-02-24 00:00:00'), Timestamp('2022-02-25 00:00:00'), Timestamp('2022-02-28 00:00:00'), Timestamp('2022-03-01 00:00:00'), Timestamp('2022-03-02 00:00:00'), Timestamp('2022-03-03 00:00:00'), Timestamp('2022-03-04 00:00:00'), Timestamp('2022-03-07 00:00:00'), Timestamp('2022-03-08 00:00:00'), Timestamp('2022-03-09 00:00:00'), Timestamp('2022-03-10 00:00:00'), Timestamp('2022-03-11 00:00:00'), Timestamp('2022-03-14 00:00:00'), Timestamp('2022-03-15 00:00:00'), Timestamp('2022-03-16 00:00:00'), Timestamp('2022-03-17 00:00:00'), Timestamp('2022-03-18 00:00:00'), Timestamp('2022-03-21 00:00:00'), Timestamp('2022-03-22 00:00:00'), Timestamp('2022-03-23 00:00:00'), Timestamp('2022-03-24 00:00:00'), Timestamp('2022-03-25 00:00:00'), Timestamp('2022-03-28 00:00:00'), Timestamp('2022-03-29 00:00:00'), Timestamp('2022-03-30 00:00:00'), Timestamp('2022-03-31 00:00:00'), Timestamp('2022-04-01 00:00:00'), Timestamp('2022-04-04 00:00:00'), Timestamp('2022-04-05 00:00:00'), Timestamp('2022-04-06 00:00:00'), Timestamp('2022-04-07 00:00:00'), Timestamp('2022-04-08 00:00:00'), Timestamp('2022-04-11 00:00:00'), Timestamp('2022-04-12 00:00:00'), Timestamp('2022-04-13 00:00:00'), Timestamp('2022-04-14 00:00:00'), Timestamp('2022-04-15 00:00:00'), Timestamp('2022-04-18 00:00:00'), Timestamp('2022-04-19 00:00:00'), Timestamp('2022-04-20 00:00:00'), Timestamp('2022-04-21 00:00:00'), Timestamp('2022-04-22 00:00:00'), Timestamp('2022-04-25 00:00:00'), Timestamp('2022-04-26 00:00:00'), Timestamp('2022-04-27 00:00:00'), Timestamp('2022-04-28 00:00:00'), Timestamp('2022-04-29 00:00:00'), Timestamp('2022-05-02 00:00:00'), Timestamp('2022-05-03 00:00:00'), Timestamp('2022-05-04 00:00:00'), Timestamp('2022-05-05 00:00:00'), Timestamp('2022-05-06 00:00:00'), Timestamp('2022-05-09 00:00:00'), Timestamp('2022-05-10 00:00:00'), Timestamp('2022-05-11 00:00:00'), Timestamp('2022-05-12 00:00:00'), Timestamp('2022-05-13 00:00:00'), Timestamp('2022-05-16 00:00:00'), Timestamp('2022-05-17 00:00:00'), Timestamp('2022-05-18 00:00:00'), Timestamp('2022-05-19 00:00:00'), Timestamp('2022-05-20 00:00:00'), Timestamp('2022-05-23 00:00:00'), Timestamp('2022-05-24 00:00:00'), Timestamp('2022-05-25 00:00:00'), Timestamp('2022-05-26 00:00:00'), Timestamp('2022-05-27 00:00:00'), Timestamp('2022-05-30 00:00:00'), Timestamp('2022-05-31 00:00:00'), Timestamp('2022-06-01 00:00:00'), Timestamp('2022-06-02 00:00:00'), Timestamp('2022-06-03 00:00:00'), Timestamp('2022-06-06 00:00:00'), Timestamp('2022-06-07 00:00:00'), Timestamp('2022-06-08 00:00:00'), Timestamp('2022-06-09 00:00:00'), Timestamp('2022-06-10 00:00:00'), Timestamp('2022-06-13 00:00:00'), Timestamp('2022-06-14 00:00:00'), Timestamp('2022-06-15 00:00:00'), Timestamp('2022-06-16 00:00:00'), Timestamp('2022-06-17 00:00:00'), Timestamp('2022-06-20 00:00:00'), Timestamp('2022-06-21 00:00:00'), Timestamp('2022-06-22 00:00:00'), Timestamp('2022-06-23 00:00:00'), Timestamp('2022-06-24 00:00:00'), Timestamp('2022-06-27 00:00:00'), Timestamp('2022-06-28 00:00:00'), Timestamp('2022-06-29 00:00:00'), Timestamp('2022-06-30 00:00:00'), Timestamp('2022-07-01 00:00:00'), Timestamp('2022-07-04 00:00:00'), Timestamp('2022-07-05 00:00:00'), Timestamp('2022-07-06 00:00:00'), Timestamp('2022-07-07 00:00:00'), Timestamp('2022-07-08 00:00:00'), Timestamp('2022-07-11 00:00:00'), Timestamp('2022-07-12 00:00:00'), Timestamp('2022-07-13 00:00:00'), Timestamp('2022-07-14 00:00:00'), Timestamp('2022-07-15 00:00:00'), Timestamp('2022-07-18 00:00:00'), Timestamp('2022-07-19 00:00:00'), Timestamp('2022-07-20 00:00:00'), Timestamp('2022-07-21 00:00:00'), Timestamp('2022-07-22 00:00:00'), Timestamp('2022-07-25 00:00:00'), Timestamp('2022-07-26 00:00:00'), Timestamp('2022-07-27 00:00:00'), Timestamp('2022-07-28 00:00:00'), Timestamp('2022-07-29 00:00:00'), Timestamp('2022-08-01 00:00:00'), Timestamp('2022-08-02 00:00:00'), Timestamp('2022-08-03 00:00:00'), Timestamp('2022-08-04 00:00:00'), Timestamp('2022-08-05 00:00:00'), Timestamp('2022-08-08 00:00:00'), Timestamp('2022-08-09 00:00:00'), Timestamp('2022-08-10 00:00:00'), Timestamp('2022-08-11 00:00:00'), Timestamp('2022-08-12 00:00:00'), Timestamp('2022-08-15 00:00:00'), Timestamp('2022-08-16 00:00:00'), Timestamp('2022-08-17 00:00:00'), Timestamp('2022-08-18 00:00:00'), Timestamp('2022-08-19 00:00:00'), Timestamp('2022-08-22 00:00:00'), Timestamp('2022-08-23 00:00:00'), Timestamp('2022-08-24 00:00:00'), Timestamp('2022-08-25 00:00:00'), Timestamp('2022-08-26 00:00:00'), Timestamp('2022-08-29 00:00:00'), Timestamp('2022-08-30 00:00:00'), Timestamp('2022-08-31 00:00:00'), Timestamp('2022-09-01 00:00:00'), Timestamp('2022-09-02 00:00:00'), Timestamp('2022-09-05 00:00:00'), Timestamp('2022-09-06 00:00:00'), Timestamp('2022-09-07 00:00:00'), Timestamp('2022-09-08 00:00:00'), Timestamp('2022-09-09 00:00:00'), Timestamp('2022-09-12 00:00:00'), Timestamp('2022-09-13 00:00:00'), Timestamp('2022-09-14 00:00:00'), Timestamp('2022-09-15 00:00:00'), Timestamp('2022-09-16 00:00:00'), Timestamp('2022-09-19 00:00:00'), Timestamp('2022-09-20 00:00:00'), Timestamp('2022-09-21 00:00:00'), Timestamp('2022-09-22 00:00:00'), Timestamp('2022-09-23 00:00:00'), Timestamp('2022-09-26 00:00:00'), Timestamp('2022-09-27 00:00:00'), Timestamp('2022-09-28 00:00:00'), Timestamp('2022-09-29 00:00:00'), Timestamp('2022-09-30 00:00:00'), Timestamp('2022-10-03 00:00:00'), Timestamp('2022-10-04 00:00:00'), Timestamp('2022-10-05 00:00:00'), Timestamp('2022-10-06 00:00:00'), Timestamp('2022-10-07 00:00:00'), Timestamp('2022-10-10 00:00:00'), Timestamp('2022-10-11 00:00:00'), Timestamp('2022-10-12 00:00:00'), Timestamp('2022-10-13 00:00:00'), Timestamp('2022-10-14 00:00:00'), Timestamp('2022-10-17 00:00:00'), Timestamp('2022-10-18 00:00:00'), Timestamp('2022-10-19 00:00:00'), Timestamp('2022-10-20 00:00:00'), Timestamp('2022-10-21 00:00:00'), Timestamp('2022-10-24 00:00:00'), Timestamp('2022-10-25 00:00:00'), Timestamp('2022-10-26 00:00:00'), Timestamp('2022-10-27 00:00:00'), Timestamp('2022-10-28 00:00:00'), Timestamp('2022-10-31 00:00:00'), Timestamp('2022-11-01 00:00:00'), Timestamp('2022-11-02 00:00:00'), Timestamp('2022-11-03 00:00:00'), Timestamp('2022-11-04 00:00:00'), Timestamp('2022-11-07 00:00:00'), Timestamp('2022-11-08 00:00:00'), Timestamp('2022-11-09 00:00:00'), Timestamp('2022-11-10 00:00:00'), Timestamp('2022-11-11 00:00:00'), Timestamp('2022-11-14 00:00:00'), Timestamp('2022-11-15 00:00:00'), Timestamp('2022-11-16 00:00:00'), Timestamp('2022-11-17 00:00:00'), Timestamp('2022-11-18 00:00:00'), Timestamp('2022-11-21 00:00:00'), Timestamp('2022-11-22 00:00:00'), Timestamp('2022-11-23 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2022-11-25 00:00:00'), Timestamp('2022-11-28 00:00:00'), Timestamp('2022-11-29 00:00:00'), Timestamp('2022-11-30 00:00:00'), Timestamp('2022-12-01 00:00:00'), Timestamp('2022-12-02 00:00:00'), Timestamp('2022-12-05 00:00:00'), Timestamp('2022-12-06 00:00:00'), Timestamp('2022-12-07 00:00:00'), Timestamp('2022-12-08 00:00:00'), Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-12 00:00:00'), Timestamp('2022-12-13 00:00:00'), Timestamp('2022-12-14 00:00:00'), Timestamp('2022-12-15 00:00:00'), Timestamp('2022-12-16 00:00:00'), Timestamp('2022-12-19 00:00:00'), Timestamp('2022-12-20 00:00:00'), Timestamp('2022-12-21 00:00:00'), Timestamp('2022-12-22 00:00:00'), Timestamp('2022-12-23 00:00:00'), Timestamp('2022-12-26 00:00:00'), Timestamp('2022-12-27 00:00:00'), Timestamp('2022-12-28 00:00:00'), Timestamp('2022-12-29 00:00:00'), Timestamp('2022-12-30 00:00:00')] not in index"

In [94]:
display(y)
display(X)


date
2018-01-02    347.24
2018-01-03    355.10
2018-01-04    356.23
2018-01-05    362.22
2018-01-08    365.45
               ...  
2019-08-05    902.45
2019-08-06    895.73
2019-08-07    897.43
2019-08-08    895.06
2019-08-09    871.58
Name: EMBI, Length: 400, dtype: float64

Unnamed: 0_level_0,const,year,month,day,WC,Funct,TotPron,PronPer,Yo,Nosotro,...,Logro,Placer,Hogar,Dinero,Relig,Muerte,Asentir,NoFluen,Relleno,Obs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,1.0,2018.0,1.0,1.0,75.200000,32.133333,10.933333,5.800000,0.866667,0.133333,...,0.466667,0.333333,0.000000,0.533333,0.066667,0.000000,0.200000,0.000000,0.0,15
2018-01-02,1.0,2018.0,1.0,2.0,44.258621,20.517241,6.758621,3.534483,0.551724,0.068966,...,0.448276,0.155172,0.000000,0.137931,0.086207,0.068966,0.189655,0.000000,0.0,58
2018-01-03,1.0,2018.0,1.0,3.0,48.416667,21.236111,6.263889,3.569444,0.402778,0.263889,...,0.652778,0.625000,0.097222,0.250000,0.041667,0.069444,0.083333,0.000000,0.0,72
2018-01-04,1.0,2018.0,1.0,4.0,51.303571,21.607143,5.732143,3.285714,0.375000,0.071429,...,0.410714,0.500000,0.035714,0.232143,0.089286,0.017857,0.125000,0.000000,0.0,56
2018-01-05,1.0,2018.0,1.0,5.0,84.810811,35.770270,10.337838,5.148649,1.135135,0.081081,...,1.013514,0.243243,0.054054,0.324324,0.040541,0.067568,0.189189,0.000000,0.0,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-26,1.0,2022.0,12.0,26.0,61.473684,27.842105,8.368421,4.947368,0.421053,0.105263,...,0.894737,0.315789,0.000000,0.526316,0.105263,0.052632,0.105263,0.105263,0.0,19
2022-12-27,1.0,2022.0,12.0,27.0,67.119048,29.261905,8.666667,5.238095,0.833333,0.142857,...,0.738095,0.595238,0.428571,0.642857,0.023810,0.119048,0.285714,0.023810,0.0,42
2022-12-28,1.0,2022.0,12.0,28.0,120.071429,27.428571,9.428571,5.071429,0.571429,0.000000,...,0.785714,0.857143,0.071429,1.142857,0.000000,0.214286,0.142857,0.000000,0.0,14
2022-12-29,1.0,2022.0,12.0,29.0,14.454545,5.272727,1.954545,1.000000,0.090909,0.045455,...,0.181818,0.045455,0.000000,0.136364,0.000000,0.000000,0.045455,0.000000,0.0,22


In [90]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
print(model.summary())
