In [185]:
import pandas as pd
import numpy as np
import ast
from ast import literal_eval

In [186]:
import os
import glob

def get_csv_filenames(folder_path):
    """Return the base names of the ``*.csv`` files directly inside a folder.

    Args:
        folder_path: Directory to search. The search is not recursive.

    Returns:
        A list of file names (without the directory part), sorted
        alphabetically. The list is empty when nothing matches.
    """
    file_pattern = os.path.join(folder_path, "*.csv")

    # glob yields matches in an arbitrary, filesystem-dependent order; sorting
    # keeps the downstream row order identical from one run/machine to the next.
    return sorted(os.path.basename(file_path) for file_path in glob.glob(file_pattern))

In [187]:
# Load every CSV found in the data folder and stack them into one frame.
DATA_DIR = 'Data'
files = get_csv_filenames(DATA_DIR)

# Read all files first and concatenate once. Calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration and needs an empty
# DataFrame as a seed; a single concat over the list avoids both.
frames = [pd.read_csv(os.path.join(DATA_DIR, file)) for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# folder holds no CSV files (the same result the loop version produced).
dfs = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [188]:
dfs

Unnamed: 0.1,Unnamed: 0,Sentence,Sentence Encodings
0,0,Delta's ongoing flight cancellations have spar...,"{'ongoing flight': '0', 'have sparked': '0', ""..."
1,1,"Xi's statement continued, ""China attaches grea...","{'to maintain': '0', 'great importance': '0', ..."
2,2,"Harris’ critics, of course, don’t hesitate to ...","{'impact': '0', 'deep frustrations': '1', 'to ..."
3,3,"“Too many people denying, downplaying, rationa...","{'It': '0', 'October': '0', 'terrorizing': '1'..."
0,0,“I’m here in New Hampshire to talk about the b...,"{'New Hampshire': '0', 'here ’m': '0', 'to tal..."
...,...,...,...
6,6,"""(Biden) said early on, in his first administr...","{'his first administration': '0', 'He': '0', '..."
7,7,Castillo said it’s been a fight trying to get ...,"{'a fight': '1', 'canceled': '1', 'said': '0',..."
8,8,Trump's orbit seems furious about the fundrais...,"{'he': '0', 'Biden': '0', 'the fundraising': '..."
9,9,The Biden administration has launched an inves...,"{'to struggle': '1', 'the CrowdStrike failure'..."


In [189]:
# 'Unnamed: 0' is visible in the frame displayed above; remove it so only the
# remaining data columns are kept.
dfs = dfs.drop(labels='Unnamed: 0', axis='columns')

In [190]:
# After concatenation the index repeats per source file (0..k, 0..m, ...).
# Replace it with a fresh 0..n-1 index. drop=True discards the old index
# instead of inserting it as an 'index' column that would have to be removed
# again, and the reassignment keeps this cell safe to re-run.
dfs = dfs.reset_index(drop=True)

In [192]:
# The encodings are read from CSV as strings; parse them into dict objects.
# Values that are already dicts are passed through unchanged so the cell can
# be re-run without literal_eval raising on non-string input. Anything else
# is still handed to literal_eval, so malformed cells fail loudly here.
dfs['Sentence Encodings'] = dfs['Sentence Encodings'].apply(
    lambda cell: cell if isinstance(cell, dict) else ast.literal_eval(cell)
)

In [193]:
# Peek at the phrases of the first sentence. Select the cell by row label and
# column name: indexing the row Series with the integer 1 relies on positional
# fallback for a label-indexed Series, which pandas has deprecated.
dfs.loc[0, 'Sentence Encodings'].keys()

dict_keys(['ongoing flight', 'have sparked', "Delta's ongoing flight cancellations", 'flight cancellations', 'a DOT investigation', "Delta's", 'ongoing', 'flight', 'cancellations', 'sparked', 'DOT', 'investigation.'])

In [194]:
def extract_phrases(row):
    """Return the keys of one encoding mapping as a list.

    Args:
        row: A mapping exposing ``.keys()`` (here, one 'Sentence Encodings'
            cell).

    Returns:
        A new list holding the keys in the mapping's iteration order.
    """
    return list(row.keys())

def extract_values(row):
    """Return the labels of one encoding mapping as a list of ints (0 or 1).

    Each value is converted with ``int()``. Only 0 and 1 are accepted as
    labels; anything else (empty string, other numbers, text that is not a
    number, ``None``) is recorded as 0.

    Args:
        row: A mapping exposing ``.values()`` (here, one 'Sentence Encodings'
            cell).

    Returns:
        A list of ints, one per value, in the mapping's iteration order.
    """
    new_values = []
    for value in row.values():
        try:
            label = int(value)
        except (TypeError, ValueError):
            # Empty or non-numeric annotation: treat as "not flagged", the
            # same default already used for out-of-range numbers.
            label = 0
        # Always store the int, never the raw value, so the list does not mix
        # '0'/'1' strings with int fallbacks.
        new_values.append(label if label in (0, 1) else 0)
    return new_values

In [195]:
# Derive two list-valued columns from each parsed encoding dict:
# 'Phrases' holds its keys and 'Values' holds the matching labels.
dfs['Phrases'] = dfs['Sentence Encodings'].apply(extract_phrases)
dfs['Values'] = dfs['Sentence Encodings'].apply(extract_values)

In [196]:
dfs

Unnamed: 0,Sentence,Sentence Encodings,Phrases,Values
0,Delta's ongoing flight cancellations have spar...,"{'ongoing flight': '0', 'have sparked': '0', '...","[ongoing flight, have sparked, Delta's ongoing...","[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]"
1,"Xi's statement continued, ""China attaches grea...","{'to maintain': '0', 'great importance': '0', ...","[to maintain, great importance, it, the develo...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, ..."
2,"Harris’ critics, of course, don’t hesitate to ...","{'impact': '0', 'deep frustrations': '1', 'to ...","[impact, deep frustrations, to point, see, her...","[0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, ..."
3,"“Too many people denying, downplaying, rationa...","{'It': '0', 'October': '0', 'terrorizing': '1'...","[It, October, terrorizing, horrors, denying, m...","[0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, ..."
4,“I’m here in New Hampshire to talk about the b...,"{'New Hampshire': '0', 'here ’m': '0', 'to tal...","[New Hampshire, here ’m, to talk, Biden, would...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
...,...,...,...,...
95,"""(Biden) said early on, in his first administr...","{'his first administration': '0', 'He': '0', '...","[his first administration, He, a new beginning...","[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, ..."
96,Castillo said it’s been a fight trying to get ...,"{'a fight': '1', 'canceled': '1', 'said': '0',...","[a fight, canceled, said, vouchers, dealing, f...","[1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ..."
97,Trump's orbit seems furious about the fundrais...,"{'he': '0', 'Biden': '0', 'the fundraising': '...","[he, Biden, the fundraising, The former presid...","[0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, ..."
98,The Biden administration has launched an inves...,"{'to struggle': '1', 'the CrowdStrike failure'...","[to struggle, the CrowdStrike failure, the com...","[1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, ..."


In [197]:
# Persist the processed frame. The row index is written as the first,
# unnamed column (pandas names it 'Unnamed: 0' when the file is read back),
# and the dict/list cells are stored as their string representations.
dfs.to_csv(path_or_buf='encoding_data.csv', index=True)