In [1]:
import pandas as pd

In [None]:
!pip install ace_tools

In [2]:
import pandas as pd

# Load the datasets
christmas_data = pd.read_csv('./Datasets/cleaned_christmas_data.csv', header=None)
sustainability_data = pd.read_csv('./Datasets/cleaned_sustainability_data.csv', header=None)

In [3]:
def create_combined_headers(data):
    # Assuming the first two rows are headers and the first column is also part of the headers
    header_rows = data.iloc[0:2].fillna('')

    # Combine the first two rows and the first column to create headers
    combined_headers = header_rows.apply(lambda x: ' '.join(x.astype(str)), axis=0)

    # Assign the combined headers to the dataframe
    data.columns = combined_headers

    # Remove the header rows from the data
    data = data.iloc[2:].reset_index(drop=True)

    return data

# Apply to both datasets
christmas_data = create_combined_headers(christmas_data)
sustainability_data = create_combined_headers(sustainability_data)


In [None]:
# Prepare the flattened dataset for ingestion
def flatten_dataset(data, id_column='Demographics'):
    id_vars = [col for col in data.columns if id_column in col]
    value_vars = [col for col in data.columns if col not in id_vars]
    flattened_data = data.melt(id_vars=id_vars, value_vars=value_vars, var_name='Question', value_name='Response')
    return flattened_data

# Flatten both datasets
christmas_flattened = flatten_dataset(christmas_data, 'Demographics')
sustainability_flattened = flatten_dataset(sustainability_data, 'Demographics')

In [None]:
christmas_flattened.head(50)

In [None]:
print(christmas_data.columns)

In [4]:
christmas_data.head()

Unnamed: 0,Demographics,Total Sample,Gender Male,Female,Unnamed: 5,Irish Region Dublin,Leinster,Munster,Connacht,Ulster (ROI Only),...,Twitter,Facebook,Instagram,General Online advertising,Email,Other,Unnamed: 18,Females,Females Dublin,No Segments
0,Gender,,,,,,,,,,...,,,,,,,,,,
1,Total,1002.0,490.0,512.0,,281.0,276.0,279.0,113.0,53.0,...,106.0,308.0,358.0,317.0,186.0,10.0,,184.0,144.0,726.0
2,Male,490.0,490.0,,,137.0,131.0,136.0,58.0,28.0,...,77.0,138.0,143.0,148.0,80.0,7.0,,0.0,0.0,490.0
3,Female,512.0,,512.0,,144.0,145.0,143.0,55.0,25.0,...,29.0,170.0,215.0,169.0,106.0,3.0,,184.0,144.0,236.0
4,Irish Region,,,,,,,,,,...,,,,,,,,,,


In [5]:
def remove_redundant_demographics(data):
    # Check if the 'Demographics' column exists
    if 'Demographics' in data.columns:
        # Drop any row where Demographics is "Demographics"
        data = data[data['Demographics'] != "Demographics"]
    else:
        print("'Demographics' column not found. Please check the dataset structure.")
    
    # Reset the index to avoid indexing issues
    data = data.reset_index(drop=True)
    return data

christmas_data = remove_redundant_demographics(christmas_data)
sustainability_data = remove_redundant_demographics(sustainability_data)


'Demographics' column not found. Please check the dataset structure.
'Demographics' column not found. Please check the dataset structure.


In [8]:
# Search for a similar column name if 'Demographics' does not exist
def find_demographics_column(data):
    for col in data.columns:
        if "demographics" in col.lower():
            return col
    return None

# Replace 'Demographics' with the actual column name found
demographics_col = find_demographics_column(christmas_data)
if demographics_col:
    christmas_data = christmas_data[christmas_data[demographics_col] != "Demographics"]
    christmas_data = christmas_data.reset_index(drop=True)
else:
    print("Demographics column not found in Christmas data.")

In [9]:
christmas_data.head()

Unnamed: 0,Demographics,Total Sample,Gender Male,Female,Unnamed: 5,Irish Region Dublin,Leinster,Munster,Connacht,Ulster (ROI Only),...,Twitter,Facebook,Instagram,General Online advertising,Email,Other,Unnamed: 18,Females,Females Dublin,No Segments
0,Gender,,,,,,,,,,...,,,,,,,,,,
1,Total,1002.0,490.0,512.0,,281.0,276.0,279.0,113.0,53.0,...,106.0,308.0,358.0,317.0,186.0,10.0,,184.0,144.0,726.0
2,Male,490.0,490.0,,,137.0,131.0,136.0,58.0,28.0,...,77.0,138.0,143.0,148.0,80.0,7.0,,0.0,0.0,490.0
3,Female,512.0,,512.0,,144.0,145.0,143.0,55.0,25.0,...,29.0,170.0,215.0,169.0,106.0,3.0,,184.0,144.0,236.0
4,Irish Region,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Remove extra spaces from column names
christmas_data.columns = christmas_data.columns.str.strip()
sustainability_data.columns = sustainability_data.columns.str.strip()

In [14]:
christmas_data.head()

Unnamed: 0,Demographics,Total Sample,Gender Male,Female,Unnamed: 5,Irish Region Dublin,Leinster,Munster,Connacht,Ulster (ROI Only),...,Twitter,Facebook,Instagram,General Online advertising,Email,Other,Unnamed: 18,Females,Females Dublin,No Segments
0,Gender,,,,,,,,,,...,,,,,,,,,,
1,Total,1002.0,490.0,512.0,,281.0,276.0,279.0,113.0,53.0,...,106.0,308.0,358.0,317.0,186.0,10.0,,184.0,144.0,726.0
2,Male,490.0,490.0,,,137.0,131.0,136.0,58.0,28.0,...,77.0,138.0,143.0,148.0,80.0,7.0,,0.0,0.0,490.0
3,Female,512.0,,512.0,,144.0,145.0,143.0,55.0,25.0,...,29.0,170.0,215.0,169.0,106.0,3.0,,184.0,144.0,236.0
4,Irish Region,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Prepare data for Azure Cognitive Search ingestion
def prepare_documents_structured(data):
    documents = []
    for i, row in data.iterrows():
        demographics = row['Demographics']
        questions_and_responses = {}

        for col in data.columns:
            if col != 'Demographics':  # Skip the demographic identifier itself
                question = col
                response = row[col]
                questions_and_responses[question] = response

        document = {
            "id": str(i),
            "Demographics": demographics,
            "QuestionsAndResponses": questions_and_responses
        }
        documents.append(document)
    return documents

# Prepare documents for both datasets
christmas_documents = prepare_documents_structured(christmas_data)
sustainability_documents = prepare_documents_structured(sustainability_data)

In [13]:
christmas_documents.head()

AttributeError: 'list' object has no attribute 'head'