In [4]:
import pandas as pd

# Read the datasets
tech_data = pd.read_csv("tech.csv")
stress_lysis_data = pd.read_csv("Stress_lysis.csv")

# Read `sleep.csv` with error handling for inconsistent rows
try:
    sleep_data = pd.read_csv("sleep.csv", on_bad_lines="skip")  # Skips problematic rows
except pd.errors.ParserError as e:
    print(f"Error reading sleep.csv: {e}")

# Add dataset identifiers
tech_data["Source"] = "tech.csv"
stress_lysis_data["Source"] = "Stress_lysis.csv"
sleep_data["Source"] = "sleep.csv"

# Combine the datasets
all_columns = list(set(tech_data.columns) | set(stress_lysis_data.columns) | set(sleep_data.columns))
tech_data = tech_data.reindex(columns=all_columns)
stress_lysis_data = stress_lysis_data.reindex(columns=all_columns)
sleep_data = sleep_data.reindex(columns=all_columns)

combined_data = pd.concat([tech_data, stress_lysis_data, sleep_data], ignore_index=True)

# Save to a new CSV
combined_data.to_csv("combined_dataset.csv", index=False)

print("Datasets combined successfully into 'combined_dataset.csv'")


Datasets combined successfully into 'combined_dataset.csv'


In [5]:
import pandas as pd

# Load the combined dataset
combined_data = pd.read_csv("combined_dataset.csv")

# Drop unnecessary columns (e.g., unnamed ones)
combined_data = combined_data.loc[:, ~combined_data.columns.str.contains('^Unnamed')]

# Handle redundant columns by renaming or merging
if "Stress Level" in combined_data.columns:
    combined_data["Stress_Level"] = combined_data["Stress_Level"].combine_first(combined_data["Stress Level"])
    combined_data.drop(columns=["Stress Level"], inplace=True)

# Standardize column names (remove spaces, capitalize properly)
combined_data.columns = [col.strip().replace(" ", "_").title() for col in combined_data.columns]

# Reorganize columns (optional, for readability)
main_columns = ["User_Id", "Person_Id", "Gender", "Age", "Occupation", "Stress_Level"]
health_columns = ["Mental_Health_Status", "Quality_Of_Sleep", "Sleep_Hours", "Physical_Activity_Level"]
activity_columns = ["Technology_Usage_Hours", "Social_Media_Usage_Hours", "Gaming_Hours", "Screen_Time_Hours"]
environment_columns = ["Humidity", "Temperature", "Step_Count", "Work_Environment_Impact", "Support_Systems_Access", "Online_Support_Usage"]

ordered_columns = main_columns + health_columns + activity_columns + environment_columns
combined_data = combined_data.reindex(columns=ordered_columns)

# Fill missing values (optional)
combined_data.fillna("N/A", inplace=True)  # Replace NaN with "N/A" for visibility.

# Save the cleaned dataset
combined_data.to_csv("cleaned_combined_dataset.csv", index=False)

print("Dataset cleaned and saved as 'cleaned_combined_dataset.csv'")


Dataset cleaned and saved as 'cleaned_combined_dataset.csv'


  combined_data.fillna("N/A", inplace=True)  # Replace NaN with "N/A" for visibility.
