In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib


In [None]:
# Read the five raw source tables; each one is kept in its own DataFrame.
activity = pd.read_csv(r'C:\Users\LENOVO\Desktop\Dataset\activity.csv')
labels = pd.read_csv(r'C:\Users\LENOVO\Desktop\Dataset\labels.csv')
physiology = pd.read_csv(r'C:\Users\LENOVO\Desktop\Dataset\physiology.csv')
sleep = pd.read_csv(r'C:\Users\LENOVO\Desktop\Dataset\sleep.csv')
demographics = pd.read_csv(r'C:\Users\LENOVO\Desktop\Dataset\demographics.csv')

# Preview the first rows of every table, in the same order they were loaded.
previews = (
    ("Activity Data:\n", activity),
    ("Labels Data:\n", labels),
    ("Physiology Data:\n", physiology),
    ("Sleep Data:\n", sleep),
    ("Demographics Data:\n", demographics),
)
for heading, frame in previews:
    print(heading, frame.head(), "\n")


In [None]:
import os
import pandas as pd

# Locations of the cleaned per-table CSV exports (update with actual paths).
csv_files = {
    'sleep': r'C:\Users\LENOVO\Desktop\cleaned\cleaned_sleep.csv',
    'activity': r'C:\Users\LENOVO\Desktop\cleaned\cleaned_activity.csv',
    'physiology': r'C:\Users\LENOVO\Desktop\cleaned\cleaned_physiology.csv',
    'labels': r'C:\Users\LENOVO\Desktop\cleaned\cleaned_labels.csv',
    'demographics': r'C:\Users\LENOVO\Desktop\cleaned\cleaned_demographics.csv'
}

# First pass: report every configured path that is not an existing file.
missing_paths = [path for path in csv_files.values() if not os.path.isfile(path)]
for path in missing_paths:
    print(f"File not found: {path}")

# Second pass: read each existing file into `dfs`, keyed by its short table
# name. Paths that do not exist are skipped, so `dfs` may hold fewer entries
# than `csv_files`.
dfs = {}
for key, file_path in csv_files.items():
    if not os.path.isfile(file_path):
        continue
    print(f"Loading {file_path}...")
    loaded = pd.read_csv(file_path)
    dfs[key] = loaded
    print(f"Loaded {key} with shape: {loaded.shape}")

# Check column names
print("\nColumn names in each DataFrame:")
for key, df in dfs.items():
    print(f"{key}: {df.columns}")

# Convert 'date' columns to datetime.
# errors='coerce' turns unparseable values into NaT instead of raising, so the
# number of values lost that way is reported rather than dropped silently.
for key, df in dfs.items():
    if 'date' in df.columns:
        parsed_dates = pd.to_datetime(df['date'], errors='coerce')
        n_unparsed = int(parsed_dates.isna().sum() - df['date'].isna().sum())
        if n_unparsed:
            print(f"Warning: {n_unparsed} unparseable 'date' value(s) in {key} were set to NaT")
        df['date'] = parsed_dates

# Print sample data.
# Only the identifier columns a table actually has are selected: indexing
# with a fixed ['patient_id', 'date'] list raises KeyError for a table without
# a 'date' column, a case the checks further down already allow for.
id_columns = ['patient_id', 'date']
for key, df in dfs.items():
    present_columns = [col for col in id_columns if col in df.columns]
    print(f"\nSample data from {key}:")
    print(df[present_columns].head())

# Check unique (patient_id, date) pairs
for key, df in dfs.items():
    if 'date' in df.columns:
        unique_pairs = df[['patient_id', 'date']].drop_duplicates().shape[0]
        print(f"Unique (patient_id, date) pairs in {key}: {unique_pairs}")
    else:
        print(f"No 'date' column in {key}")

# Print unique patient_ids and dates
for key, df in dfs.items():
    print(f"\nUnique patient_ids in {key}: {df['patient_id'].nunique()}")
    if 'date' in df.columns:
        print(f"Unique dates in {key}: {df['date'].nunique()}")

# Combine the per-day tables on (patient_id, date) with inner joins, then
# attach demographics with a left join on patient_id only.
join_keys = ['patient_id', 'date']
try:
    merged_df = pd.merge(dfs['sleep'], dfs['activity'], on=join_keys, how='inner')
    print("\nMerged sleep and activity data with shape:", merged_df.shape)

    # The remaining per-day tables are joined the same way, one after another.
    daily_steps = (
        ('physiology', "Merged with physiology data, new shape:"),
        ('labels', "Merged with labels data, new shape:"),
    )
    for table_name, progress_message in daily_steps:
        merged_df = pd.merge(merged_df, dfs[table_name], on=join_keys, how='inner')
        print(progress_message, merged_df.shape)

    # Assuming demographics is a single record per patient
    merged_df = pd.merge(merged_df, dfs['demographics'], on='patient_id', how='left')
    print("Merged with demographics data, new shape:", merged_df.shape)
except KeyError as e:
    # Report the missing key, then re-raise so the cell still fails.
    print(f"Merge failed due to missing key: {e}")
    raise

# Display merged DataFrame information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nMerged DataFrame Info:")
merged_df.info()

# Save merged DataFrame to a new CSV file (row index is not written).
merged_file_path = r'C:\Users\LENOVO\Desktop\Dataset\merged_data.csv'
merged_df.to_csv(merged_file_path, index=False)
print(f"Merged data saved to {merged_file_path}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the merged data.
# A CSV file does not store dtypes, so 'date' would come back as plain strings;
# parse_dates restores it to datetime so date-based plots get a real time axis.
merged_file_path = r'C:\Users\LENOVO\Desktop\Dataset\merged_data.csv'
merged_df = pd.read_csv(merged_file_path, parse_dates=['date'])

# Display basic information and statistics about the merged DataFrame.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
print("Basic Information about the Merged DataFrame:")
merged_df.info()
print("\nDescriptive Statistics:")
print(merged_df.describe(include='all'))

# Distribution of patient age: 20-bin histogram with a KDE overlay.
age_fig, age_ax = plt.subplots(figsize=(12, 8))
sns.histplot(data=merged_df, x='age', bins=20, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution')
plt.show()

# Number of rows per value of the 'sex' column.
sex_fig, sex_ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=merged_df, x='sex', ax=sex_ax)
sex_ax.set_title('Gender Distribution')
plt.show()

# Time series of physiological signals: one figure per signal, one line per
# patient (hue='patient_id'), with the legend suppressed.
signal_plots = (
    ('heart_rate', 'Heart Rate Over Time', 'Heart Rate'),
    ('respiratory_rate', 'Respiratory Rate Over Time', 'Respiratory Rate'),
)
for signal_column, plot_title, y_label in signal_plots:
    ts_fig, ts_ax = plt.subplots(figsize=(14, 7))
    sns.lineplot(data=merged_df, x='date', y=signal_column, hue='patient_id', legend=None, ax=ts_ax)
    ts_ax.set_title(plot_title)
    ts_ax.set_xlabel('Date')
    ts_ax.set_ylabel(y_label)
    plt.show()

# Correlation heatmap.
# Correlation is computed on the numeric columns only: calling corr() on a
# frame that also holds text columns raises on pandas >= 2.0, where
# non-numeric columns are no longer dropped automatically.
plt.figure(figsize=(12, 8))
numeric_columns_df = merged_df.select_dtypes(include='number')
correlation_matrix = numeric_columns_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Missing-value map: heatmap of the boolean isnull() mask of merged_df
# (rows on the y-axis, columns on the x-axis), drawn without a colour bar.
missing_mask = merged_df.isnull()
missing_fig, missing_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=missing_ax)
missing_ax.set_title('Missing Values Heatmap')
plt.show()
