# Day 3: Load, Clean, and Explore Your Data

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the raw survey CSV into a DataFrame, then preview its first rows
raw_survey_csv = '../data/raw/mock_survey_data.csv'
df = pd.read_csv(raw_survey_csv)
df.head(5)


In [None]:

# Print the column dtypes / non-null counts, then show how many
# missing values each column contains
df.info()
df.isna().sum()


In [None]:

# Clean up column types
# Encode the Yes/No answers as 1/0. Series.map turns every value that is not
# a key of the mapping into NaN, so re-running this cell on the already
# encoded column would blank it out and the dropna() below would then delete
# every row. Only encode while the column still holds the raw labels.
if not pd.api.types.is_numeric_dtype(df['mental_health_history']):
    df['mental_health_history'] = df['mental_health_history'].map({'Yes': 1, 'No': 0})
df['gender'] = df['gender'].astype('category')
df['occupation'] = df['occupation'].astype('category')

# Drop rows with a missing value in any column. This also removes rows whose
# mental_health_history was neither 'Yes' nor 'No', because the mapping above
# turned those into NaN.
df.dropna(inplace=True)


In [None]:

# Class balance: one bar per distinct value of the 'risk' column
sns.countplot(data=df, x='risk')
plt.title(label='Risk Distribution')
plt.show()


In [None]:

# Histograms (15 bins each) of the numerical features
numeric_feature_columns = [
    'sleep_hours',
    'social_media_minutes',
    'activity_level',
    'emotional_score',
]
df[numeric_feature_columns].hist(figsize=(12, 8), bins=15)
plt.tight_layout()
plt.show()


In [None]:

# Annotated heatmap of the pairwise correlations between numeric columns
feature_correlations = df.corr(numeric_only=True)
plt.figure(figsize=(10, 6))
sns.heatmap(feature_correlations, cmap='coolwarm', annot=True)
plt.title(label='Feature Correlation Matrix')
plt.show()


In [None]:

# OPTIONAL: Add sentiment score from post text
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The argument is converted with ``str()`` before it is analysed, so
    non-string values are scored on their string representation.
    """
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    return polarity

# Score every post, store the result as a new column, and preview the
# text next to its score
post_sentiments = df['post_text'].apply(get_sentiment)
df['sentiment_score'] = post_sentiments
df[['post_text', 'sentiment_score']].head(5)


In [None]:

# Write the cleaned DataFrame to the processed-data folder, leaving the
# row index out of the file
df.to_csv(path_or_buf='../data/processed/cleaned_survey_data.csv', index=False)
