In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Load dataset (change path if needed)
df = pd.read_csv("/kaggle/input/student-sleep-patterns/student_sleep_patterns.csv")

# Display basic info, summary statistics and missing-value counts.
# A notebook cell only renders the value of its *last* expression, so the
# summary statistics are printed explicitly; as a bare mid-cell expression
# their result would be computed and silently discarded.
# `df.info()` writes its report to stdout by itself.
df.info()
print(df.describe())
df.isnull().sum()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram grid of the columns in `df`, with a shared figure-level title.
hist_options = {"figsize": (12, 10), "bins": 20, "edgecolor": "black"}
df.hist(**hist_options)
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()


In [None]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/input/student-sleep-patterns/student_sleep_patterns.csv'
data = pd.read_csv(file_path)

# Display the first few rows and basic information.
print(data.head())
# `DataFrame.info()` prints its report itself and returns None; wrapping it
# in print() would emit a stray "None" line after the report.
data.info()
print(data.describe())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of sleep duration: histogram with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=data, x='Sleep_Duration', kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Sleep Duration')
ax.set_xlabel('Sleep Duration (hours)')
ax.set_ylabel('Frequency')
plt.show()

# Relationship between study hours and sleep duration.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=data, x='Study_Hours', y='Sleep_Duration', ax=ax)
ax.set_title('Study Hours vs Sleep Duration')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Sleep Duration')
plt.show()

# Spread of sleep quality within each gender category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='Gender', y='Sleep_Quality', ax=ax)
ax.set_title('Sleep Quality by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Sleep Quality')
plt.show()


In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an
# annotated heatmap (two decimals per cell).
numeric_data = data.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:

# Mean sleep quality per gender, printed and drawn as a bar chart.
# The aggregation is skipped unless both columns are present.
has_quality = 'Sleep_Quality' in data.columns
if has_quality and 'Gender' in data.columns:
    grouped = data.groupby('Gender')['Sleep_Quality'].mean()
    print("\nAverage Sleep Quality by Gender:\n")
    print(grouped)

    # Bar chart of the per-gender means computed above.
    grouped.plot.bar(
        title='Average Sleep Quality by Gender',
        xlabel='Gender',
        ylabel='Avg Sleep Quality',
    )
    plt.show()

# Histogram (with KDE) of the sleep-quality scores, when the column exists.
if has_quality:
    sns.histplot(data=data, x='Sleep_Quality', kde=True, bins=20)
    plt.title('Sleep Quality Distribution')
    plt.xlabel('Sleep Quality')
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Number of missing entries in each column.
missing_counts = data.isna().sum()
print(missing_counts)

In [None]:
# Boxplot for Sleep Duration to detect outliers.
# The series is bound to `x` explicitly. Passed positionally, its meaning
# depends on the installed seaborn release (the first positional parameter is
# `data` in current versions), which can yield a vertical box with the values
# on the y-axis and leave the x-label below attached to the wrong axis.
plt.figure(figsize=(8, 6))
sns.boxplot(x=data['Sleep_Duration'], color='red')
plt.title('Outlier Detection: Sleep Duration')
plt.xlabel('Sleep Duration')
plt.show()


In [None]:
# Row count per Gender category.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Gender', palette='viridis', ax=ax)
ax.set(title='Gender Distribution', xlabel='Gender', ylabel='Count')
plt.show()


In [None]:
# Check for missing values
print(data.isnull().sum())

# Fill missing numerical values with the column mean.
# The result is assigned back instead of using inplace=True so the step
# behaves the same with and without pandas Copy-on-Write.
data = data.fillna(data.mean(numeric_only=True))

# For categorical (object) columns, fill with the most frequent value.
# `data[column].fillna(..., inplace=True)` acts on a temporary Series: pandas
# 2.x emits a FutureWarning for it and under Copy-on-Write the parent frame is
# left unchanged, so the filled column is assigned back explicitly.
for column in data.select_dtypes(include=['object']).columns:
    modes = data[column].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with.
        continue
    data[column] = data[column].fillna(modes.iloc[0])

# Verify missing values are handled
print(data.isnull().sum())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical (object) columns as integer codes.
# A separate LabelEncoder is fitted for each column and kept in
# `label_encoders`, so codes can be mapped back later with
# `label_encoders[column].inverse_transform(...)`. Re-fitting one shared
# encoder would retain only the classes of the last column processed.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Verify encoding
print(data.head())

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale numerical feature columns with StandardScaler.
# The prediction target and the row identifier are deliberately left alone:
# standardising 'Sleep_Quality' would put the target (and the error metrics
# computed from it) on an artificial scale, and 'Student_ID' is a label, not
# a magnitude. Either name is simply skipped if the column is absent.
# NOTE: the scaler is fitted on the full dataset before the train/test split
# performed later in the notebook, so test rows influence the scaling
# statistics; fit on the training rows only if a leakage-free estimate matters.
scaler = StandardScaler()
unscaled_columns = ['Sleep_Quality', 'Student_ID']
numeric_columns = [
    column
    for column in data.select_dtypes(include=['float64', 'int64']).columns
    if column not in unscaled_columns
]
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Verify scaling
print(data.describe())


In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y).
# 'Student_ID' is a row identifier rather than a predictor, so it is dropped
# together with the target, matching the Streamlit model-building code later
# in this notebook. errors='ignore' keeps this working when the dataset has
# no ID column; a missing target still fails loudly on the `y = ...` line.
target_column = 'Sleep_Quality'
X = data.drop(columns=['Student_ID', target_column], errors='ignore')
y = data[target_column]

# Split into training and testing sets (80/20, seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fit a seeded random forest regressor on the training split.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out split and score it.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics.
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


In [None]:
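# STREAMLIT APP: standalone script. Save it to its own .py file and launch it with `streamlit run <file>.py` rather than executing it inside this notebook.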
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Browser-tab title and full-width layout for the app.
st.set_page_config(page_title="Student Sleep Patterns Analysis", layout="wide")

# Sidebar: page selector.
sidebar = st.sidebar
sidebar.title("Navigation")
page = sidebar.radio(
    "Go to",
    ["Introduction", "EDA", "Model Building", "Prediction", "Conclusion"],
)

# Sidebar: CSV uploader, rendered only while no dataset is stored in the
# session; the parsed frame is kept in session state across reruns.
if "data" not in st.session_state:
    uploaded_file = sidebar.file_uploader("Upload your CSV file", type="csv")
    if uploaded_file is not None:
        st.session_state["data"] = pd.read_csv(uploaded_file)

# The stored frame, or None when nothing has been uploaded yet.
data = st.session_state.get("data")

# Page dispatch: exactly one branch runs per script rerun, chosen by the
# sidebar radio value in `page`. `data` is the uploaded DataFrame or None.

# Page 1: Introduction
if page == "Introduction":
    st.title("Student Sleep Patterns Analysis and Prediction App")
    st.write("""
    Welcome to the **Student Sleep Patterns Analysis App**!

    This application helps analyze sleep patterns among students and predict sleep quality using machine learning.

    ### Use Cases:
    - **Explore Sleep Trends**: Gain insights into lifestyle habits affecting sleep quality.
    - **Predict Sleep Quality**: Use a trained ML model to predict sleep quality.
    - **Improve Sleep Habits**: Identify areas for improvement in sleep patterns.

    ### Features:
    1. Upload and explore datasets.
    2. Visualize trends with interactive EDA tools.
    3. Build and evaluate ML models.
    4. Predict sleep quality using personalized inputs.
    """)

# Page 2: EDA
elif page == "EDA":
    st.title("Exploratory Data Analysis (EDA)")
    if data is not None:
        st.subheader("Dataset Overview")
        st.write(data.head())

        st.subheader("Summary Statistics")
        st.write(data.describe())

        st.subheader("Missing Values")
        st.write(data.isnull().sum())

        st.subheader("Data Types and Unique Values")
        st.write(data.dtypes)
        st.write(data.nunique())

        st.subheader("Feature Distributions")
        # Let pandas build the subplot grid itself. Handing it one pre-made
        # Axes for several columns makes pandas clear that figure and emit a
        # UserWarning before drawing.
        axes = data.hist(bins=20, edgecolor="black", figsize=(12, 10))
        fig = axes.flat[0].get_figure()
        fig.suptitle("Feature Distributions", fontsize=16)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; the script is
        # re-executed on each interaction, so close figures once rendered.
        plt.close(fig)

        st.subheader("Correlation Heatmap")
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="coolwarm", ax=ax)
        st.pyplot(fig)
        plt.close(fig)

        st.subheader("Grouped Aggregations")
        if "Gender" in data.columns and "Sleep_Quality" in data.columns:
            grouped = data.groupby("Gender")["Sleep_Quality"].mean()
            st.bar_chart(grouped)

        st.subheader("Sleep Quality Distribution")
        # Guarded like the aggregation above: an uploaded CSV without the
        # column should not crash the page with a KeyError.
        if "Sleep_Quality" in data.columns:
            fig, ax = plt.subplots()
            sns.histplot(data["Sleep_Quality"], kde=True, bins=10, ax=ax)
            ax.set_title("Sleep Quality Distribution")
            st.pyplot(fig)
            plt.close(fig)
        else:
            st.info("Column 'Sleep_Quality' was not found in the uploaded dataset.")
    else:
        st.warning("Please upload a dataset to perform EDA.")

# Page 3: Model Building
elif page == "Model Building":
    st.title("Model Building")
    if data is None:
        st.warning("Please upload a dataset to build the model.")
    elif "Sleep_Quality" not in data.columns:
        # Without the target column there is nothing to train on; report it
        # instead of failing with a KeyError further down.
        st.error("The uploaded dataset has no 'Sleep_Quality' column to predict.")
    else:
        st.subheader("Train a Model")

        # Preprocess a copy so the frame stored in the session stays raw.
        data_encoded = data.copy()

        # Encode categorical features with fixed integer mappings (the same
        # mappings are used on the Prediction page).
        if "Gender" in data.columns:
            data_encoded["Gender"] = data_encoded["Gender"].map({"Male": 0, "Female": 1, "Other": 2})
        if "University_Year" in data.columns:
            year_mapping = {"1st Year": 1, "2nd Year": 2, "3rd Year": 3, "4th Year": 4}
            data_encoded["University_Year"] = data_encoded["University_Year"].map(year_mapping)

        # Define features (X) and target (y); the ID column is dropped when present.
        X = data_encoded.drop(["Student_ID", "Sleep_Quality"], axis=1, errors="ignore")
        y = data_encoded["Sleep_Quality"]

        # Train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train a Random Forest model
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)

        # Evaluate the model on the held-out split
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Keep the fitted model for the Prediction page.
        st.session_state.model = model

        st.write("### Model Performance")
        st.write(f"**Accuracy:** {accuracy:.4f}")
        st.write(f"**Mean Squared Error:** {mse:.4f}")
        st.write(f"**R-squared Score:** {r2:.4f}")

        st.success("Model trained successfully!")

# Page 4: Prediction
elif page == "Prediction":
    st.title("Prediction")
    if data is not None and "model" in st.session_state:
        model = st.session_state.model
        st.write("Enter the following details to predict sleep quality:")

        # Inputs
        age = st.number_input("Age", min_value=18, max_value=25, value=21)
        gender = st.selectbox("Gender", ['Male', 'Female', 'Other'])
        university_year = st.selectbox("University Year", ['1st Year', '2nd Year', '3rd Year', '4th Year'])
        sleep_duration = st.number_input("Sleep Duration (hours)", min_value=4.0, max_value=9.0, value=6.5)
        study_hours = st.number_input("Study Hours (per day)", min_value=0.0, max_value=12.0, value=5.0)
        screen_time = st.number_input("Screen Time (hours)", min_value=1.0, max_value=5.0, value=2.5)
        caffeine_intake = st.number_input("Caffeine Intake (cups)", min_value=0, max_value=5, value=2)
        physical_activity = st.number_input("Physical Activity (minutes)", min_value=0, max_value=120, value=60)
        weekday_sleep_start = st.number_input("Weekday Sleep Start (24-hour format)", min_value=0.0, max_value=24.0, value=22.0)
        weekend_sleep_start = st.number_input("Weekend Sleep Start (24-hour format)", min_value=0.0, max_value=24.0, value=23.0)
        weekday_sleep_end = st.number_input("Weekday Sleep End (24-hour format)", min_value=0.0, max_value=24.0, value=6.0)
        weekend_sleep_end = st.number_input("Weekend Sleep End (24-hour format)", min_value=0.0, max_value=24.0, value=8.0)

        # Prepare input for prediction (same integer codes as in training)
        gender_mapping = {'Male': 0, 'Female': 1, 'Other': 2}
        year_mapping = {'1st Year': 1, '2nd Year': 2, '3rd Year': 3, '4th Year': 4}

        input_data = pd.DataFrame({
            'Age': [age],
            'Gender': [gender_mapping[gender]],
            'University_Year': [year_mapping[university_year]],
            'Sleep_Duration': [sleep_duration],
            'Study_Hours': [study_hours],
            'Screen_Time': [screen_time],
            'Caffeine_Intake': [caffeine_intake],
            'Physical_Activity': [physical_activity],
            'Weekday_Sleep_Start': [weekday_sleep_start],
            'Weekend_Sleep_Start': [weekend_sleep_start],
            'Weekday_Sleep_End': [weekday_sleep_end],
            'Weekend_Sleep_End': [weekend_sleep_end]
        })

        # Debugging: Display input data for verification
        st.subheader("Input Data for Prediction")
        st.write(input_data)

        # Prediction
        if st.button("Predict"):
            try:
                # Present the columns exactly as the model saw them during
                # fit: scikit-learn rejects frames whose feature names or
                # order differ from training. `feature_names_in_` is only
                # set when the model was fitted on named columns.
                model_input = input_data
                expected = getattr(model, "feature_names_in_", None)
                if expected is not None:
                    missing = [name for name in expected if name not in input_data.columns]
                    if missing:
                        raise ValueError(
                            f"the trained model expects features this form does not collect: {missing}"
                        )
                    model_input = input_data[list(expected)]
                prediction = model.predict(model_input)
                st.success(f"Predicted Sleep Quality: {prediction[0]}")
            except Exception as e:
                # Surface any failure in the UI instead of crashing the page.
                st.error(f"Prediction failed: {e}")
    else:
        st.warning("Please upload a dataset and build the model first.")

# Page 5: Conclusion
elif page == "Conclusion":
    st.title("Conclusion")
    st.markdown("""
    ### Key Takeaways:
    - Sleep quality is influenced by factors such as sleep duration, study hours, and screen time.
    - The Random Forest model provided reasonable predictions for sleep quality with a good balance of accuracy, MSE, and R-squared score.
    - This project demonstrates the potential of machine learning in analyzing and predicting human behavior patterns.

    ### Future Directions:
    - Enhance the model with more features and data.
    - Incorporate advanced algorithms and hyperparameter tuning.
    - Extend the app to suggest improvements for better sleep quality.
    """)
