In [1]:
import pandas as pd
import numpy as np

# Loading Dataset
To load the dataset, we can use the `read_csv()` function from the pandas library:

```python
df_part1 = pd.read_csv('./Sleep_health_and_lifestyle_dataset.csv')
df_part2 = pd.read_csv('./Sleep_health_and_lifestyle_dataset_part_2.csv')
```
These two lines of code will load the dataset from the CSV files `Sleep_health_and_lifestyle_dataset.csv` and `Sleep_health_and_lifestyle_dataset_part_2.csv` respectively and store them in the variables `df_part1` and `df_part2`.


In [2]:
# Read both halves of the sleep-health dataset from the working directory.
df_part1, df_part2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Sleep_health_and_lifestyle_dataset.csv',
        './Sleep_health_and_lifestyle_dataset_part_2.csv',
    )
)

# Merging two parts of the dataset to create a single one

In [None]:
# Stack the two parts row-wise. ignore_index=True rebuilds a unique 0..n-1
# index; without it each part keeps the labels it was read with, so the merged
# frame would carry duplicate index values and label-based lookups (.loc)
# would become ambiguous.
df = pd.concat((df_part1, df_part2), ignore_index=True)
print(df.shape)
df.head()

## Exploring the data set

In [None]:
# Summary of the merged frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

# Dropping Irrelevant Feature Person ID

In [6]:
# 'Person ID' is treated as irrelevant for the analysis and removed.
# Rebinding `df` with errors='ignore' (instead of an inplace drop) keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns='Person ID', errors='ignore')

# Handling NA values  

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### NA values in 'Sleep Disorder' indicate that the person does not have a sleep disorder, so instead of treating them as missing we assign the label 'None' to them

In [8]:
# Rows with no recorded sleep disorder get the explicit label 'None'.
df.loc[df['Sleep Disorder'].isna(), 'Sleep Disorder'] = 'None'

In [None]:
# Re-count missing values per column to confirm the 'Sleep Disorder' fill took effect.
df.isnull().sum()

In [None]:
# Preview the first rows only, consistent with the earlier df.head() preview,
# instead of rendering the whole frame.
df.head()

## Handling the 'Blood Pressure' column by splitting it into Low BP and High BP

In [11]:
# Each 'Blood Pressure' entry is a '<high>/<low>' string. Split the whole
# column in one vectorised call instead of looping over the rows in Python.
bp_parts = df['Blood Pressure'].str.split('/', expand=True)

# .to_numpy() makes the assignment positional, exactly like the np.array(...)
# the loop version produced, so it does not depend on the frame's index labels.
df['LowBP'] = bp_parts[1].astype(int).to_numpy()
df['HighBP'] = bp_parts[0].astype(int).to_numpy()


In [12]:
# The raw string column is no longer needed once LowBP / HighBP exist.
df = df.drop(columns=['Blood Pressure'])

In [None]:
# Preview the first rows only, consistent with the earlier df.head() preview,
# instead of rendering the whole frame.
df.head()

## Using LabelEncoders to Encode the Categorical Data
#### We also store a dictionary of encoders so that unseen data can later be transformed into the same format

In [14]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns stored with object dtype are the categorical features; every other
# column except the 'Stress Level' target is a numeric feature.
is_object = df.dtypes == object
catData = df.columns[is_object].tolist()
numData = [name for name in df.columns[~is_object] if name != 'Stress Level']
print('categorical features are :',catData)

In [None]:
# One LabelEncoder per categorical column, kept by column name so the same
# mapping can be reused (or inverted) later.
label_encoding_dict = {name: LabelEncoder() for name in catData}

# Fit each encoder on its column and overwrite the column with the integer codes.
for name, encoder in label_encoding_dict.items():
    df[name] = encoder.fit_transform(df[name])
df

# Distribution of target features

In [None]:
import matplotlib.pyplot as plt

# value_counts() is indexed by the encoded class ids; decode them back to the
# original labels so the x axis is readable.
count = df['Sleep Disorder'].value_counts()
class_names = label_encoding_dict['Sleep Disorder'].inverse_transform(count.index)

fig, ax = plt.subplots()
ax.bar(class_names, count)
ax.set_xlabel('Sleep Disorder')
ax.set_ylabel('Number of people')
plt.show()

# Boxplot of Features

In [None]:
import matplotlib.pyplot as plt

columns = ['Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'BMI Category', 'Heart Rate', 'Daily Steps', 'LowBP', 'HighBP']

# Size the grid from the number of features (ceil division) so that every
# feature gets a panel even if the list above changes.
n_cols = 4
n_rows = -(-len(columns) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 10))
panels = axes.flatten()

# One boxplot per feature, filling the grid row by row.
for ax, column in zip(panels, columns):
    ax.boxplot(df[column])
    ax.set_title(column)

# The grid can have more cells than features (11 features in 12 cells here);
# hide the leftovers so no empty frame with ticks is drawn.
for unused_ax in panels[len(columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Correlation analysis

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the frame.
corr_matrix = df.corr()

# Annotated heatmap drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()

The correlation heatmap provides insight into the pairwise relationships between the features in the dataset.

# Standardisation of Data

In [None]:
# Standardize the data
from sklearn.preprocessing import StandardScaler
for f in numData:
    df[f] = StandardScaler().fit_transform(df[[f]])
df

In [None]:
# Decode the encoded class id 0 back to its original 'Sleep Disorder' label.
label_encoding_dict['Sleep Disorder'].inverse_transform([0])

In [44]:
from sklearn.decomposition import PCA

In [None]:
# Preview the first rows only, consistent with the earlier df.head() preview,
# instead of rendering the whole frame.
df.head()

In [None]:
from sklearn import svm


In [None]:
# The two target columns are split off; everything else is the feature matrix.
y = df[['Stress Level', 'Sleep Disorder']]
X = df.drop(columns=y.columns)

# Individual targets, selected by name rather than by position.
y1 = y['Stress Level']
y2 = y['Sleep Disorder']
X.shape

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation of the 'Sleep Disorder' target (y2);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y2,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Support vector classifier with an RBF kernel, fitted on the training split
# and evaluated on the held-out split.
clf = SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# classification_report returns a multi-line, column-aligned table. Printing
# it on the same line as the title shifted the header row out of alignment,
# so the title now gets its own line.
print('Classification Report')
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

# DecisionBoundaryDisplay can only draw estimators fitted on exactly two
# features, while `clf` was trained on every column of X. The previous version
# of this cell also referenced x1, x2 and train, none of which is defined in
# the notebook, so it could not run.
# For the picture we project the training data onto its first two principal
# components and fit a separate RBF SVC in that 2-D space. It illustrates the
# class regions but is NOT the model evaluated in the classification report.
pca_2d = PCA(n_components=2)
X_train_2d = pca_2d.fit_transform(X_train)
clf_2d = SVC(kernel='rbf').fit(X_train_2d, y_train)

fig, ax = plt.subplots(figsize=(8, 6))

# Shaded regions: class predicted by clf_2d at each point of the plane.
# The former second pass drawing contours at levels [-1, 0, 1] was removed:
# those levels describe the margin of a single binary decision function,
# which presumably does not apply to this target (assumed multi-class).
DecisionBoundaryDisplay.from_estimator(
    clf_2d,
    X_train_2d,
    response_method="predict",
    alpha=0.5,
    ax=ax,
)

# Training samples, coloured by their encoded 'Sleep Disorder' class.
points = ax.scatter(
    X_train_2d[:, 0],
    X_train_2d[:, 1],
    c=y_train,
    s=30,
    edgecolors="k",
)

# Legend entries are decoded back to the original class labels; np.unique
# yields the classes in the same sorted order as legend_elements().
handles, _ = points.legend_elements()
class_names = label_encoding_dict['Sleep Disorder'].inverse_transform(np.unique(y_train))
ax.legend(handles, class_names, title='Sleep Disorder')

ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('RBF SVC decision regions on the first two principal components')
plt.show()

In [40]:
from sklearn.decomposition import PCA

# Keep one component per input feature so the full variance spectrum is
# available. Deriving the count from X (instead of the literal 11) keeps the
# cell working when columns are added to or removed from the feature matrix.
pca = PCA(n_components=X.shape[1])
pca.fit(X)

# Variance explained by each principal component.
t = pca.explained_variance_

In [None]:
# `t` holds the variance explained by each principal component. A component is
# a combination of all input features, so labelling the bars with X.columns
# (as before) wrongly suggested that bar i belongs to feature i. The bars are
# now labelled PC1..PCn and the axes are named.
component_labels = [f'PC{i + 1}' for i in range(len(t))]

fig, ax = plt.subplots()
ax.bar(component_labels, t)
ax.set_xlabel('Principal component')
ax.set_ylabel('Explained variance')
ax.set_title('PCA explained variance per component')
plt.show()