# Sleep Analysis with inertial wrist-worn sensors (photoplethysmographic(PPG) sensor)

## Importing Necessary libraries

In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from IPython.display import display

## Loading the datasets

### Statistical features dataset

In [2]:
STATISTICAL_FEATURES = pd.read_excel("C:/Users/chspr/Downloads/SleepStagingStatisticalFeatures.xlsx")

display(STATISTICAL_FEATURES.head())
STATISTICAL_FEATURES.info()

Unnamed: 0,SubNo,SegNo,MeanAbsDev,MedianAbsDev,InterquartileRange,centralMoment,averageCurveLength,averageEnergy,averageTeagerEnergy,shapeFactor,...,PoincareSD2,ratioSD1SD2,CCM,HjorthActivity,HjorthMobility,HjorthComplexity,lam,HFD,KFD,Class
0,1,1,0.912767,0.001655,2.192423,22.657364,-6.406334e-16,1.0,0.99987,0.149008,...,1411.941562,0.039988,1.227427e-08,1.0,0.010411,41.02153,136.561423,1.300122,1.000006,6
1,1,2,0.75676,0.558465,1.127156,1432.670278,2.310716e-12,1.0,0.99987,-0.030211,...,915.793357,1.033208,3.773205e-07,1.0,1.329995,1.251437,84.029234,1.994388,1.055329,6
2,1,3,0.75976,0.556348,1.112695,1753.272469,-3.698185e-12,1.0,0.99987,-0.034199,...,996.863622,1.002636,3.976335e-07,1.0,1.323966,1.264635,83.635191,1.997516,1.055113,6
3,1,4,0.761878,0.570325,1.133696,2110.583808,4.579237e-12,1.0,0.99987,-0.031477,...,1017.009484,0.920768,3.48581e-07,1.0,1.328367,1.260437,87.537957,1.995013,1.055506,6
4,1,5,0.905395,0.003289,2.193957,26.877642,-3.697794e-15,1.0,0.99987,-0.157957,...,1411.574303,0.044405,1.406714e-08,1.0,0.011125,57.923784,112.121415,1.39054,1.000007,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SubNo                1246 non-null   int64  
 1   SegNo                1246 non-null   int64  
 2   MeanAbsDev           1246 non-null   float64
 3   MedianAbsDev         1246 non-null   float64
 4   InterquartileRange   1246 non-null   float64
 5   centralMoment        1246 non-null   float64
 6   averageCurveLength   1246 non-null   float64
 7   averageEnergy        1246 non-null   float64
 8   averageTeagerEnergy  1246 non-null   float64
 9   shapeFactor          1246 non-null   float64
 10  meanValue            1246 non-null   float64
 11  sdValue              1246 non-null   float64
 12  rmsValue             1246 non-null   float64
 13  tmean25              1246 non-null   float64
 14  tmean50              1246 non-null   float64
 15  geometricMean        1246 non-null   f

these are the statistical features which are extracted from photoplethysmographic(PPG) sensor

### Cardio- respiratory features dataset

In [3]:
CARDIO_RESPIRATORY_FEATURES = pd.read_excel("C:/Users/chspr/Downloads/SleepStagingCardioRespiratoryFeatures.xlsx")

display(CARDIO_RESPIRATORY_FEATURES.head())
CARDIO_RESPIRATORY_FEATURES.info()

Unnamed: 0,SubNo,SegNo,MeanAbsDev,MedianAbsDev,InterquartileRange,centralMoment,averageCurveLength,averageEnergy,averageTeagerEnergy,shapeFactor,...,PoincareSD2,ratioSD1SD2,CCM,HjorthActivity,HjorthMobility,HjorthComplexity,lam,HFD,KFD,Class
0,1,1,0.912767,0.001655,2.192423,22.657364,-6.406334e-16,1.0,0.99987,0.149008,...,1411.941562,0.039988,1.227427e-08,1.0,0.010411,41.02153,136.561423,1.300122,1.000006,6
1,1,2,0.75676,0.558465,1.127156,1432.670278,2.310716e-12,1.0,0.99987,-0.030211,...,915.793357,1.033208,3.773205e-07,1.0,1.329995,1.251437,84.029234,1.994388,1.055329,6
2,1,3,0.75976,0.556348,1.112695,1753.272469,-3.698185e-12,1.0,0.99987,-0.034199,...,996.863622,1.002636,3.976335e-07,1.0,1.323966,1.264635,83.635191,1.997516,1.055113,6
3,1,4,0.761878,0.570325,1.133696,2110.583808,4.579237e-12,1.0,0.99987,-0.031477,...,1017.009484,0.920768,3.48581e-07,1.0,1.328367,1.260437,87.537957,1.995013,1.055506,6
4,1,5,0.905395,0.003289,2.193957,26.877642,-3.697794e-15,1.0,0.99987,-0.157957,...,1411.574303,0.044405,1.406714e-08,1.0,0.011125,57.923784,112.121415,1.39054,1.000007,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   SubNo                1246 non-null   int64  
 1   SegNo                1246 non-null   int64  
 2   MeanAbsDev           1246 non-null   float64
 3   MedianAbsDev         1246 non-null   float64
 4   InterquartileRange   1246 non-null   float64
 5   centralMoment        1246 non-null   float64
 6   averageCurveLength   1246 non-null   float64
 7   averageEnergy        1246 non-null   float64
 8   averageTeagerEnergy  1246 non-null   float64
 9   shapeFactor          1246 non-null   float64
 10  meanValue            1246 non-null   float64
 11  sdValue              1246 non-null   float64
 12  rmsValue             1246 non-null   float64
 13  tmean25              1246 non-null   float64
 14  tmean50              1246 non-null   float64
 15  geometricMean        1246 non-null   f

these are the surrogate cardiorespiratory features extracted from photoplethysmographic(PPG) sensor

### Arterial features dataset

In [4]:
ARTERIAL_FEATURES = pd.read_excel("C:/Users/chspr/Downloads/SleepStagingArterialFeatures.xlsx")

display(ARTERIAL_FEATURES.head())
ARTERIAL_FEATURES.info()

Unnamed: 0,SubNo,SegNo,AVppAmp,SDppAmp,SDSDppAmp,RMSSDppAmp,AVpw,SDpw,SDSDpw,RMSSDpw,...,stdArea,meanIPAR,stdIPAR,meanT1,stdT1,meanT2,stdT2,meanIPTR,stdIPTR,Class
0,1,1,0.006709,0.000979,0.000804,0.001411,0.449183,0.132442,0.070278,0.102845,...,0.189781,2.029144,3.688051,26.158333,19.155548,22.216667,14.379067,1.976787,3.628978,6
1,1,2,4.641927,0.707311,0.602373,0.987717,0.415554,0.122198,0.048761,0.08346,...,43.496371,2.604393,4.63497,23.287342,14.096793,23.520253,15.183235,2.645773,4.849708,6
2,1,3,4.576796,0.62015,0.536944,0.855869,0.456689,0.121285,0.06637,0.108959,...,41.207404,1.88772,3.364893,26.347973,15.622294,24.029054,16.438907,1.909369,3.572883,6
3,1,4,4.624681,0.764361,0.62415,1.026587,0.480042,0.138293,0.069621,0.136883,...,58.421931,2.110984,2.799883,24.895333,17.392186,24.774,16.051954,2.14532,2.919238,6
4,1,5,0.019445,0.012603,0.009917,0.011559,0.48041,0.118619,0.069744,0.107834,...,5.80506,2.203302,3.527049,26.808571,15.404135,26.881429,17.412985,2.211676,3.471947,6


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1246 entries, 0 to 1245
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SubNo       1246 non-null   int64  
 1   SegNo       1246 non-null   int64  
 2   AVppAmp     1246 non-null   float64
 3   SDppAmp     1246 non-null   float64
 4   SDSDppAmp   1246 non-null   float64
 5   RMSSDppAmp  1246 non-null   float64
 6   AVpw        1246 non-null   float64
 7   SDpw        1246 non-null   float64
 8   SDSDpw      1246 non-null   float64
 9   RMSSDpw     1246 non-null   float64
 10  meanA1      1246 non-null   float64
 11  stdA1       1246 non-null   float64
 12  meanA2      1246 non-null   float64
 13  stdA2       1246 non-null   float64
 14  meanArea    1246 non-null   float64
 15  stdArea     1246 non-null   float64
 16  meanIPAR    1246 non-null   float64
 17  stdIPAR     1246 non-null   float64
 18  meanT1      1246 non-null   float64
 19  stdT1       1246 non-null  

these are the surrogate arterial blood pressure features extracted from photoplethysmographic(PPG) sensor.

## summary of datasets

### statistical features
The Statistical Features dataset contains 1,246 entries and 33 columns. All the columns have complete data with no missing values. The columns include various statistical measures like Mean Absolute Deviation, Interquartile Range, central moments, and many others. The last column, "Class," seems to represent the sleep stage classification.

#### column description

SubNo:Subject ID,
SegNo: Segment ID,
MeanAbsDev: mean absolute deviation of PPG signal (mV),
MedianAbsDev: median absolute deviation of PPG signal (mV),
InterquartileRange: Interquartile Range of PPG signal (mV), 
centralMoment: central Moment of PPG signal,
averageCurveLength: average Curve Length of PPG signal,
averageEnergy: average Energy of PPG signal,
averageTeagerEnergy : average Teager Energy of PPG signal,
shapeFactor : shape Factorof PPG signal,
meanValue : mean Value of PPG signal (mV),
sdValue : standard deviation of PPG signal (mV),
rmsValue : root mean squire of PPG signal (mV) ,
tmean25 : 25 % trimmed mean of PPG signal (mV) ,
tmean50 : 50 % trimmed mean of PPG signal (mV),
geometricMean : geometric Mean of PPG signal (mV),
harmonicMean : harmonic Mean of PPG signal (mV) ,
maxValue : max Value of PPG signal (mV) ,
minValue : min Value of PPG signal (mV),
svdPPG : singular value decomposition of PPG signal,
skewPPG : skewness of PPG signal,
kurtPPG : kurtosis of PPG signal ,
PoincareSD1 : Poincare SD1 of PPG signal,
PoincareSD2 : Poincare SD1 of PPG signal ,
ratioSD1SD2 : Ration of.Poincare SD1 and SD2 of PPG signal ,
CCM : Complex correlation measure of PPG signal,
HjorthActivity : Hjorth Activity of PPG signal ,
HjorthMobility : Hjorth Mobility of PPG signal,
HjorthComplexity : Hjorth Complexity of PPG signal,
lam : Lyapunov exponent of PPG signal,
HFD :Higuchi Fractal Dimension of PPG signal ,
KFD : Katz?s fractal dimension of PPG signal ,
Class: Label of class.


### Cardio respiratory features
the cardio respiratory features data contains 1246 entries and 33 columns same as statistical festures. this data types and cloumn names are mostly similar, with only slight differences in some features like svdPPI, skewPPI and kurtPPIU. this dataset also contains class cloumn which represents sleep stages.

#### column description

SubNo:Subject ID ,
SegNo: Segment ID ,
MeanAbsDev: mean absolute deviation of peak to peak interval (second) ,
MedianAbsDev: median absolute deviation of peak to peak interval (second),
InterquartileRange: Interquartile Range of peak to peak interval (second) ,
centralMoment: central Moment of peak to peak interval ,
averageCurveLength: average Curve Length of peak to peak interval ,
averageEnergy: average Energy of peak to peak interval,
averageTeagerEnergy : average Teager Energy of peak to peak interval,
shapeFactor : shape Factor of peak to peak interval,
meanValue : mean Value of peak to peak interval (second),
sdValue : standard deviation of peak to peak interval (second),
rmsValue : root mean squire of peak to peak interval (second),
tmean25 : 25 % trimmed mean of peak to peak interval (second),
tmean50 : 50 % trimmed mean of peak to peak interval (second),
geometricMean : geometric Mean of peak to peak interval (second),
harmonicMean : harmonic Mean of peak to peak interval (second),
maxValue : max Value of peak to peak interval (second),
minValue : min Value of peak to peak interval (second),
svdPPI : singular value decomposition of peak to peak interval,
skewPPI : skewness of peak to peak interval,
kurtPPI : kurtosis of peak to peak interval,
PoincareSD1 : Poincare SD1 of peak to peak interval,
PoincareSD2 : Poincare SD1 of peak to peak interval,
ratioSD1SD2 : Ration of.Poincare SD1 and SD2 of peak to peak interval,
CCM : Complex correlation measure of peak to peak interval,
HjorthActivity : Hjorth Activity of peak to peak interval,
HjorthMobility : Hjorth Mobility of peak to peak interval,
HjorthComplexity : Hjorth Complexity of peak to peak interval,
lam : Lyapunov exponent of peak to peak interval,
HFD :Higuchi Fractal Dimension of peak to peak interval,
KFD : Katz?s fractal dimensionof peak to peak interval,
Class: Label of class.

### Arterial features
the arterial features data contains 1246 entries and 25 columns. this dataaset includes features like AVppAmp, SDppAmp, various time-related features, and arterial properties. class column is present in this dataset also.

#### cloumn description

SubNo:Subject ID,
SegNo: Segment ID,
AVppAmp: Average peak to peak amplitude (mV),
SDppAmp : standard deviation of peak to peak amplitude (mV),
SDSDppAmp: standard deviation of successive difference of peak to peak amplitude (mV),
RMSSDppAmp: Root mean square of successive difference of peak to peak amplitude (mV),
AVpw: Average pulse width (second),
SDpw: standard deviation of pulse width(second),
SDSDpw: standard deviation of successive difference of pulse width (second),
RMSSDpw: Root mean square of successive difference of pulse width (second),
meanA1: average of the systolic area (mV-second),
stdA1: standard deviation of systolic area (mV-second),
meanA2: average of the diastolic area (mV-second),
stdA2: standard deviation of diastolic area (mV-second),
meanArea: average of PPG area (mV-second),
stdArea: standard deviation of PPG area (mV-second),
meanIPAR: average of inflection point area ratio,
stdIPAR: standard deviation of inflection point area ratio,
meanT1: average of systolic time (second),
stdT1: standard deviation of systolic time (second),
meanT2: average of diastolic time (second),
stdT2: standard deviation of diastolic time (second),
meanIPTR: average of inflection point time ratio,
stdIPTR: standard deviation of inflection point time ratio,
Class: Label of class.

## Merging the datasets

In [5]:
# Load the datasets
STATISTICAL_FEATURES_PATH = "C:/Users/chspr/Downloads/SleepStagingStatisticalFeatures.xlsx"
CARDIO_RESPIRATORY_FEATURES_PATH = "C:/Users/chspr/Downloads/SleepStagingCardioRespiratoryFeatures.xlsx"
ARTERIAL_FEATURES_PATH = "C:/Users/chspr/Downloads/SleepStagingArterialFeatures.xlsx"

# Adjusting the sheet names to match between the files for accurate loading
sheet_name_mapping = {
    'Subject 1': 'Subject ID 1',
    'Subject 2': 'Subject ID 2',
    'Subject 3': 'Subject ID 3',
    'Subject 4': 'Subject ID 4',
    'Subject 5': 'Subject ID 5',
    'Subject 6': 'Subject ID 6',
    'Subject 7': 'Subject ID 7',
    'Subject 8': 'Subject ID 8',
    'Subject 9 ': 'Subject ID 9',
    'Subject 10': 'Subject ID 10'
}

def merge_data(sheet_name_stat, sheet_name_cardio_art):
    """
    Merges statistical, cardio-respiratory, and arterial features data for a given subject.

    Args:
        sheet_name_stat (str): The sheet name for statistical features from the STATISTICAL_FEATURES_PATH file.
        sheet_name_cardio_art (str): The sheet name for cardio-respiratory and arterial features from the respective files.

    Returns:
        pd.DataFrame: A merged DataFrame with statistical, cardio-respiratory, and arterial features, with the 'Class' column moved to the end.
    """
    df_stat = pd.read_excel(STATISTICAL_FEATURES_PATH, sheet_name=sheet_name_stat)
    df_cardio = pd.read_excel(CARDIO_RESPIRATORY_FEATURES_PATH, sheet_name=sheet_name_cardio_art)
    df_art = pd.read_excel(ARTERIAL_FEATURES_PATH, sheet_name=sheet_name_cardio_art)
    
    # Merging dataframes on 'SubNo', 'SegNo', and 'Class'
    df_merged = pd.merge(df_stat, df_cardio, on=['SubNo', 'SegNo', 'Class'], suffixes=('_stat', '_cardio'))
    df_merged = pd.merge(df_merged, df_art, on=['SubNo', 'SegNo', 'Class'])
    
    # Moving 'Class' column to the end
    class_col = df_merged.pop('Class')
    df_merged['Class'] = class_col
    
    return df_merged

In [6]:
# Merging sheets for each subject
merged_data = {subject_name: merge_data(subject_name, mapped_name) for subject_name, mapped_name in sheet_name_mapping.items()}


# Saving the merged data to a new Excel file
OUTPUT_EXCEL_PATH = 'C:/Users/chspr/OneDrive/Desktop/SIT782/Merged_SleepStagingData.xlsx'
with pd.ExcelWriter(OUTPUT_EXCEL_PATH, engine='xlsxwriter') as writer:
    for subject_name, data in merged_data.items():
        cleaned_sheet_name = subject_name.strip()
        data.to_excel(writer, sheet_name=cleaned_sheet_name, index=False)

# Confirm the path for download
OUTPUT_EXCEL_PATH

'C:/Users/chspr/OneDrive/Desktop/SIT782/Merged_SleepStagingData.xlsx'

In [7]:
# displaying the head of merged dataset
merged_data['Subject 1'].head()

Unnamed: 0,SubNo,SegNo,MeanAbsDev_stat,MedianAbsDev_stat,InterquartileRange_stat,centralMoment_stat,averageCurveLength_stat,averageEnergy_stat,averageTeagerEnergy_stat,shapeFactor_stat,...,stdArea,meanIPAR,stdIPAR,meanT1,stdT1,meanT2,stdT2,meanIPTR,stdIPTR,Class
0,1,1,0.912767,0.001655,2.192423,22.657364,-6.406334e-16,1.0,0.99987,0.149008,...,0.189781,2.029144,3.688051,26.158333,19.155548,22.216667,14.379067,1.976787,3.628978,6
1,1,2,0.75676,0.558465,1.127156,1432.670278,2.310716e-12,1.0,0.99987,-0.030211,...,43.496371,2.604393,4.63497,23.287342,14.096793,23.520253,15.183235,2.645773,4.849708,6
2,1,3,0.75976,0.556348,1.112695,1753.272469,-3.698185e-12,1.0,0.99987,-0.034199,...,41.207404,1.88772,3.364893,26.347973,15.622294,24.029054,16.438907,1.909369,3.572883,6
3,1,4,0.761878,0.570325,1.133696,2110.583808,4.579237e-12,1.0,0.99987,-0.031477,...,58.421931,2.110984,2.799883,24.895333,17.392186,24.774,16.051954,2.14532,2.919238,6
4,1,5,0.905395,0.003289,2.193957,26.877642,-3.697794e-15,1.0,0.99987,-0.157957,...,5.80506,2.203302,3.527049,26.808571,15.404135,26.881429,17.412985,2.211676,3.471947,6


### checking for missing values

In [8]:
missing_values_summary = {sheet_name: data.isnull().sum().sum() for sheet_name, data in merged_data.items()}
missing_values_summary

{'Subject 1': 0,
 'Subject 2': 0,
 'Subject 3': 0,
 'Subject 4': 0,
 'Subject 5': 0,
 'Subject 6': 0,
 'Subject 7': 0,
 'Subject 8': 0,
 'Subject 9 ': 0,
 'Subject 10': 0}

## data preprocessing

the data in the given features are of different scales. so, we use standardization technique to normalise the data, where each feature will be scaled to have a mean of 0 and a standard deviation of 1. we are not normalising the class feature as it represents categorical label.

In [9]:
# Initializing the MinMaxScaler
scaler = MinMaxScaler()

# Apply normalization to each sheet in the dataset
normalized_data = {}

for sheet_name, data in merged_data.items():
    # Select the features for normalization (excluding SubNo, SegNo, Class)
    features_to_scale = data.drop(columns=['SubNo', 'SegNo', 'Class'])
    
    # Apply Min-Max scaling
    scaled_features = scaler.fit_transform(features_to_scale)
    
    # Create a new DataFrame with normalized features
    normalized_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)
    
    # Add the SubNo, SegNo, and Class columns back to the DataFrame
    normalized_df['SubNo'] = data['SubNo'].values
    normalized_df['SegNo'] = data['SegNo'].values
    normalized_df['Class'] = data['Class'].values
    
    # Reorder columns to match original order
    normalized_df = normalized_df[['SubNo', 'SegNo'] + list(features_to_scale.columns) + ['Class']]
    
    # Store the normalized DataFrame
    normalized_data[sheet_name] = normalized_df

# Display the first few rows of the normalized data for the first sheet
normalized_data['Subject 1'].head()

Unnamed: 0,SubNo,SegNo,MeanAbsDev_stat,MedianAbsDev_stat,InterquartileRange_stat,centralMoment_stat,averageCurveLength_stat,averageEnergy_stat,averageTeagerEnergy_stat,shapeFactor_stat,...,stdArea,meanIPAR,stdIPAR,meanT1,stdT1,meanT2,stdT2,meanIPTR,stdIPTR,Class
0,1,1,1.0,0.0,0.998804,0.00028,0.446702,0.390625,0.390625,1.0,...,0.0,0.002556,0.000923,0.030975,0.348986,0.0,0.227683,0.590028,0.301126,6
1,1,2,0.321775,0.599784,0.168419,0.052901,0.725939,0.484375,0.359375,0.416158,...,0.296637,0.003422,0.001161,0.0,0.251195,0.016368,0.241006,0.82931,0.402638,6
2,1,3,0.334816,0.597503,0.157147,0.064866,0.0,0.296875,0.078125,0.403166,...,0.280958,0.002343,0.000842,0.033021,0.280684,0.022756,0.261809,0.565914,0.296461,6
3,1,4,0.344024,0.612559,0.173517,0.078201,1.0,0.21875,0.15625,0.412034,...,0.398872,0.002679,0.0007,0.017348,0.314898,0.03211,0.255398,0.650309,0.242105,6
4,1,5,0.967948,0.00176,1.0,0.000438,0.446333,0.453125,0.484375,0.0,...,0.038463,0.002818,0.000883,0.03799,0.276467,0.058571,0.277946,0.674043,0.288067,6


## Finding Outliers

There are many ways to find outliers in the dataset. here we are using Z-Scores to find the outliers. we will calculate the Z-Scores for all numerical columns, and identify any data points that have a Z-score less than -3 or greater than 3.

In [10]:
# Applying Z-score calculation for each sheet and identifying outliers
outlier_data = {}

for sheet_name, data in normalized_data.items():
    numerical_cols = data.select_dtypes(include='number').columns
    z_scores = data[numerical_cols].apply(zscore)
    
    # checking outliers for numerical column with a Z-score greater than 3 or less than -3
    outliers = (z_scores.abs() > 3).any(axis=1)
    
    # storing the outliers
    outlier_data[sheet_name] = data[outliers]

# Display the number of outliers
outlier_data['Subject 1'], outlier_data['Subject 1'].shape

(      SubNo  SegNo  MeanAbsDev_stat  MedianAbsDev_stat  \
 0         1      1         1.000000           0.000000   
 1         1      2         0.321775           0.599784   
 2         1      3         0.334816           0.597503   
 3         1      4         0.344024           0.612559   
 4         1      5         0.967948           0.001760   
 ...     ...    ...              ...                ...   
 1177      1   1178         0.479719           0.674285   
 1181      1   1182         0.625460           0.783979   
 1217      1   1218         0.581697           0.763972   
 1229      1   1230         0.393835           0.615417   
 1238      1   1239         0.636963           0.812169   
 
       InterquartileRange_stat  centralMoment_stat  averageCurveLength_stat  \
 0                    0.998804            0.000280                 0.446702   
 1                    0.168419            0.052901                 0.725939   
 2                    0.157147            0.064866   

### Handling the outliers

now to handle outliers we replace values where the Z-score is greater than 3 with the value corresponding to a Z-score of 3, and values less than -3 with the value at -3.

In [11]:
from scipy.stats import zscore

# Defining a function to cap values based on Z-score thresholds
def cap_values(input_data, z_threshold=3):
    """
    Caps values in numerical columns based on a Z-score threshold.
    
    Args:
        input_data (pd.DataFrame): The input DataFrame containing numerical columns.
        z_threshold (float): The Z-score threshold for capping values. Defaults to 3.
    
    Returns:
        pd.DataFrame: A DataFrame with capped values for numerical columns.
    """
    numerical_columns = input_data.select_dtypes(include='number').columns
    column_z_scores = input_data[numerical_columns].apply(zscore)
    
    # Calculating the capping thresholds
    cap_high = input_data[numerical_columns] + (z_threshold - column_z_scores) * input_data[numerical_columns].std(ddof=0)
    cap_low = input_data[numerical_columns] - (column_z_scores + z_threshold) * input_data[numerical_columns].std(ddof=0)
    
    # Applying capping
    data_capped = input_data.copy()
    data_capped[numerical_columns] = input_data[numerical_columns].where(~(column_z_scores > z_threshold), other=cap_high)
    data_capped[numerical_columns] = data_capped[numerical_columns].where(~(column_z_scores < -z_threshold), other=cap_low)
    
    return data_capped

# Applying the capping to each subject's data
capped_data = {sheet_name: cap_values(data) for sheet_name, data in normalized_data.items()}

# Displaying the results
display(capped_data['Subject 1'].head())
display(capped_data['Subject 1'].shape)


Unnamed: 0,SubNo,SegNo,MeanAbsDev_stat,MedianAbsDev_stat,InterquartileRange_stat,centralMoment_stat,averageCurveLength_stat,averageEnergy_stat,averageTeagerEnergy_stat,shapeFactor_stat,...,stdArea,meanIPAR,stdIPAR,meanT1,stdT1,meanT2,stdT2,meanIPTR,stdIPTR,Class
0,1,1,1.0,0.587142,0.889978,0.00028,0.446702,0.390625,0.390625,0.684714,...,0.0,0.002556,0.000923,0.197827,0.348986,0.023252,0.227683,0.35283,0.215816,6
1,1,2,0.342853,0.599784,0.174588,0.052901,0.512592,0.484375,0.359375,0.416158,...,0.296637,0.003422,0.001161,0.197827,0.251195,0.023252,0.241006,0.35283,0.215816,6
2,1,3,0.342853,0.597503,0.174588,0.064866,0.381575,0.296875,0.078125,0.403166,...,0.280958,0.002343,0.000842,0.197827,0.280684,0.023252,0.261809,0.35283,0.215816,6
3,1,4,0.344024,0.612559,0.174588,0.078201,0.512592,0.21875,0.15625,0.412034,...,0.398872,0.002679,0.0007,0.197827,0.314898,0.03211,0.255398,0.35283,0.215816,6
4,1,5,0.967948,0.587142,0.889978,0.000438,0.446333,0.453125,0.484375,0.262362,...,0.038463,0.002818,0.000883,0.197827,0.276467,0.058571,0.277946,0.35283,0.215816,6


(1246, 85)

## building machine learning model

#### Splitting the data into test and train sets

In [12]:
# Assuming 'Class' is the target variable
features = capped_data['Subject 1'].drop('Class', axis=1)
target = capped_data['Subject 1']['Class']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

### Support Vector Machine(SVM)

In [13]:
# Initialize the SVM Classifier
svm_model = SVC(kernel='rbf', C=1.0, random_state=42)

# Train the model
svm_model.fit(X_train, y_train)

# Save the trained model to a .pkl file
PKL_FILENAME = "C:/Users/chspr/OneDrive/Desktop/SIT782/svm_of_subject1.pkl"
with open(PKL_FILENAME, 'wb') as file:
    pickle.dump(svm_model, file)

# Output the filename where the model is saved
PKL_FILENAME

'C:/Users/chspr/OneDrive/Desktop/SIT782/svm_of_subject1.pkl'

In [14]:
# Predicting the test set results
y_pred_svm = svm_model.predict(X_test)

# Evaluating the model
classification_report(y_test, y_pred_svm)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           1       0.00      0.00      0.00        34\n           2       0.00      0.00      0.00        52\n           3       0.00      0.00      0.00         9\n           4       0.00      0.00      0.00        20\n           5       0.00      0.00      0.00         7\n           6       0.51      1.00      0.68       128\n\n    accuracy                           0.51       250\n   macro avg       0.09      0.17      0.11       250\nweighted avg       0.26      0.51      0.35       250\n'

### Random Forest

In [15]:
# Initialize the RandomForest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
PKL_FILENAME = "C:/Users/chspr/OneDrive/Desktop/SIT782/randomforest_of_subject1.pkl"
with open(PKL_FILENAME, 'wb') as file:
    pickle.dump(model, file)

# Output the filename where the model is saved
PKL_FILENAME

'C:/Users/chspr/OneDrive/Desktop/SIT782/randomforest_of_subject1.pkl'

In [16]:
# Predicting the test set results
y_pred = model.predict(X_test)

# Evaluating the model
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           1       0.65      0.38      0.48        34\n           2       0.62      0.69      0.65        52\n           3       0.50      0.11      0.18         9\n           4       0.82      0.90      0.86        20\n           5       1.00      0.14      0.25         7\n           6       0.82      0.94      0.87       128\n\n    accuracy                           0.76       250\n   macro avg       0.73      0.53      0.55       250\nweighted avg       0.75      0.76      0.73       250\n'

The results from the SVM are quite concerning, showing a very poor performance for classes other than class 6, where it achieved a precision of 0.51 with a recall of 1.00, indicating it has predicted most data points as class 6.

On the other hand, the Random forest model shows much better performance eith an accuracy of 76% and across all classes, with particularly strong results for classes 4 and 6. This suggests random forst might be a better fit for this dataset, or it handles the class imbalance more effectively.

### Now we build prediction model for all the subjects combined

In [17]:
# Combine the capped data from all subjects into a single DataFrame for comprehensive analysis
combined_data = pd.concat(capped_data.values(), ignore_index=True)

# Split the data into features and target
X = combined_data.drop(columns=['SubNo', 'SegNo', 'Class'])
y = combined_data['Class']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes to confirm successful split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7542, 82), (1886, 82), (7542,), (1886,))

#### Logistic regression

In [18]:
# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)  # Increased iterations for better convergence
lr_model.fit(X_train, y_train)

# Save the trained model to a .pkl file
PKL_FILENAME = "C:/Users/chspr/OneDrive/Desktop/SIT782/lr_of_combineddata.pkl"
with open(PKL_FILENAME, 'wb') as file:
    pickle.dump(lr_model, file)

# Output the filename where the model is saved
PKL_FILENAME

# Predict on the test set
lr_predictions = lr_model.predict(X_test)

# Evaluation using classification report
lr_report = classification_report(y_test, lr_predictions)
lr_report

'              precision    recall  f1-score   support\n\n           1       0.44      0.16      0.24       233\n           2       0.45      0.63      0.52       529\n           3       0.36      0.08      0.13       223\n           4       0.40      0.03      0.05        77\n           5       0.54      0.23      0.33       163\n           6       0.60      0.85      0.71       661\n\n    accuracy                           0.53      1886\n   macro avg       0.47      0.33      0.33      1886\nweighted avg       0.50      0.53      0.47      1886\n'

with all the subjects combined we have acheived the accuracy of 53% which is less compared to individual accuracy of subject 1

#### Random forest

In [19]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)

# Save the trained model to a .pkl file
PKL_FILENAME = "C:/Users/chspr/OneDrive/Desktop/SIT782/randomforest_of_combinedata.pkl"
with open(PKL_FILENAME, 'wb') as file:
    pickle.dump(rf_model, file)

# Output the filename where the model is saved
PKL_FILENAME

# Predict on the test set
rf_predictions = rf_model.predict(X_test)

# Evaluation using classification report
rf_report = classification_report(y_test, rf_predictions)
rf_report

'              precision    recall  f1-score   support\n\n           1       0.49      0.30      0.37       233\n           2       0.58      0.71      0.64       529\n           3       0.64      0.41      0.50       223\n           4       0.69      0.53      0.60        77\n           5       0.80      0.50      0.62       163\n           6       0.73      0.88      0.80       661\n\n    accuracy                           0.66      1886\n   macro avg       0.66      0.56      0.59      1886\nweighted avg       0.65      0.66      0.64      1886\n'

The Random Forest model produced accuracy of around 66%. it performed significantly better than the Logistic Regression model, showing improved accuracy and better handling of class imbalances.