In [18]:
import pandas as pd

file_path = 'relevant_features_dataset.csv'  # Path to the file
df_relevant = pd.read_csv(file_path)
# Print the first 5 rows
print("First 5 rows of the dataset:")
print(df_relevant.head())

First 5 rows of the dataset:
  I_see_myself_as_someone_who_is_anxious_easily_upset  \
0                                   agree moderately    
1                                     agree a little    
2                                   agree moderately    
3                                  disagree strongly    
4                                disagree moderately    

  I_see_myself_as_someone_who_is_reserved_quiet Do_you_have_close_friends  \
0                                agree a little                       yes   
1                             disagree strongly                       yes   
2                              agree moderately                       yes   
3                           disagree moderately                       yes   
4                                agree strongly                       yes   

  I_see_myself_as_someone_who_is_calm_emotionally_stable Gender  \
0                                     agree a little       male   
1                              

### Step 1: Encode Categorical Variables
- **Why**: Machine learning models require numeric inputs, so we convert categorical variables.
- **What**:
  - Use one-hot encoding for non-ordinal categorical features.
  - This ensures each category is represented as a binary column.


In [19]:
# Step 1: Separate Target Column
target_column = "How_often_do_you_feel_stressed"
y = df_relevant[target_column]  # Extract the target variable
X = df_relevant.drop(columns=[target_column])
# Print the first 5 rows
print("First 5 rows of the dataset:")
print(df_relevant.head())

First 5 rows of the dataset:
  I_see_myself_as_someone_who_is_anxious_easily_upset  \
0                                   agree moderately    
1                                     agree a little    
2                                   agree moderately    
3                                  disagree strongly    
4                                disagree moderately    

  I_see_myself_as_someone_who_is_reserved_quiet Do_you_have_close_friends  \
0                                agree a little                       yes   
1                             disagree strongly                       yes   
2                              agree moderately                       yes   
3                           disagree moderately                       yes   
4                                agree strongly                       yes   

  I_see_myself_as_someone_who_is_calm_emotionally_stable Gender  \
0                                     agree a little       male   
1                              

### Step 2: Scale Numerical Features
- **Why**: Numerical features often have different ranges, which can bias models sensitive to magnitude (e.g., SVM, KNN).
- **What**:
  - Standardize numeric features to have a mean of 0 and a standard deviation of 1.


In [20]:
# Step 2: Encode Categorical Variables in Features Only
categorical_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Print the first 5 rows
print("First 5 rows of the dataset:")
print(df_relevant.head())


First 5 rows of the dataset:
  I_see_myself_as_someone_who_is_anxious_easily_upset  \
0                                   agree moderately    
1                                     agree a little    
2                                   agree moderately    
3                                  disagree strongly    
4                                disagree moderately    

  I_see_myself_as_someone_who_is_reserved_quiet Do_you_have_close_friends  \
0                                agree a little                       yes   
1                             disagree strongly                       yes   
2                              agree moderately                       yes   
3                           disagree moderately                       yes   
4                                agree strongly                       yes   

  I_see_myself_as_someone_who_is_calm_emotionally_stable Gender  \
0                                     agree a little       male   
1                              

### Step 3: Define Features and Target
- **Why**: Separate the data into:
  - `X`: Features used to predict the target.
  - `y`: Target variable (stress levels).
- **What**:
  - Drop the target column from features.


In [12]:
# Step 3: Scale numerical features
from sklearn.preprocessing import StandardScaler

# Identify numeric columns in the feature set
numeric_cols = X_encoded.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X_encoded[numeric_cols] = scaler.fit_transform(X_encoded[numeric_cols])

# Print examples after scaling
print("Examples after scaling numerical features:")
print(X_encoded[numeric_cols].head())


Examples after scaling numerical features:
   How_old_were_you_the_first_time_you_smoked_a_full_cigarette_not_just_a_few_puffs  \
0                                          -0.081654                                  
1                                          -0.103493                                  
2                                          -0.081654                                  
3                                          -0.088934                                  
4                                          -0.074375                                  

   How_old_are_you  \
0         1.907435   
1        -0.891016   
2         0.508209   
3        -0.541210   
4        -0.541210   

   Including_yourself_how_many_people_currently_live_in_your_household  \
0                                          -0.059534                     
1                                          -0.059534                     
2                                          -0.059534                     
3    

In [14]:
# Check the class distribution in the target variable
print("Class distribution in target variable:")
print(y.value_counts())


Class distribution in target variable:
How_often_do_you_feel_stressed
frequently      88
occasionally    64
constantly      31
rarely          28
never            1
Name: count, dtype: int64


In [15]:

# Remove rows with the rare class "never"
rare_class = 'never'
X_encoded = X_encoded[y != rare_class]
y = y[y != rare_class]

In [21]:
# Save the preprocessed data (features and target) to separate files
X_encoded_file_path = 'preprocessed_features.csv'
y_file_path = 'preprocessed_target.csv'

# Save the features and target as CSV files
X_encoded.to_csv(X_encoded_file_path, index=False)
y.to_csv(y_file_path, index=False)

