# Data Preprocessing

 **Loading the Data**
   - Uploaded the dataset `signal-data.csv` into the environment using Google Colab

In [None]:
# Colab-only helper used to bring a local file into the runtime.
from google.colab import files
# Run the upload and keep its return value; the saved output below shows
# signal-data.csv being stored under the same name.
uploaded = files.upload()


Saving signal-data.csv to signal-data.csv


In [None]:
import pandas as pd
# Load the uploaded CSV from the working directory into `data`, the DataFrame
# that the following cells read and modify.
data = pd.read_csv('signal-data.csv')


 # Handling Missing Values
- **Identification**:
  Columns with missing values were identified to understand the extent of the issue in the dataset.
  
- **Imputation**:
  - **Numerical columns**: Missing values were replaced with the median to ensure robustness against outliers.
  - **Categorical columns**: Missing values were replaced with the mode to retain the most common value.

- The cleaned dataset was saved for further processing.

In [None]:
# Count the nulls in every column, then display only the columns that
# actually contain at least one missing value.
missing_values = data.isna().sum()
has_missing = missing_values.gt(0)
print(missing_values.loc[has_missing])

0       6
1       7
2      14
3      14
4      14
       ..
585     1
586     1
587     1
588     1
589     1
Length: 538, dtype: int64


In [None]:
# --- Numerical columns: fill gaps with the per-column median -----------------
# Selecting by the generic 'number' kind (instead of comparing against the
# literal strings 'float64' / 'int64') also covers other numeric widths such
# as int32 or float32, which would otherwise be skipped without any warning.
numeric_cols = data.select_dtypes(include='number').columns
if len(numeric_cols) > 0:
    # fillna with a Series matches medians to columns by label, so each column
    # receives its own median in a single vectorised call.
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# --- Categorical (object) columns: fill gaps with the most frequent value ----
for col in data.select_dtypes(include='object').columns:
    modes = data[col].mode()
    # mode() returns an empty Series when every value in the column is missing;
    # indexing it with [0] would raise IndexError, so such a column is left as is.
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])


In [None]:
# Persist the imputed frame to disk; index=False keeps the row index out of the CSV.
data.to_csv('cleaned_data.csv', index=False)


 # Removing Duplicate Records
- **Duplicate Check**:
  The dataset was examined for duplicate records to avoid redundancy in the data.
  
- **Duplicates Found**:
  The number of duplicate records was noted.
  
- **Action Taken**:
  No duplicate records were found (the count is 0), so no rows had to be removed.

In [None]:
# Flag every row that exactly repeats an earlier row, then report the total.
is_repeated_row = data.duplicated()
duplicate_count = is_repeated_row.sum()
print(f"Number of duplicate records: {duplicate_count}")


Number of duplicate records: 0


In [None]:
# Inspect the dtype of every column. In the output below 'Time' is still
# `object`, i.e. it has not been parsed into a datetime type at this point.
print(data.dtypes)  # Look for 'datetime64[ns]' or similar data types


Time          object
0            float64
1            float64
2            float64
3            float64
              ...   
586          float64
587          float64
588          float64
589          float64
Pass/Fail      int64
Length: 592, dtype: object


In [None]:
# List the column labels; the output below shows 'Time', the numbered
# columns '0'..'589', and 'Pass/Fail'.
print(data.columns)


Index(['Time', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '581', '582', '583', '584', '585', '586', '587', '588', '589',
       'Pass/Fail'],
      dtype='object', length=592)


# Time-Based Features
- **Datetime Conversion**:
  The `Time` column was converted to datetime format (values that cannot be parsed become `NaT`) to enable time-based feature extraction.
  
- **New Features Created**:
  - **Day of the Week**: Derived the day name from the datetime column to analyze weekly patterns.
  - **Hour**: Extracted the hour to capture hourly trends.
  - **Elapsed Time**: Calculated the total seconds elapsed since the earliest timestamp to create a relative time feature.

- These features were added to the dataset to enrich the information for analysis.


In [None]:
# Parse 'Time' into datetimes; errors='coerce' turns unparseable entries into NaT
# instead of raising.
time_parsed = pd.to_datetime(data['Time'], errors='coerce')
data['Time'] = time_parsed


In [None]:
# Derive the weekday name from the parsed 'Time' column.
new_columns = pd.DataFrame({
    'day_of_week': data['Time'].dt.day_name()
})
# pd.concat appends blindly, so without the drop below a second run of this
# cell would add another 'day_of_week' column (the later preview shows the
# result as day_of_week / day_of_week.1). Removing any existing copy first
# makes the cell idempotent; errors='ignore' keeps the first run working when
# the column does not exist yet.
data = pd.concat(
    [data.drop(columns=new_columns.columns, errors='ignore'), new_columns],
    axis=1,
)


In [None]:
# Rebind `data` to a fresh copy that carries the weekday name derived from 'Time'.
data = data.assign(day_of_week=data['Time'].dt.day_name())


In [None]:
# Repeats the datetime coercion and weekday derivation done in the cells above;
# re-running it on already-parsed data leaves the same values in place.
time_as_datetime = pd.to_datetime(data['Time'], errors='coerce')
data['Time'] = time_as_datetime
data['day_of_week'] = time_as_datetime.dt.day_name()


In [None]:
# Save a snapshot containing the parsed 'Time' and the 'day_of_week' feature.
# NOTE(review): 'hour' and 'elapsed_time' are only added in later cells, so this
# file will not contain them — confirm whether the save should happen afterwards.
data.to_csv('processed_data.csv', index=False)


In [None]:
# Hour-of-day component taken from the parsed 'Time' column.
data['hour'] = data['Time'].dt.hour


In [None]:
# Check for duplicate records
duplicates = data.duplicated()

# Count the number of duplicates
print(f"Number of duplicate rows: {duplicates.sum()}")

# Preview the duplicate rows, if any
print(data[duplicates])


Number of duplicate rows: 0
Empty DataFrame
Columns: [Time, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, ...]
Index: []

[0 rows x 595 columns]


In [None]:
# Seconds elapsed between each row's timestamp and the earliest timestamp in 'Time'.
earliest_time = data['Time'].min()
time_since_start = data['Time'] - earliest_time
data['elapsed_time'] = time_since_start.dt.total_seconds()


In [None]:
# Preview the first rows (head() defaults to five) to check the newly added
# day_of_week, hour and elapsed_time columns.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,585,586,587,588,589,Pass/Fail,day_of_week,day_of_week.1,hour,elapsed_time
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,2.363,0.0205,0.0148,0.0046,71.9005,-1,Saturday,Saturday,11,16710780.0
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,4.4447,0.0096,0.0201,0.006,208.2045,-1,Saturday,Saturday,12,16713000.0
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,3.1745,0.0584,0.0484,0.0148,82.8602,1,Saturday,Saturday,13,16715700.0
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,2.0544,0.0202,0.0149,0.0044,73.8432,-1,Saturday,Saturday,14,16720860.0
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,99.3032,0.0202,0.0149,0.0044,73.8432,-1,Saturday,Saturday,15,16723200.0


# Normalization of Numerical Features
- **Feature Scaling**:
  - Numerical columns were scaled to ensure all features had comparable ranges.
  - The columns were first standardized and then Min-Max scaled; because Min-Max scaling is applied last, the final values lie between 0 and 1.

- Scaling prevents features with large numeric ranges from dominating models that are sensitive to feature magnitude.

In [None]:
# Columns to scale: every numeric dtype, not only float64/int64. A literal
# dtype list skips other widths — e.g. an int32 'hour' column, which is what
# `.dt.hour` yields on pandas >= 2.0 — and would leave them unscaled.
# NOTE(review): this selection also contains the int64 'Pass/Fail' label, so the
# label is rescaled together with the features — confirm that this is intended.
numerical_cols = data.select_dtypes(include='number').columns


In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the selected numeric columns and write the result back into `data`.
# NOTE(review): the next cell re-fits a MinMaxScaler on these same columns, and
# min-max output is unaffected by a prior per-column standardization, so this
# step looks redundant (the markdown above only mentions Min-Max scaling) —
# confirm and consider removing it.
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the selected numeric columns and store the rescaled
# values back into the frame (default feature range, i.e. 0 to 1).
scaler = MinMaxScaler()
rescaled_values = scaler.fit_transform(data[numerical_cols])
data[numerical_cols] = rescaled_values


In [None]:
# Write the scaled frame to disk; index=False keeps the row index out of the CSV.
data.to_csv('preprocessed_data.csv', index=False)
