In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import svm
from sklearn.metrics import *

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



## Step-by-Step Guide

**Step 1: Import Libraries and Extract Files**
- First, import the necessary libraries and extract the files from the ZIP archive. Additionally, list the contents of the extraction directory to confirm the files are present.
- If the extracted files are still in zip format, we need to unzip these individual CSV files as well. 

**os.listdir:** Lists the files in the extraction directory to confirm they were extracted correctly.

In [4]:
import os

# Report the directory that relative paths in this notebook resolve against
current_dir = os.getcwd()
print("Current working directory:", current_dir)


Current working directory: C:\Users\avary


In [6]:
import os

# Show every entry of the current working directory
working_dir = os.getcwd()
print("Files in current directory:", os.listdir(working_dir))


Files in current directory: ['.anaconda', '.cisco', '.conda', '.condarc', '.continuum', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.ms-ad', '.virtual_documents', '3D Objects', 'anaconda3', 'AppData', 'Application Data', 'Bosch Project', 'bosch-production-line-performance.zip', 'bosch_data', 'Box', 'Char - Conceptual.csv', 'Char - Procedural.csv', 'Contacts', 'Cookies', 'Creative Cloud Files', 'Cronbach leave-one item-out.ipynb', 'cronbach_alpha_results.csv', 'cronbach_alpha_results.xlsx', 'Desktop', 'Documents', 'Downloads', 'Exp_Survey.xlsx', 'Favorites', 'Homoscedasticity.ipynb', 'Krippendorff_Alpha.xlsx', 'Links', 'Local Settings', 'MicrosoftEdgeBackups', 'Music', 'My Documents', 'NetHood', 'Normality_Cheking.py', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{578b3544-d63b-11eb-8d8f-8c7d9e0940b4}.TxR.0.regtrans-ms', 'NTUSER.DAT{578b3544-d63b-11eb-8d8f-8c7d9e0940b4}.TxR.1.regtrans-ms', 'NTUSER.DAT{578b3544-d63b-11eb-8d8f-8c7d9e0940b4}.TxR.2.regtra

In [8]:
import zipfile
import os

# Archive to unpack and the directory that receives its contents.
# Both names are assigned up front so that later cells can always reference
# `extraction_dir` and `extracted_files`, even when extraction is skipped
# or fails.
zip_file_path = 'bosch-production-line-performance.zip'
extraction_dir = 'bosch_data'

# Start from whatever is already in the extraction directory (empty list if
# the directory does not exist yet).
extracted_files = os.listdir(extraction_dir) if os.path.isdir(extraction_dir) else []

# Check if the file exists and is not empty
if not os.path.exists(zip_file_path) or os.path.getsize(zip_file_path) == 0:
    print("The zip file does not exist or is empty. Please check the file.")
else:
    try:
        # Extract the zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extraction_dir)

        # List the extracted files
        extracted_files = os.listdir(extraction_dir)
        print("Extracted files:", extracted_files)
    except zipfile.BadZipFile:
        print("Error: The file is not a valid zip file or is corrupted.")
    except Exception as e:
        # Best-effort cell: report the problem instead of stopping the notebook
        print(f"An unexpected error occurred: {e}")


Extracted files: ['.ipynb_checkpoints', 'sample_submission.csv', 'sample_submission.csv.zip', 'test_categorical.csv', 'test_categorical.csv.zip', 'test_date.csv', 'test_date.csv.zip', 'test_numeric.csv', 'test_numeric.csv.zip', 'train_categorical.csv', 'train_categorical.csv.zip', 'train_date.csv', 'train_date.csv.zip', 'train_numeric.csv', 'train_numeric.csv.zip']


In [10]:

# Unpack every nested per-table archive found in the extraction directory
nested_archives = [entry for entry in extracted_files if entry.endswith('.zip')]
for archive_name in nested_archives:
    archive_path = os.path.join(extraction_dir, archive_name)
    with zipfile.ZipFile(archive_path, 'r') as nested_zip:
        nested_zip.extractall(extraction_dir)

# Refresh the listing now that the nested archives have been unpacked
extracted_files = os.listdir(extraction_dir)
print("Files after nested extraction:", extracted_files)

# Walk the extraction directory and print the path of every file in it
for current_root, _subdirs, file_names in os.walk(extraction_dir):
    for file_name in file_names:
        print(os.path.join(current_root, file_name))

Files after nested extraction: ['.ipynb_checkpoints', 'sample_submission.csv', 'sample_submission.csv.zip', 'test_categorical.csv', 'test_categorical.csv.zip', 'test_date.csv', 'test_date.csv.zip', 'test_numeric.csv', 'test_numeric.csv.zip', 'train_categorical.csv', 'train_categorical.csv.zip', 'train_date.csv', 'train_date.csv.zip', 'train_numeric.csv', 'train_numeric.csv.zip']
bosch_data\sample_submission.csv
bosch_data\sample_submission.csv.zip
bosch_data\test_categorical.csv
bosch_data\test_categorical.csv.zip
bosch_data\test_date.csv
bosch_data\test_date.csv.zip
bosch_data\test_numeric.csv
bosch_data\test_numeric.csv.zip
bosch_data\train_categorical.csv
bosch_data\train_categorical.csv.zip
bosch_data\train_date.csv
bosch_data\train_date.csv.zip
bosch_data\train_numeric.csv
bosch_data\train_numeric.csv.zip


In [12]:
import pandas as pd
import os

def _preview_features(csv_path, label, nrows=5):
    """Print a quick look at the first ``nrows`` rows of one feature file.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to sample.
    label : str
        Lower-case name of the feature group used in the printed messages
        (e.g. ``'numeric'``).
    nrows : int, default 5
        Number of data rows read from the top of the file.

    Returns
    -------
    pandas.DataFrame
        The rows that were read.
    """
    features = pd.read_csv(csv_path, nrows=nrows)
    print(f"Test {label.capitalize()} Features Head (first {nrows} rows):")
    print(features.head(nrows))

    # Column count and names
    print(f"\nNumber of test {label} features: {len(features.columns)}")
    print(f"Test {label.capitalize()} feature names: {features.columns.tolist()}")

    # Only `nrows` rows were loaded, so these statistics describe the preview
    # rather than the whole file.
    print(f"\nSummary statistics for test {label} features:")
    print(features.describe())
    return features


# Define the paths to the extracted CSV files for the test set
test_numeric_path = os.path.join(extraction_dir, 'test_numeric.csv')
test_categorical_path = os.path.join(extraction_dir, 'test_categorical.csv')

# Check if files exist and print the paths
print("test_numeric_path:", test_numeric_path)
print("test_categorical_path:", test_categorical_path)
print("test_numeric_path exists:", os.path.exists(test_numeric_path))
print("test_categorical_path exists:", os.path.exists(test_categorical_path))

# Preview the numeric and the categorical test features with the same routine
test_numeric_features = _preview_features(test_numeric_path, 'numeric')
print()  # blank line separating the two previews
test_categorical_features = _preview_features(test_categorical_path, 'categorical')



test_numeric_path: bosch_data\test_numeric.csv
test_categorical_path: bosch_data\test_categorical.csv
test_numeric_path exists: True
test_categorical_path exists: True
Test Numeric Features Head (first 5 rows):
   Id  L0_S0_F0  L0_S0_F2  L0_S0_F4  L0_S0_F6  L0_S0_F8  L0_S0_F10  L0_S0_F12  \
0   1       NaN       NaN       NaN       NaN       NaN        NaN        NaN   
1   2       NaN       NaN       NaN       NaN       NaN        NaN        NaN   
2   3       NaN       NaN       NaN       NaN       NaN        NaN        NaN   
3   5    -0.016    -0.026    -0.033    -0.016     0.205     -0.157        0.0   
4   8       NaN       NaN       NaN       NaN       NaN        NaN        NaN   

   L0_S0_F14  L0_S0_F16  ...  L3_S50_F4243  L3_S50_F4245  L3_S50_F4247  \
0        NaN        NaN  ...           NaN           NaN           NaN   
1        NaN        NaN  ...           NaN           NaN           NaN   
2        NaN        NaN  ...           NaN           NaN           NaN   
3     

## Difference Between Inner and Left Joins

**Inner Join:**

- Definition: Returns only the rows where there is a match in both datasets.
- Effect: If an Id is missing in any dataset, the corresponding rows will be excluded from the result.
- Usage: Useful when you want to ensure that only complete data points across all datasets are included.

**Left Join:**

- Definition: Returns all rows from the left dataset, and the matched rows from the right dataset. If no match is found, NaN values are filled in for columns from the right dataset.
- Effect: Keeps all rows from the left dataset, regardless of whether there is a matching row in the right dataset.
- Usage: Useful when you want to retain all rows from the primary dataset (left) and add information from the secondary dataset (right) when available.

## 1. Data Preparation in Test Set: Sequence for handling missing values, and applying PCA for feature reduction

**1.1. Handle Missing Values:** First, address any missing values in the dataset. This ensures that the dataset is complete and prevents issues during the balancing and PCA steps.
   
   - Given the kernel keeps dying, we can implement a more memory-efficient approach by processing each chunk individually and not storing all processed chunks in memory simultaneously. We can achieve this by saving intermediate results to disk.
       
       - ***Step-by-Step Implementation***
         
         - ***1.1.1. Identify Columns to Drop:*** First, iterate over each file to identify columns with more than 40% missing values across all chunks.
         
         - ***1.1.2. Process Each Chunk Separately:*** Drop the identified columns, fill remaining missing values with zero, and save the processed chunks to disk.
              
               -  1.1.2.1. Given the context of the Bosch production-line-performance dataset, filling missing values with zero can be appropriate for several reasons:
                            - 1.1.2.1.1. Absence of a measurement: a missing value indicates that no measurement was taken, which can logically be represented by zero.
                            - 1.1.2.1.2. Consistency across feature types: since the dataset is divided into numerical, categorical, and date features, applying a consistent strategy (e.g., filling with zero) can simplify the preprocessing steps and ensure uniformity.
                                       
         - ***1.1.3. Check and Remove Duplicates Before Merging:*** Verify whether there are any duplicate Ids in each dataset and remove them if they exist.
       
         - ***1.1.4. Merge Processed Data:*** Load the processed chunks from disk and merge them using the Id column with an inner join to ensure only complete data points are included (if an Id is missing in any dataset, the corresponding rows will be excluded from the result).

**Do NOT Balance the Classes in the Test Set:** The test set must be preprocessed in the same way as the training set, but without class balancing, since there is no 'Response' column in the test data.

**1.2. Convert Categorical Variables to Dummy Variables:** There are some non-numeric values (categorical features) in the dataset that cause issues with the StandardScaler used before PCA. These non-numeric values need to be handled before standardizing the features. One-hot encoding will be used to handle non-numeric (categorical) features. This will convert categorical variables into a series of binary (dummy) variables.

**1.3. Apply PCA Using Training Set Transformation:** To ensure consistency between the training and test sets, I applied the same PCA transformation derived from the training set to the test set.
                 
                 - First, save the PCA model and scaler from the training notebook to disk. Then load these in the test notebook and apply the same transformations. In the training notebook, after applying PCA to the training set, save the scaler and PCA model.


 - **After handling missing values, the remaining missing values per column are shown as "Missing values after handling:".**

In [14]:
#1.1.1 Identify Columns to Drop

import pandas as pd
import os

# Locations of the raw test-set tables inside the extraction directory
test_numeric_path, test_categorical_path = (
    os.path.join('bosch_data', table_file)
    for table_file in ('test_numeric.csv', 'test_categorical.csv')
)
# test_date_path = os.path.join('bosch_data', 'test_date.csv')

# Function to collect missing values per column in chunks
def collect_missing_values(file_path, chunk_size=1000, verbose=True):
    """Compute the fraction of missing values in every column of a CSV file.

    The file is streamed in chunks of ``chunk_size`` rows, so it is never
    loaded into memory as a whole.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to scan.
    chunk_size : int, default 1000
        Number of rows read per chunk.
    verbose : bool, default True
        When True, print a progress line for every chunk. Pass False to keep
        the cell output short on large files.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the number of null cells in
        that column divided by the total number of rows read. An empty
        Series is returned if the reader yields no chunk at all.
    """
    missing_counts = None
    total_rows = 0

    # The context manager closes the underlying file even if a chunk fails
    with pd.read_csv(file_path, chunksize=chunk_size, engine='python') as reader:
        for chunk_counter, chunk in enumerate(reader, start=1):
            if verbose:
                print(f"Processing chunk {chunk_counter}...")
            chunk_missing = chunk.isnull().sum()
            if missing_counts is None:
                missing_counts = chunk_missing
            else:
                missing_counts = missing_counts + chunk_missing
            total_rows += len(chunk)

    # Guard against `None / 0` when nothing was read
    if missing_counts is None:
        return pd.Series(dtype=float)

    missing_proportion = missing_counts / total_rows
    return missing_proportion

# Scan each raw table and record the share of missing values per column
print("Collecting missing values for numeric data...")
numeric_missing_values = collect_missing_values(test_numeric_path)
print("Collecting missing values for categorical data...")
categorical_missing_values = collect_missing_values(test_categorical_path)
# print("Collecting missing values for date data...")
# date_missing_values = collect_missing_values(test_date_path)

# A column is dropped when more than 40% of its values are missing.
# NOTE(review): the drop list is derived from the test files themselves;
# confirm it matches the columns removed in the training notebook.
columns_to_drop_numeric = [
    column for column, share in numeric_missing_values.items() if share > 0.4
]
columns_to_drop_categorical = [
    column for column, share in categorical_missing_values.items() if share > 0.4
]
# columns_to_drop_date = date_missing_values[date_missing_values > 0.4].index.tolist()

print(f"Columns to drop in numeric data: {columns_to_drop_numeric}")
print(f"Columns to drop in categorical data: {columns_to_drop_categorical}")
# print(f"Columns to drop in date data: {columns_to_drop_date}")


Collecting missing values for numeric data...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42.

In [15]:

#1.1.2. Process Each Chunk Separately

# Function to process each chunk and save to disk
def process_and_save_chunks(file_path, columns_to_drop, output_file, chunk_size=1000, verbose=True):
    """Clean a CSV file chunk by chunk and write the result to ``output_file``.

    For every chunk the listed columns are dropped and the remaining missing
    values are replaced with zero; the cleaned chunk is then appended to the
    output file, so only one chunk is held in memory at a time.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    columns_to_drop : list of str
        Columns removed from every chunk. A name that is not present in the
        file raises ``KeyError`` (pandas' default for ``drop``).
    output_file : str
        Path of the CSV file to write. An existing file is replaced.
    chunk_size : int, default 1000
        Number of rows read per chunk.
    verbose : bool, default True
        When True, print progress lines for every chunk.

    Returns
    -------
    None
    """
    # Remove stale output so a re-run never appends to an earlier result
    if os.path.exists(output_file):
        os.remove(output_file)

    # The context manager closes the input file even if a chunk fails
    with pd.read_csv(file_path, chunksize=chunk_size, engine='python') as reader:
        for chunk_counter, chunk in enumerate(reader, start=1):
            if verbose:
                print(f"Processing chunk {chunk_counter}...")

            # Drop columns with more than 40% missing values, then fill the
            # remaining missing values with zero (no in-place mutation)
            processed = chunk.drop(columns=columns_to_drop).fillna(0)

            # The header is written exactly once, with the first chunk; this
            # no longer depends on re-checking the filesystem per chunk
            is_first_chunk = chunk_counter == 1
            processed.to_csv(
                output_file,
                mode='w' if is_first_chunk else 'a',
                header=is_first_chunk,
                index=False,
            )
            if verbose:
                print(f"Chunk {chunk_counter} processed and saved.")

# Raw test-set tables to clean
test_numeric_path = os.path.join('bosch_data', 'test_numeric.csv')
test_categorical_path = os.path.join('bosch_data', 'test_categorical.csv')
# test_date_path = os.path.join('bosch_data', 'test_date.csv')

# Clean the numeric table first, then the categorical one; each entry is
# (raw file, columns to drop, cleaned output file)
cleaning_jobs = (
    (test_numeric_path, columns_to_drop_numeric, 'test_numeric_processed.csv'),
    (test_categorical_path, columns_to_drop_categorical, 'test_categorical_processed.csv'),
)
for raw_path, drop_list, cleaned_file in cleaning_jobs:
    process_and_save_chunks(raw_path, drop_list, cleaned_file)

# Process date data
# process_and_save_chunks(test_date_path, columns_to_drop_date, 'test_date_processed.csv')


Processing chunk 1...
Chunk 1 processed and saved.
Processing chunk 2...
Chunk 2 processed and saved.
Processing chunk 3...
Chunk 3 processed and saved.
Processing chunk 4...
Chunk 4 processed and saved.
Processing chunk 5...
Chunk 5 processed and saved.
Processing chunk 6...
Chunk 6 processed and saved.
Processing chunk 7...
Chunk 7 processed and saved.
Processing chunk 8...
Chunk 8 processed and saved.
Processing chunk 9...
Chunk 9 processed and saved.
Processing chunk 10...
Chunk 10 processed and saved.
Processing chunk 11...
Chunk 11 processed and saved.
Processing chunk 12...
Chunk 12 processed and saved.
Processing chunk 13...
Chunk 13 processed and saved.
Processing chunk 14...
Chunk 14 processed and saved.
Processing chunk 15...
Chunk 15 processed and saved.
Processing chunk 16...
Chunk 16 processed and saved.
Processing chunk 17...
Chunk 17 processed and saved.
Processing chunk 18...
Chunk 18 processed and saved.
Processing chunk 19...
Chunk 19 processed and saved.
Processing 

In [16]:

## 1.1.3. Check and Remove Duplicates Before Merging

# Read the cleaned tables back from disk
test_numeric_processed = pd.read_csv('test_numeric_processed.csv')
test_categorical_processed = pd.read_csv('test_categorical_processed.csv')
# test_date_processed = pd.read_csv('test_date_processed.csv')

# Report duplicated Ids per table. With keep=False every row that shares its
# Id with another row is flagged, so the count below is a number of rows.
print("Checking for duplicates in numeric data...")
numeric_dup_mask = test_numeric_processed.duplicated(subset='Id', keep=False)
numeric_duplicates = test_numeric_processed.loc[numeric_dup_mask]
print(f"Number of duplicate Ids in numeric data: {len(numeric_duplicates)}")

print("Checking for duplicates in categorical data...")
categorical_dup_mask = test_categorical_processed.duplicated(subset='Id', keep=False)
categorical_duplicates = test_categorical_processed.loc[categorical_dup_mask]
print(f"Number of duplicate Ids in categorical data: {len(categorical_duplicates)}")

#print("Checking for duplicates in date data...")
#date_duplicates = test_date_processed[test_date_processed.duplicated(subset='Id', keep=False)]
#print(f"Number of duplicate Ids in date data: {date_duplicates.shape[0]}")

# Keep only the first row for every Id
test_numeric_processed = test_numeric_processed.loc[
    ~test_numeric_processed.duplicated(subset='Id')
]
test_categorical_processed = test_categorical_processed.loc[
    ~test_categorical_processed.duplicated(subset='Id')
]
#test_date_processed = test_date_processed.drop_duplicates(subset='Id')



Checking for duplicates in numeric data...
Number of duplicate Ids in numeric data: 0
Checking for duplicates in categorical data...
Number of duplicate Ids in categorical data: 0


In [17]:
#1.1.4. Merge Processed Data

# Inner-join the numeric and categorical tables on Id so that only rows
# present in both tables are kept. An inner join does not by itself prevent
# row duplication, so `validate` makes pandas raise a MergeError if Id is
# not unique on either side instead of silently multiplying rows.
test_merged = test_numeric_processed.merge(
    test_categorical_processed,
    on='Id',
    how='inner',
    validate='one_to_one',
)
# test_merged = test_merged.merge(test_date_processed, on='Id', how='inner')

# Save the merged test data to disk
test_merged.to_csv('test_merged.csv', index=False)
print("Test set merged and saved to 'test_merged.csv'.")



Test set merged and saved to 'test_merged.csv'.


## 1.2. Convert Categorical Variables to Dummy Variables

- **Load the Merged Data:** Load the dataset from the CSV file.
- **Identify Categorical Columns:** Identify columns with object type (categorical) using select_dtypes(include=['object']).
- **Apply One-Hot Encoding:** Convert categorical variables to binary/dummy variables using pd.get_dummies() while preserving all other columns, including Id (the test set has no Response column).
- **Save the Data:** Save the dataset with dummy variables to a new CSV file.


In [21]:
# 1.2. Convert Categorical Variables to Dummy Variables

# Read the merged test table back from disk
merged_test_data_path = 'test_merged.csv'
test_merged = pd.read_csv(merged_test_data_path)

# Columns with dtype `object` are treated as categorical
object_typed = test_merged.select_dtypes(include=['object'])
categorical_columns_test = list(object_typed.columns)
print(f"Categorical columns in test data: {categorical_columns_test}")

# Replace every categorical column by its one-hot (dummy) columns.
# NOTE(review): the dummy columns are derived from the test data alone;
# confirm they match the feature columns used in the training notebook.
test_with_dummies = pd.get_dummies(test_merged, columns=categorical_columns_test)

# Persist the encoded table
dummy_output_file_test = 'test_with_dummies.csv'
test_with_dummies.to_csv(dummy_output_file_test, index=False)

print(f"Categorical variables in test data converted to dummy variables and saved to {dummy_output_file_test}.")




Categorical columns in test data: ['L3_S29_F3317', 'L3_S29_F3320', 'L3_S29_F3323', 'L3_S29_F3326', 'L3_S29_F3329', 'L3_S29_F3332', 'L3_S29_F3335', 'L3_S29_F3338', 'L3_S29_F3341', 'L3_S29_F3344', 'L3_S29_F3347', 'L3_S29_F3350', 'L3_S29_F3353', 'L3_S29_F3356', 'L3_S29_F3359', 'L3_S29_F3362', 'L3_S29_F3364', 'L3_S29_F3366', 'L3_S29_F3369', 'L3_S29_F3372', 'L3_S29_F3375', 'L3_S29_F3378', 'L3_S29_F3381', 'L3_S29_F3384', 'L3_S29_F3387', 'L3_S29_F3390', 'L3_S29_F3392', 'L3_S29_F3394', 'L3_S29_F3397', 'L3_S29_F3400', 'L3_S29_F3403', 'L3_S29_F3406', 'L3_S29_F3409', 'L3_S29_F3411', 'L3_S29_F3414', 'L3_S29_F3416', 'L3_S29_F3418', 'L3_S29_F3420', 'L3_S29_F3423', 'L3_S29_F3426', 'L3_S29_F3429', 'L3_S29_F3432', 'L3_S29_F3435', 'L3_S29_F3438', 'L3_S29_F3441', 'L3_S29_F3444', 'L3_S29_F3446', 'L3_S29_F3448', 'L3_S29_F3451', 'L3_S29_F3454', 'L3_S29_F3457', 'L3_S29_F3460', 'L3_S29_F3463', 'L3_S29_F3466', 'L3_S29_F3469', 'L3_S29_F3472', 'L3_S29_F3475', 'L3_S29_F3478', 'L3_S29_F3481', 'L3_S29_F3484', 'L3_S

## 1.3. Apply PCA Using Training Set Transformation to Test Set

- **1.3.1. Load the Data with Dummy Variables:** Load the processed dataset with dummy variables from disk.
- **1.3.2. Separate Features and Target:** Exclude the Id column from the features; the test set has no Response column to separate.
- **1.3.3. Standardize the Features:** Standardize the features using StandardScaler.
     - This step ensures that each feature has a mean of 0 and a standard deviation of 1. Standardization is crucial for PCA because it is sensitive to the variances of the initial variables.
     
- **1.3.4. Apply PCA:** First, save the PCA model and scaler from the training notebook to disk. Then load these in your test notebook and apply the same transformations.
                          - **Training Notebook:** Save Scaler and PCA Model: In the training notebook, after applying PCA to the training set, save the scaler and PCA model.
                          - **Test Notebook:** Load Scaler and PCA Model, and Apply to Test Set. In the test notebook, load the models saved from the training dataset and apply the same transformations to the test set:
                        
                        - The PCA model is initialized with n_components=0.95. This parameter means that PCA will select the number of principal components needed to explain 95% of the variance in the data and then project the original features onto these components.
     
- **1.3.5. Create a DataFrame with PCA Components:** Create a DataFrame for the PCA components and add the Id column (the test set has no Response column).

- **1.3.6. Save the PCA-Transformed Data:** Save the PCA-transformed data to a new CSV file.

- **1.3.7. Display Resulting Dimensions:** Print the number of features and data points after PCA

In [28]:
## 1.3. Apply PCA Using Training Set Transformation to Test Set

# Test Notebook: Load Scaler and PCA Model, and Apply to Test Set

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import joblib

# Load the scaler and PCA model from disk
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by the trusted training notebook.
scaler = joblib.load('scaler.pkl')
pca = joblib.load('pca_model.pkl')

# Load the data with dummy variables
dummy_data_path_test = 'test_with_dummies.csv'
test_with_dummies = pd.read_csv(dummy_data_path_test)

# Separate features and Id
X_test = test_with_dummies.drop(columns=['Id'])

# Align the test features with the columns the scaler was fitted on.
# One-hot encoding and column dropping were done on the test data alone, so
# the column set or order can differ from training. `feature_names_in_` is
# only present when the scaler was fitted on a DataFrame; without it the
# features are passed through unchanged.
train_feature_names = getattr(scaler, 'feature_names_in_', None)
if train_feature_names is not None:
    train_feature_names = list(train_feature_names)
    known_features = set(train_feature_names)
    absent_from_test = [col for col in train_feature_names if col not in X_test.columns]
    unseen_in_training = [col for col in X_test.columns if col not in known_features]
    if absent_from_test or unseen_in_training:
        print(
            "Aligning test features to training columns: "
            f"{len(absent_from_test)} added as zeros, "
            f"{len(unseen_in_training)} dropped."
        )
    # Columns missing from the test set are filled with zero, which is also
    # the fill value used for missing values earlier in this notebook
    X_test = X_test.reindex(columns=train_feature_names, fill_value=0)

# Standardize the features using the scaler fitted on the training set
X_test_scaled = scaler.transform(X_test)

# Apply PCA using the PCA model fitted on the training set
X_test_pca = pca.transform(X_test_scaled)

# Add the Id column back
test_pca = pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(X_test_pca.shape[1])])
test_pca['Id'] = test_with_dummies['Id']

# Save the PCA-transformed test data to disk
test_pca_output_file = 'test_pca.csv'
test_pca.to_csv(test_pca_output_file, index=False)
print("PCA applied to test set and saved to 'test_pca.csv'.")


PCA applied to test set and saved to 'test_pca.csv'.


In [31]:
# Display resulting dimensions

# One name per principal component: PC1, PC2, ...
n_components = X_test_pca.shape[1]
pca_columns = ['PC' + str(position) for position in range(1, n_components + 1)]
n_data_points = len(test_pca)

print(f"PCA applied and data saved to {test_pca_output_file}.")
print(f"Number of PCA components: {len(pca_columns)}")
print(f"Number of data points: {n_data_points}")

# Show the top rows of the transformed dataset
print("First few rows of the PCA-transformed test dataset:")
print(test_pca.head())

PCA applied and data saved to test_pca.csv.
Number of PCA components: 49
Number of data points: 1183748
First few rows of the PCA-transformed test dataset:
         PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0  15.552314 -1.818873 -0.360482  1.445788  0.888461 -2.646956 -5.962566   
1  -7.987294 -1.165207  2.660707 -4.743067 -0.058247 -2.833245  1.265343   
2  15.655985 -0.815389 -1.017637 -1.553818 -3.527188  2.561696 -4.347554   
3  -8.292817 -0.492141 -1.945309 -3.098645  2.476640  3.332903 -0.839789   
4  15.550483 -0.436298  2.412266  7.797190  0.189427 -0.999051 -2.991250   

        PC8       PC9      PC10  ...      PC41      PC42      PC43      PC44  \
0 -1.062323 -1.620136  0.856586  ... -0.267045 -0.240785 -0.025074  0.038500   
1  1.981759 -2.362058 -0.241490  ... -0.240384  0.053451 -0.302892  0.511881   
2 -1.190514 -1.521061 -2.599752  ... -0.508024  0.088889 -0.057104  0.707529   
3  1.212562 -0.432397  1.888737  ... -0.307118  0.063394 -0.248014 

# Sample the Test Set for Final Model Evaluation 

**To reduce the number of data points in the test set to 20% of the training dataset size, I drew a random sample from the test dataset. Given that the training dataset has 13,758 data points, 20% of this is 2,751 data points (13,758 × 0.2 = 2,751.6, truncated to an integer).**

In [36]:
import pandas as pd

# Load the PCA-transformed test data
test_pca_path = 'test_pca.csv'
test_pca = pd.read_csv(test_pca_path)

# Sampling parameters, named so they can be changed in one place
TRAIN_SET_SIZE = 13758        # number of data points in the training dataset
TEST_SAMPLE_FRACTION = 0.2    # sample size as a share of the training set size
SAMPLE_RANDOM_STATE = 42      # fixed seed so the same rows are drawn on re-run

# Calculate the number of samples needed (20% of the training data size),
# capped at the number of available rows so that sampling without
# replacement cannot fail on a smaller test set
num_samples = min(int(TEST_SAMPLE_FRACTION * TRAIN_SET_SIZE), len(test_pca))

# Randomly sample the required number of data points from the test set
test_sampled = test_pca.sample(n=num_samples, random_state=SAMPLE_RANDOM_STATE)

# Save the sampled test set to a new CSV file
test_sampled_path = 'test_sampled.csv'
test_sampled.to_csv(test_sampled_path, index=False)

print(f"Sampled {num_samples} data points from the test set and saved to {test_sampled_path}")


Sampled 2751 data points from the test set and saved to test_sampled.csv
