In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import pandas as pd
import glob
import os

# Paths to folders and their corresponding output values
folder_mappings = {
    '/kaggle/input/im-fault/0.5mm': 0,
    '/kaggle/input/im-fault/1.0mm': 0,
    '/kaggle/input/im-fault/1.5mm': 1,
    '/kaggle/input/im-fault/2.0mm': 1,
    '/kaggle/input/im-fault/normal': 0
}

# Parameters for downsampling
sample_fraction = 0.1  # Retain 10% of rows from each file

# Initialize an empty list to store DataFrames
dataframes = []

# Process each folder and its CSV files
for folder, output_value in folder_mappings.items():
    csv_files = glob.glob(os.path.join(folder, '*.csv'))
    
    for file in csv_files:
        # Read the file without assuming headers
        df = pd.read_csv(file, header=None)
        
        # Downsample the file
        downsampled_df = df.sample(frac=sample_fraction, random_state=42)
        
        # Add the output column
        downsampled_df['output'] = output_value
        
        # Append the modified DataFrame to the list
        dataframes.append(downsampled_df)

# Concatenate all the downsampled DataFrames
merged_df = pd.concat(dataframes, ignore_index=True)

# Add custom header columns
merged_df.columns = [
    'rotation freq', 'uh_ax_vib', 'uh_rd_vib', 'uh_tg_vib',
    'oh_ax_vib', 'oh_rd_vib', 'oh_tg_vib', 'microphone', 'output'
]

In [7]:

from scipy.fft import fft
import numpy as np
import pandas as pd

# Function to apply FFT to a specific column in the DataFrame
def apply_fft_scipy(df, column_name):
    # Perform FFT on the column data
    fft_values = fft(df[column_name])
    # Return only the magnitude of the FFT result
    return np.abs(fft_values)

# Columns to apply FFT (excluding the 'output' column)
data_columns = [
    'rotation freq', 'uh_ax_vib', 'uh_rd_vib', 'uh_tg_vib',
    'oh_ax_vib', 'oh_rd_vib', 'oh_tg_vib', 'microphone'
]

# Create an empty dictionary to store FFT results
fft_data = {}

# Apply FFT to each column and store in the dictionary
for col in data_columns:
    fft_data[f'{col}_fft'] = apply_fft_scipy(merged_df, col)[:len(merged_df)]

# Convert the dictionary to a new DataFrame
fft_df = pd.DataFrame(fft_data)

# Copy the 'output' column to the new DataFrame
fft_df['output'] = merged_df['output']

# Check the FFT DataFrame
print(fft_df.head())


   rotation freq_fft  uh_ax_vib_fft  uh_rd_vib_fft  uh_tg_vib_fft  \
0        1004.343013   68804.753690     292.980677    3158.538155   
1        3269.751824   13532.195716     460.327254     985.479991   
2        6881.740197    4501.487223      28.316261    1064.720351   
3        5881.330951    1959.230291    1579.416536     433.786726   
4        4218.824797    1956.575544     687.398252     872.787272   

   oh_ax_vib_fft  oh_rd_vib_fft  oh_tg_vib_fft  microphone_fft  output  
0   71700.382050   18791.174820   66569.925957    80609.907232       0  
1    6845.204759      78.532673   30839.765520     3903.642134       0  
2   14313.868282    1017.411608   15442.677752      446.887554       0  
3    7180.536977     165.598974    2747.267940      516.773715       0  
4   11647.841365     280.065261   23563.294336      867.180183       0  


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Assuming merged_df is your dataframe
# Separate features (X) and target (y)
X = fft_df.drop('output', axis=1)  # Replace 'target_column' with the actual name
y = merged_df['output']

# Encoding categorical features if they exist in X
X = X.apply(LabelEncoder().fit_transform)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
}

# Initialize variables to store the best model and score
best_model = None
best_score = 0

# Loop through models and evaluate them
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict and evaluate the model
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    print(f'{model_name} Accuracy: {score:.4f}')
    
    # Update the best model if necessary
    if score > best_score:
        best_score = score
        best_model = model_name

    print(f'\nModel: {best_model} with Accuracy: {best_score:.4f}')


Logistic Regression Accuracy: 0.6014

Model: Logistic Regression with Accuracy: 0.6014
