Check Header

In [8]:
import os
import pandas as pd

# Define the path where your CSV files are located
folder_path = 'hardware_dataset/'  # Change this to your actual path

# Initialize dictionaries to store counts
company_model_counts = {
    'HGST': 0,
    'Seagate': 0,
    'Toshiba': 0,
    'Western Digital': 0
}
company_unique_models = {
    'HGST': set(),
    'Seagate': set(),
    'Toshiba': set(),
    'Western Digital': set()
}
failing_drives = {
    'HGST': 0,
    'Seagate': 0,
    'Toshiba': 0,
    'Western Digital': 0
}

# Loop through all CSV files in the folder
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        for chunk in pd.read_csv(file_path, usecols=['model', 'failure'], chunksize=100000):
            for index, row in chunk.iterrows():
                model = row['model']
                failure = row['failure']
                
                # Classify the model based on company and increment counts
                if 'HGST' in model:
                    company_model_counts['HGST'] += 1
                    company_unique_models['HGST'].add(model)
                    if failure == 1:
                        failing_drives['HGST'] += 1
                elif 'ST' in model:  # Assuming 'ST' corresponds to Seagate models
                    company_model_counts['Seagate'] += 1
                    company_unique_models['Seagate'].add(model)
                    if failure == 1:
                        failing_drives['Seagate'] += 1
                elif 'TOSHIBA' in model:
                    company_model_counts['Toshiba'] += 1
                    company_unique_models['Toshiba'].add(model)
                    if failure == 1:
                        failing_drives['Toshiba'] += 1
                elif 'WDC' in model:  # Assuming 'WDC' corresponds to Western Digital
                    company_model_counts['Western Digital'] += 1
                    company_unique_models['Western Digital'].add(model)
                    if failure == 1:
                        failing_drives['Western Digital'] += 1

# Prepare the summary table
summary_data = {
    'Company': ['HGST', 'Seagate', 'Toshiba', 'Western Digital'],
    'Total Hard Drive Count': [
        company_model_counts['HGST'],
        company_model_counts['Seagate'],
        company_model_counts['Toshiba'],
        company_model_counts['Western Digital']
    ],
    'Unique Models': [
        len(company_unique_models['HGST']),
        len(company_unique_models['Seagate']),
        len(company_unique_models['Toshiba']),
        len(company_unique_models['Western Digital'])
    ],
    'Failing Hard Drives': [
        failing_drives['HGST'],
        failing_drives['Seagate'],
        failing_drives['Toshiba'],
        failing_drives['Western Digital']
    ]
}

# Convert to DataFrame and calculate percentage of failure
summary_df = pd.DataFrame(summary_data)
summary_df['Percentage of Failure'] = summary_df['Failing Hard Drives'] / summary_df['Total Hard Drive Count']

# Display the summary table
print(summary_df)

# Optionally, save the result to a CSV file
summary_df.to_csv('hardware_dataset/company_failure_summary.csv', index=False)

           Company  Total Hard Drive Count  Unique Models  \
0             HGST                 3063470             14   
1          Seagate                 8641359             31   
2          Toshiba                13281320             10   
3  Western Digital                 3830231              9   

   Failing Hard Drives  Percentage of Failure  
0                  275               0.000090  
1                  486               0.000056  
2                  442               0.000033  
3                   79               0.000021  


In [11]:
import os
import pandas as pd

# Define the path where your CSV files are located
folder_path = 'hardware_dataset/'  # Change this to your actual path

# Initialize dictionary to store model counts for Seagate
seagate_model_data = {}

# Loop through all CSV files in the folder
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        for chunk in pd.read_csv(file_path, usecols=['model', 'failure'], chunksize=100000):
            for index, row in chunk.iterrows():
                model = row['model']
                failure = row['failure']
                
                # Focus only on Seagate models (assuming Seagate models contain 'ST')
                if 'ST' in model:
                    if model not in seagate_model_data:
                        seagate_model_data[model] = {'failures': 0, 'successes': 0}
                    
                    if failure == 1:
                        seagate_model_data[model]['failures'] += 1
                    else:
                        seagate_model_data[model]['successes'] += 1

# Convert the dictionary to a DataFrame
seagate_df = pd.DataFrame.from_dict(seagate_model_data, orient='index')
seagate_df.reset_index(inplace=True)
seagate_df.columns = ['Model', 'Number of Failures', 'Number of Successes']

# Rank models by number of failures in descending order
seagate_df = seagate_df.sort_values(by='Number of Failures', ascending=False)

# Select the top 5 models
top_5_seagate_models = seagate_df.head(5)

# Display the top 5 Seagate models
print(top_5_seagate_models)


                   Model  Number of Failures  Number of Successes
37  HGST HUH721212ALN604                 174               841712
24         ST12000NM0008                 105              1545937
20          ST8000NM0055                  85              1103637
29  HGST HUH721212ALE604                  81              1051127
5          ST16000NM001G                  62              2641632


In [12]:
import pandas as pd
import os

# Set folder path and initialize an empty DataFrame
folder_path = "hardware_dataset"
file_list = sorted([file for file in os.listdir(folder_path) if file.endswith('.csv')])

# Master file path for output
master_file_path = os.path.join(folder_path, 'ST12000NM0008_masterfile.csv')

# Create an empty DataFrame to store filtered rows
master_df = pd.DataFrame()

# Iterate through each file
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    try:
        # Try reading the file
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # Handle empty files
        print(f"Empty file found: {file_name}, skipping...")
        continue
    except Exception as e:
        # Handle other potential errors
        print(f"Error reading file {file_name}: {e}")
        continue
    
    # Filter rows where 'model' is 'ST12000NM0008'
    filtered_df = df[df['model'] == 'ST12000NM0008']
    
    # Append the filtered rows to the master DataFrame
    master_df = pd.concat([master_df, filtered_df], ignore_index=True)

# Save the master DataFrame to a new CSV file if any rows are found
if not master_df.empty:
    master_df.to_csv(master_file_path, index=False)
    print(f"Master file saved at: {master_file_path}")
else:
    print("No data matching 'ST12000NM0008' was found.")

Master file saved at: hardware_dataset\ST12000NM0008_masterfile.csv


In [17]:
import pandas as pd

# Set the path to the master file
master_file_path = 'hardware_dataset/ST12000NM0008_masterfile.csv'

# Load the CSV file
try:
    master_df = pd.read_csv(master_file_path)
    
    # Drop empty columns
    master_df_cleaned = master_df.dropna(axis=1, how='all')
    
    # Drop the specified columns
    columns_to_remove = ['date', 'serial_number', 'datacenter', 'cluster_id', 'vault_id', 
                         'pod_id', 'pod_slot_num', 'is_legacy_format']
    
    master_df_cleaned = master_df_cleaned.drop(columns=columns_to_remove, errors='ignore')
    
    # Dynamically remove columns that contain the word "normalized"
    columns_to_drop = [col for col in master_df_cleaned.columns if 'normalized' in col]
    master_df_cleaned = master_df_cleaned.drop(columns=columns_to_drop, errors='ignore')

    # Save the cleaned file
    cleaned_file_path = 'hardware_dataset/ST12000NM0008_masterfile_cleaned.csv'
    master_df_cleaned.to_csv(cleaned_file_path, index=False)
    
    # Get column count
    column_count = master_df_cleaned.shape[1]

    print(f"Cleaned file saved at: {cleaned_file_path}")
    print(f"Number of rows in the cleaned master file: {master_df_cleaned.shape[0]}")
    print(f"Number of columns in the cleaned master file: {column_count}")

except FileNotFoundError:
    print(f"File not found: {master_file_path}")
except pd.errors.EmptyDataError:
    print("The file is empty.")

Cleaned file saved at: hardware_dataset/ST12000NM0008_masterfile_cleaned.csv
Number of rows in the cleaned master file: 1546042
Number of columns in the cleaned master file: 26


In [44]:
# Empty Column
import pandas as pd

# Set the path to the master file
master_file_path = 'hardware_dataset/ST12000NM0008_masterfile.csv'

# Load the dataset
try:
    master_df = pd.read_csv(master_file_path)

    empty_columns = master_df.columns[master_df.isnull().all()]

    if not empty_columns.empty:
        print("Empty columns:\n", empty_columns)
    else:
        print("No empty columns found in the dataset.")

except FileNotFoundError:
    print(f"File not found: {master_file_path}")
except pd.errors.EmptyDataError:
    print("The file is empty.")

Empty columns:
 Index(['smart_2_normalized', 'smart_2_raw', 'smart_8_normalized',
       'smart_8_raw', 'smart_11_normalized', 'smart_11_raw',
       'smart_13_normalized', 'smart_13_raw', 'smart_15_normalized',
       'smart_15_raw',
       ...
       'smart_250_normalized', 'smart_250_raw', 'smart_251_normalized',
       'smart_251_raw', 'smart_252_normalized', 'smart_252_raw',
       'smart_254_normalized', 'smart_254_raw', 'smart_255_normalized',
       'smart_255_raw'],
      dtype='object', length=140)


In [27]:
# Clear Missing Value

import pandas as pd

# Set the path to the master file
master_file_path = 'hardware_dataset/ST12000NM0008_masterfile_cleaned.csv'

# Load the dataset
try:
    master_df = pd.read_csv(master_file_path)

    # Check for missing values in each row and remove rows with any missing values
    master_df_cleaned = master_df.dropna()

    # Save the cleaned file
    cleaned_file_path = 'hardware_dataset/ST12000NM0008_masterfile_no_missing.csv'
    master_df_cleaned.to_csv(cleaned_file_path, index=False)

    # Output the number of rows removed and the number of rows left
    rows_removed = master_df.shape[0] - master_df_cleaned.shape[0]
    print(f"Cleaned file saved at: {cleaned_file_path}")
    print(f"Number of rows removed due to missing values: {rows_removed}")
    print(f"Number of rows remaining in the cleaned file: {master_df_cleaned.shape[0]}")

except FileNotFoundError:
    print(f"File not found: {master_file_path}")
except pd.errors.EmptyDataError:
    print("The file is empty.")


Cleaned file saved at: hardware_dataset/ST12000NM0008_masterfile_no_missing.csv
Number of rows removed due to missing values: 109
Number of rows remaining in the cleaned file: 1545933


In [21]:
import pandas as pd

# Set the path to the master file
master_file_path = 'hardware_dataset/ST12000NM0008_masterfile_cleaned.csv'

# Load the CSV file
try:
    master_df = pd.read_csv(master_file_path)
    
    # Count the number of 0s and 1s in the 'failure' column
    failure_counts = master_df['failure'].value_counts()

    # Print the counts
    print("Count of 0s and 1s in the 'failure' column:")
    print(failure_counts)
    
except FileNotFoundError:
    print(f"File not found: {master_file_path}")
except pd.errors.EmptyDataError:
    print("The file is empty.")
except ValueError as e:
    print(e)

Count of 0s and 1s in the 'failure' column:
failure
0    1545937
1        105
Name: count, dtype: int64


In [3]:
import os
import pandas as pd

# Define the folder path
folder_path = 'hardware_dataset/'

# Define the updated SMART attributes to be predicted
smart_attributes = [
    'capacity_bytes', 'smart_1_raw', 'smart_3_raw', 'smart_4_raw', 'smart_5_raw', 'smart_7_raw',
    'smart_9_raw', 'smart_10_raw', 'smart_12_raw', 'smart_18_raw', 'smart_187_raw', 
    'smart_188_raw', 'smart_190_raw', 'smart_192_raw', 'smart_193_raw', 'smart_194_raw', 
    'smart_195_raw', 'smart_197_raw', 'smart_198_raw', 'smart_199_raw', 'smart_200_raw',
    'smart_240_raw', 'smart_241_raw', 'smart_242_raw'
]

# Initialize dictionaries to store the min and max values for each attribute
min_values = {attr: float('inf') for attr in smart_attributes}
max_values = {attr: float('-inf') for attr in smart_attributes}

# Loop through all CSV files in the folder
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        # Load the file into a DataFrame
        try:
            df = pd.read_csv(file_path, usecols=smart_attributes)
            
            # Update the min and max values for each attribute
            for attr in smart_attributes:
                min_values[attr] = min(min_values[attr], df[attr].min())
                max_values[attr] = max(max_values[attr], df[attr].max())
        except Exception as e:
            print(f"Error processing file {file_name}: {e}")

# Combine the results into a DataFrame for display
min_max_df = pd.DataFrame({
    'Attribute': smart_attributes,
    'Min Value': [min_values[attr] for attr in smart_attributes],
    'Max Value': [max_values[attr] for attr in smart_attributes]
})

print(min_max_df)

         Attribute  Min Value     Max Value
0   capacity_bytes       -1.0  2.200097e+13
1      smart_1_raw        0.0  4.294967e+09
2      smart_3_raw        0.0  1.118200e+04
3      smart_4_raw        1.0  2.517300e+04
4      smart_5_raw        0.0  6.553400e+04
5      smart_7_raw        0.0  2.143883e+13
6      smart_9_raw        0.0  9.791200e+04
7     smart_10_raw        0.0  1.310720e+05
8     smart_12_raw        0.0  1.225600e+04
9     smart_18_raw        0.0  0.000000e+00
10   smart_187_raw        0.0  6.553500e+04
11   smart_188_raw        0.0  2.207647e+12
12   smart_190_raw       13.0  6.500000e+01
13   smart_192_raw        0.0  1.856850e+05
14   smart_193_raw        1.0  1.493006e+06
15   smart_194_raw       13.0  6.500000e+01
16   smart_195_raw        0.0  2.441406e+08
17   smart_197_raw        0.0  2.202880e+05
18   smart_198_raw        0.0  2.202880e+05
19   smart_199_raw        0.0  5.095000e+04
20   smart_200_raw        0.0  8.168813e+06
21   smart_240_raw        0.0  9

In [33]:
import pandas as pd

# Load the dataset
df = pd.read_csv('hardware_dataset/ST12000NM0008_masterfile_no_missing.csv')

# Drop columns that are not needed (you can specify more here if required)
df_cleaned = df.drop(columns=['model'])

# Calculate correlation with the target variable 'failure'
correlation_matrix = df_cleaned.corr()
correlation_with_failure = correlation_matrix['failure'].sort_values(ascending=False)

# Display the correlation of each feature with 'failure'
print("Correlation with Failure:\n", correlation_with_failure)

Correlation with Failure:
 failure           1.000000
smart_198_raw     0.126822
smart_197_raw     0.126822
smart_5_raw       0.027841
smart_188_raw     0.015305
smart_187_raw     0.007946
smart_193_raw     0.001030
smart_194_raw     0.000783
smart_190_raw     0.000783
smart_195_raw     0.000576
smart_1_raw       0.000576
smart_242_raw     0.000195
smart_7_raw       0.000132
smart_192_raw     0.000077
smart_199_raw     0.000018
smart_12_raw     -0.000183
smart_9_raw      -0.000321
smart_241_raw    -0.000358
smart_240_raw    -0.000466
smart_4_raw      -0.000599
capacity_bytes         NaN
smart_3_raw            NaN
smart_10_raw           NaN
smart_18_raw           NaN
smart_200_raw          NaN
Name: failure, dtype: float64


Model Training

Decision Tree

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('hardware_dataset/ST12000NM0008_masterfile_no_missing.csv')

# Select features and target
X = df[['smart_198_raw', 'smart_197_raw', 'smart_5_raw', 'smart_188_raw', 'smart_187_raw',
        'smart_193_raw', 'smart_190_raw', 'smart_194_raw', 'smart_1_raw', 'smart_195_raw']]
y = df['failure']

# Step 1: Random undersampling to reduce the number of successful instances to around 20,000
undersampler = RandomUnderSampler(sampling_strategy={0: 20000}, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Step 2: SMOTE to oversample the failure instances to 10,000
smote = SMOTE(sampling_strategy={1: 10000}, random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_resampled, y_resampled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Step 3: Train a decision tree classifier
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)


In [36]:
# Make predictions on the test set
y_pred = decision_tree_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 99.07%
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3998
           1       0.99      0.99      0.99      2002

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000



In [37]:
# Save the trained model to a file for future use
import joblib
joblib.dump(decision_tree_model, 'decision_tree_model.pkl')

['decision_tree_model.pkl']

Logistic Regression

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('hardware_dataset/ST12000NM0008_masterfile_no_missing.csv')

# Select features and target
X = df[['smart_198_raw', 'smart_197_raw', 'smart_5_raw', 'smart_188_raw', 'smart_187_raw',
        'smart_193_raw', 'smart_190_raw', 'smart_194_raw', 'smart_1_raw', 'smart_195_raw']]
y = df['failure']

# Step 1: Random undersampling to reduce the number of successful instances to around 20,000
undersampler = RandomUnderSampler(sampling_strategy={0: 20000}, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Step 2: SMOTE to oversample the failure instances to 10,000
smote = SMOTE(sampling_strategy={1: 10000}, random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_resampled, y_resampled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Step 3: Train a Logistic Regression model
logistic_regression_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

In [39]:
# Make predictions on the test data
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 84.22%
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.98      0.89      3998
           1       0.95      0.56      0.70      2002

    accuracy                           0.84      6000
   macro avg       0.88      0.77      0.80      6000
weighted avg       0.86      0.84      0.83      6000



AdaBoost Decision Tree

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('hardware_dataset/ST12000NM0008_masterfile_no_missing.csv')

# Select features and target
X = df[['smart_198_raw', 'smart_197_raw', 'smart_5_raw', 'smart_188_raw', 'smart_187_raw',
        'smart_193_raw', 'smart_190_raw', 'smart_194_raw', 'smart_1_raw', 'smart_195_raw']]
y = df['failure']

# Step 1: Random undersampling to reduce the number of successful instances to around 20,000
undersampler = RandomUnderSampler(sampling_strategy={0: 20000}, random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Step 2: SMOTE to oversample the failure instances to 10,000
smote = SMOTE(sampling_strategy={1: 10000}, random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_resampled, y_resampled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Step 3: Initialize the Decision Tree as the base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)  # Weak learner with depth=1

# Initialize AdaBoost with the Decision Tree base estimator
adaboost_model = AdaBoostClassifier(estimator=base_estimator, n_estimators=100, random_state=42)

# Train the AdaBoost model
adaboost_model.fit(X_train, y_train)



In [43]:
# Make predictions on the test data
y_pred = adaboost_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 97.38%
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      3998
           1       0.97      0.95      0.96      2002

    accuracy                           0.97      6000
   macro avg       0.97      0.97      0.97      6000
weighted avg       0.97      0.97      0.97      6000

