In [1]:
# Import necessary modules
import data_preprocessor as dp
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 1. Load the dataset
# The raw frame is kept under its own name so later cells can compare it
# against the cleaned result.
messy_data = pd.read_csv('../Data/messy_data.csv')

# 2. Preprocess the data
# Work on a copy of the raw frame and feed each helper's return value into
# the next one, in this order:
#   impute (mean) -> drop duplicates -> normalize -> drop redundant features
# NOTE(review): the helpers receive the whole frame, so the target column
# presumably goes through imputation/normalization too -- confirm in dp.
clean_data = (
    messy_data.copy()
    .pipe(dp.impute_missing_values, strategy='mean')
    .pipe(dp.remove_duplicates)
    .pipe(dp.normalize_data)
    .pipe(dp.remove_redundant_features)
)

# 3. Save the cleaned dataset (written without the index column)
clean_data.to_csv('../Data/clean_data.csv', index=False)


# Make sure the target column is usable as a classification label.
# The first column of the cleaned frame is treated as the target.
target_column = clean_data.columns[0]

# Binarize only targets that are genuinely numeric. Comparing the dtype
# against the strings 'object'/'category' lets other non-numeric dtypes
# (for example pandas string dtypes) slip into this branch, where
# .median() would raise; is_numeric_dtype keeps those out.
if pd.api.types.is_numeric_dtype(clean_data[target_column]):
    # Split at the median: values strictly above it become 1, the rest 0.
    threshold = clean_data[target_column].median()
    clean_data[target_column] = (clean_data[target_column] > threshold).astype(int)  # Binary labels

# 4. Train and evaluate the model
# Note: the CSV saved earlier in this cell was written before this label
# conversion, so the file on disk holds the un-binarized target.
dp.simple_model(clean_data)

Accuracy: 0.75


In [2]:
# Summary statistics for the raw (messy) dataset.
# A single print with a newline separator emits the heading followed by the
# describe() table on the next line.
print("Messy Dataset Summary Statistics:", messy_data.describe(), sep="\n")

Messy Dataset Summary Statistics:
           target            b            c            f            h  \
count  920.000000  1196.000000  1196.000000  1196.000000  1158.000000   
mean     0.553261     0.004015    53.383779     0.001391   200.583765   
std      0.497426     0.984837     9.534033     1.011656   110.061582   
min      0.000000    -3.308750    28.000000    -2.820047     0.000000   
25%      0.000000    -0.692960    47.000000    -0.633350   176.250000   
50%      1.000000     0.004241    54.000000    -0.051965   224.000000   
75%      1.000000     0.714572    60.000000     0.693539   270.000000   
max      1.000000     2.982511    77.000000     3.323155   603.000000   

                j            k            l            n            o  \
count  591.000000  1196.000000  1196.000000  1012.000000  1008.000000   
mean    -0.600216     0.007269    -0.013268     4.904847   137.780754   
std      1.067309     2.498874     0.958741     0.200283    26.175380   
min     -5.95466

In [3]:
# Summary statistics for the cleaned dataset.
# The heading keeps its leading blank line; the describe() table follows on
# the next line.
print("\nCleaned Dataset Summary Statistics:", clean_data.describe(), sep="\n")


Cleaned Dataset Summary Statistics:
           target           b           c           f           h           j  \
count  480.000000  480.000000  480.000000  480.000000  480.000000  480.000000   
mean     0.418750    0.522296    0.546599    0.464234    0.356896    0.726543   
std      0.493869    0.158945    0.184911    0.164020    0.163397    0.106851   
min      0.000000    0.000000    0.020408    0.013818    0.000000    0.000000   
25%      0.000000    0.415781    0.408163    0.357365    0.326700    0.731068   
50%      0.000000    0.521850    0.571429    0.459911    0.388060    0.731068   
75%      1.000000    0.628019    0.673469    0.581317    0.452736    0.735554   
max      1.000000    1.000000    1.000000    0.904455    0.935323    0.983555   

                k           l           n           t           v           w  \
count  480.000000  480.000000  480.000000  480.000000  480.000000  480.000000   
mean     0.472264    0.447130    0.719951    0.413376    0.662055    0.

In [4]:
# Compare the shapes of the raw and cleaned frames to see how many rows and
# columns were dropped between them.
messy_rows, messy_cols = messy_data.shape
clean_rows, clean_cols = clean_data.shape

rows_removed = messy_rows - clean_rows
cols_removed = messy_cols - clean_cols

print(f"\nRows Removed: {rows_removed}")
print(f"Columns Removed: {cols_removed}")


Rows Removed: 716
Columns Removed: 6


In [5]:
# Count how many features are gone after cleaning.
# The count is the difference in column count between a copy of the raw
# frame and the cleaned frame, i.e. every column dropped anywhere between
# the two.
# NOTE(review): this matches "removed due to redundancy" only if no other
# preprocessing step drops columns -- confirm against dp.
clean_data_before = messy_data.copy()
dropped_features_count = clean_data_before.shape[1] - clean_data.shape[1]

# Preview the first rows of the cleaned frame
print("\nData After Removing Redundant Features:")
print(clean_data.head())

# Report the number of dropped features
print(f"\nNumber of features removed due to redundancy: {dropped_features_count}")


Data After Removing Redundant Features:
   target               a         b         c                  d          e  \
0       0  lv hypertrophy  0.610389  0.714286       fixed defect  Cleveland   
1       1  lv hypertrophy  0.358036  0.795918             normal  Cleveland   
2       1  lv hypertrophy  0.595613  0.795918  reversable defect  Cleveland   
3       0          normal  0.301262  0.183673             normal  Cleveland   
4       0  lv hypertrophy  0.311787  0.265306             normal  Cleveland   

          f                g         h      i  ...         l            m  \
0  0.505577   typical angina  0.386401   True  ...  0.319166  downsloping   
1  0.729423     asymptomatic  0.474295  False  ...  0.412870         flat   
2  0.409365     asymptomatic  0.379768  False  ...  0.124036         flat   
3  0.603448      non-anginal  0.414594  False  ...  0.387424  downsloping   
4  0.368214  atypical angina  0.338308  False  ...  0.411798    upsloping   

          n       s  