# Data Quality Control

The dataset consists of 105 feature columns (103 numerical and 2 categorical) plus 1 binary target column.

Our job is to find the best classification method to classify the rows.

In [None]:
# Importing the libraries 

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, f1_score, accuracy_score, precision_score, classification_report

# Open the data
data  = pd.read_csv('DM_Project_24.csv')

# Baseline classifier shared by every cross-validation comparison below.
# NOTE(review): the hyper-parameters originally used are not recorded in this
# notebook; scikit-learn defaults are used here with a fixed seed so that the
# scores are reproducible between runs -- confirm the intended configuration.
rf = RandomForestClassifier(random_state=0)

# Statistical information
print(data.describe())
# Lots of missing data found because the per-column count is only ~1400-1500

# See the data in a nicer way
data.head(10)


# Data Quality Control 
## Removal of Data
- Check whether the rows containing missing values (NA) can simply be removed

In [None]:
# Removal of NA: drop every row that contains at least one missing value.
# dropna() returns a new DataFrame, so `data` itself is left untouched and no
# explicit copy is needed.
df_deleting = data.dropna()

# info() writes its report directly to stdout and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line.
df_deleting.info()
# 571 rows left after the removal - which is not good

df_deleting.head(10)

## Imputation
We separate the numerical and categorical columns since we want to approach them differently:
- Numerical: using mean 
- Categorical: using mode

In [None]:
### Imputation for all class ###
# Missing values are filled with statistics computed over the whole dataset,
# ignoring the class label.

# Separate features and target
X_train = data.iloc[:, :-1]  # Features: every column except the last one
y_train = data.iloc[:, -1]   # Target: the last column

# Columns: numerical and categorical
numerical_cols = X_train.columns[:-2]    # All feature columns except the last 2
categorical_cols = X_train.columns[-2:]  # Last 2 feature columns

# Numerical data: replace missing values with the column mean.
# The original index is passed through so the frames stay row-aligned with
# y_train when they are concatenated below, whatever index `data` carries.
numerical_imputer = SimpleImputer(strategy="mean")
X_num = pd.DataFrame(
    numerical_imputer.fit_transform(X_train[numerical_cols]),
    columns=numerical_cols,
    index=X_train.index)

# Categorical data: replace missing values with the most frequent value (mode).
categorical_imputer = SimpleImputer(strategy="most_frequent")
X_cat_imputed = pd.DataFrame(
    categorical_imputer.fit_transform(X_train[categorical_cols]),
    columns=categorical_cols,
    index=X_train.index)

# Combine the numerical and categorical parts back into one feature table
X_processed = pd.concat([X_num, X_cat_imputed], axis=1)

print(X_processed.head())

# Cross-validation (5 folds, unshuffled)
# Calculate F1
cv_impu = cross_val_score(rf, X_processed, y_train, cv=KFold(n_splits=5), scoring="f1")
print(f'F1 score across folds: {cv_impu.mean():.4f}')
# Recorded result: F1 = 0.7325 for imputation

# Calculate accuracy
cv_impu_acc = cross_val_score(rf, X_processed, y_train, cv=KFold(n_splits=5), scoring="accuracy")
print(f'Accuracy score across folds: {cv_impu_acc.mean():.4f}')
# Recorded result: accuracy is 0.9456

In [None]:
### Imputation for specific-class value ###
# Missing values are filled with statistics computed separately inside each
# target class (per-class mean for numerical, per-class mode for categorical).
#
# NOTE(review): this imputation reads the target label, so the label leaks into
# the imputed features. The cross-validation scores below are therefore likely
# optimistic, and the same procedure cannot be applied to unlabeled data.

# Separate features and target
X_train = data.iloc[:, :-1]  # Features: every column except the last one
y_train = data.iloc[:, -1]   # Target: the last column

# Columns: numerical and categorical
numerical_cols = X_train.columns[:-2].tolist()    # All feature columns except the last 2
categorical_cols = X_train.columns[-2:].tolist()  # Last 2 feature columns

# Separate the data based on class
class_groups = data.groupby(y_train)

# Doing class specific imputation
imputed_data = []
for label, group in class_groups:
    # Each group is derived from `data`; work on an explicit copy so that the
    # assignments below neither raise SettingWithCopyWarning nor touch `data`.
    group = group.copy()

    # Imputation for numerical
    numerical_imputer = SimpleImputer(strategy="mean")
    group[numerical_cols] = numerical_imputer.fit_transform(group[numerical_cols])

    # Imputation for categorical
    categorical_imputer = SimpleImputer(strategy="most_frequent")
    group[categorical_cols] = categorical_imputer.fit_transform(group[categorical_cols])

    imputed_data.append(group)

# `imputed_data` is a plain list of per-class DataFrames, so it has no .iloc.
# Stitch the pieces together first and restore the original row order.
imputed_df = pd.concat(imputed_data).sort_index()

# Cross-validation
X_imputed_features = imputed_df.iloc[:, :-1]  # Features: every column except the last one
y_imputed_class = imputed_df.iloc[:, -1]      # Target: the last column

# Calculate F1
cv_impu = cross_val_score(rf, X_imputed_features, y_imputed_class, cv=KFold(n_splits=5), scoring="f1")
print(f'F1 score across folds: {cv_impu.mean():.4f}')
# Recorded result: F1 = 0.8217 for imputation

# Calculate accuracy
cv_impu_acc = cross_val_score(rf, X_imputed_features, y_imputed_class, cv=KFold(n_splits=5), scoring="accuracy")
print(f'Accuracy score across folds: {cv_impu_acc.mean():.4f}')
# Recorded result: accuracy is 0.9621

# Since the highest CV is using class-specific imputation, we used that dataset
# for future analysis

## Keep the full imputed table (features + target) as a dataframe ##
X_imputed_class = imputed_df
print(X_imputed_class)

## Normalisation

In [None]:
# Z-score scaling of the dataset that was imputed with all-class statistics
standard_scaler = StandardScaler()

# Scale only the numerical columns; the categorical ones are left as they are
x_num_scaled = standard_scaler.fit_transform(X_num)
X_num_df = pd.DataFrame(data=x_num_scaled, columns=numerical_cols)

# Put the scaled numerical part and the imputed categorical part side by side
X_processed_standard = pd.concat([X_num_df, X_cat_imputed], axis="columns")
print(X_processed_standard)

# Cross validation (5 unshuffled folds), first scored with F1 ...
cv_standard_all = cross_val_score(
    rf,
    X_processed_standard,
    y_train,
    scoring="f1",
    cv=KFold(n_splits=5),
)
print(f'F1 score across folds: {cv_standard_all.mean():.4f}')
# Recorded result: F1 = 0.7325

# ... then scored with accuracy
cv_standard_acc = cross_val_score(
    rf,
    X_processed_standard,
    y_train,
    scoring="accuracy",
    cv=KFold(n_splits=5),
)
print(f'Accuracy score across folds: {cv_standard_acc.mean():.4f}')
# Recorded result: accuracy is 0.9456

### Normalisation ###
# For class-specific imputation: compare z-score and min-max scaling of the
# numerical columns, scoring each with 5-fold cross-validation.

# Separate features and target
X = X_imputed_class.iloc[:, :-1]  # Features: every column except the last one
y = X_imputed_class.iloc[:, -1]   # Target: the last column

# Columns: numerical and categorical
num_x_train = X.columns[:-2]  # All feature columns except the last 2
nom_x_train = X.columns[-2:]  # Last 2 feature columns

# Separate numerical and categorical data
X_num = X[num_x_train]
X_cat = X[nom_x_train]

# Ensure numerical columns are actually numeric (non-parsable values become NaN)
X_num = X_num.apply(pd.to_numeric, errors='coerce')

## Standardization (Z-score normalization) ##
standard_scaler = StandardScaler()
X_num_standardized = pd.DataFrame(
    standard_scaler.fit_transform(X_num),
    columns=X_num.columns,
    index=X_num.index)

# Combine scaled numerical data with categorical data
X_standardized = pd.concat([X_num_standardized, X_cat], axis=1)

# Class specific imputation + z-score normalisation
# The labels `y` come from the same frame as the features, which guarantees
# that features and labels have the same rows in the same order.
# Calculate F1
cv_standard_class = cross_val_score(rf, X_standardized, y, cv=KFold(n_splits=5), scoring="f1")
print(f'F1 score across folds: {cv_standard_class.mean():.4f}')
# Recorded result: F1 = 0.7897

# Calculate accuracy
cv_acc_standard_c = cross_val_score(rf, X_standardized, y, cv=KFold(n_splits=5), scoring="accuracy")
print(f'Accuracy score across folds: {cv_acc_standard_c.mean():.4f}')
# Recorded result: accuracy is 0.9556

##  Min-Max scaling ##
minmax_scaler = MinMaxScaler()
X_num_minmax = pd.DataFrame(
    minmax_scaler.fit_transform(X_num),
    columns=X_num.columns,
    index=X_num.index)
# Combine the data
X_minmax = pd.concat([X_num_minmax, X_cat], axis=1)

# Calculate F1
cv_minmax_class = cross_val_score(rf, X_minmax, y, cv=KFold(n_splits=5), scoring="f1")
print(f'F1 score across folds: {cv_minmax_class.mean():.4f}')
# Recorded result: F1 = 0.7897

# Calculate accuracy
cv_acc_minmax_c = cross_val_score(rf, X_minmax, y, cv=KFold(n_splits=5), scoring="accuracy")
print(f'Accuracy score across folds: {cv_acc_minmax_c.mean():.4f}')
# Recorded result: accuracy is 0.9556

# They have the same F1 score, so we will proceed with class-specific imputation and z-score normalisation.
# X_standardized (built above) is the dataset carried forward.

print(X_standardized)

### Normalisation for test data ###
# The test set is scaled with the statistics learned from the training data.
# Test-specific variable names are used so the training frames (X_num, X_cat,
# X_num_standardized) are not overwritten.

# Import the data
test_data = pd.read_csv("test_data.csv")
print(test_data)

# Columns: numerical and categorical
# NOTE(review): assumes test_data.csv holds only the feature columns, laid out
# like the training features (numerical first, last 2 categorical) -- confirm.
test_num_cols = test_data.columns[:-2]  # All columns except the last 2
test_cat_cols = test_data.columns[-2:]  # Last 2 columns

# Separate numerical and categorical data
test_num = test_data[test_num_cols]
test_cat = test_data[test_cat_cols]

# Ensure numerical columns are actually numeric (non-parsable values become NaN)
test_num = test_num.apply(pd.to_numeric, errors='coerce')

# Standardization (Z-score normalization)
# Reuse `standard_scaler`, which was fitted on the training features: calling
# transform() applies the training mean/std. Re-fitting on the test set would
# put the two datasets on different scales.
test_num_standardized = pd.DataFrame(
    standard_scaler.transform(test_num),
    columns=test_num.columns,
    index=test_num.index)

# Recombine with the categorical columns and export
test_standardized = pd.concat([test_num_standardized, test_cat], axis=1)
test_standardized.to_csv('test_standard.csv', index=False)

## Removing Outliers

In [None]:
# Extra library needed for this step
from sklearn.ensemble import IsolationForest

# Inputs: X_standardized holds all the features, y_train holds the labels

# Outlier detection using Isolation Forest (1% of rows flagged as outliers)
IF = IsolationForest(contamination=0.01, random_state=0)
outliers = IF.fit_predict(X_standardized)

# Keep only the rows labelled 1 by the detector
mask = (outliers == 1)
x_clean = X_standardized.loc[mask]
y_clean = y_train.loc[mask]

print(y_clean)

# Cross validation on the cleaned data (5 unshuffled folds)
# Accuracy
cv_if_acc = cross_val_score(
    rf,
    x_clean,
    y_clean,
    scoring="accuracy",
    cv=KFold(n_splits=5),
)
print(f'Accuracy score across folds: {cv_if_acc.mean():.4f}')
# Recorded result: accuracy = 0.9583

# F1
cv_if = cross_val_score(
    rf,
    x_clean,
    y_clean,
    scoring="f1",
    cv=KFold(n_splits=5),
)
print(f'F1 score across folds: {cv_if.mean():.4f}')
# Recorded result: F1 = 0.8102

# Removal of outliers using LOF (Local Outlier Factor)
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01)  # Adjust neighbors and contamination
outliers = lof.fit_predict(X_standardized)

# Keep only the rows labelled 1 by the detector
mask = outliers == 1
x_clean_lof = X_standardized[mask]
y_clean_lof = y_train[mask]

# Cross validation on the LOF-cleaned data (5 unshuffled folds).
# The scores get their own names so they do not overwrite the Isolation Forest
# results (cv_if_acc / cv_if) and both methods remain available for comparison.
# Accuracy
cv_lof_acc = cross_val_score(rf, x_clean_lof, y_clean_lof, cv=KFold(n_splits=5), scoring="accuracy")
print(f'Accuracy score across folds: {cv_lof_acc.mean():.4f}')
# Recorded result: accuracy = 0.9621

# F1
cv_lof = cross_val_score(rf, x_clean_lof, y_clean_lof, cv=KFold(n_splits=5), scoring="f1")
print(f'F1 score across folds: {cv_lof.mean():.4f}')
# Recorded result: F1 = 0.8217

# LOF has higher F1 so we will choose this method.

## Combining the data for clean datasets ##
# Features and target are joined column-wise and exported as the training set.
clean_data = pd.concat([x_clean_lof, y_clean_lof], axis=1)
print(clean_data)
clean_data.to_csv('train_data.csv', index=False)