In [6]:
import pandas as pd
import numpy as np
from scipy import stats

In [7]:
# Define function to remove outliers using z-score
def remove_outliers(df, threshold=3):
    """Drop rows that contain a z-score outlier in any numeric column.

    Z-scores are computed per column (population standard deviation, as
    ``scipy.stats.zscore`` does by default). A row is kept only when every
    numeric cell has ``|z| < threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are ignored when scoring but are
        preserved in the result.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` without outliers, with the
        original index labels and column order.
    """
    numeric = df.select_dtypes(include=[np.number])

    # Nothing to score: no numeric columns or no rows at all.
    if numeric.shape[1] == 0 or len(numeric) == 0:
        return df.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)

    # A constant column has zero spread, so its z-scores are undefined (NaN);
    # silence the resulting floating-point warnings and handle NaN below.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))

    # An undefined z-score is not evidence of an outlier. Without this guard
    # `NaN < threshold` is False and a single constant column would discard
    # every row of the frame.
    within_limits = (z < threshold) | np.isnan(z)

    return df[within_limits.all(axis=1)]

In [8]:
# Define function to clean data
def clean_data(df):
    """Return the rows of ``df`` that are complete and free of outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``'Before'`` and ``'After'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with rows containing missing values removed, then filtered
        through ``remove_outliers``.

    Raises
    ------
    KeyError
        If ``'Before'`` or ``'After'`` is not a column of ``df``.
    """
    # Fail early, and with a readable message, when the expected columns are
    # absent instead of after the filtering work has been done.
    missing = [col for col in ('Before', 'After') if col not in df.columns]
    if missing:
        raise KeyError(f"clean_data requires the column(s) {missing}")

    # Remove null values
    df = df.dropna()

    # Remove outliers
    df = remove_outliers(df)

    # No fill or length alignment is needed afterwards: every row with a
    # missing value is already gone, and the columns of a single DataFrame
    # always have the same number of rows.
    return df


In [9]:
# Visit every (folder, metric) pair: read the metric's CSV, clean it, and
# write the result beside the source file with a "_cleaned" suffix.
for folder, metric in (
    (group, name)
    for group in ['Intention', 'Operation']
    for name in ['LCOM', 'CBO', 'RFC', 'DIT', 'NOC', 'WMC', 'LOC']
):
    file_path = f"{folder}/{metric}.csv"

    # Load and clean in one step
    df = clean_data(pd.read_csv(file_path))

    # Persist without the index so the output has the same columns as the input
    df.to_csv(f"{folder}/{metric}_cleaned.csv", index=False)

In [None]:
import pandas as pd
from scipy.stats import ttest_ind, ranksums

# Metrics to compare; each has one CSV per group folder.
metrics = ['LCOM', 'CBO', 'RFC', 'DIT', 'NOC', 'WMC', 'LOC']

# Load data
# Read the "*_cleaned.csv" files written by the cleaning loop above, using the
# same capitalised folder names, so the tests run on the cleaned data and the
# paths also resolve on case-sensitive filesystems.
intention_files = [f"Intention/{metric}_cleaned.csv" for metric in metrics]
operation_files = [f"Operation/{metric}_cleaned.csv" for metric in metrics]

intention_dfs = [pd.read_csv(file) for file in intention_files]
operation_dfs = [pd.read_csv(file) for file in operation_files]

# Calculate difference between Before and After columns for each metric
for metric, intention_df, operation_df in zip(metrics, intention_dfs, operation_dfs):
    # Drop undefined differences: a single NaN would otherwise turn both
    # test statistics (and their p-values) into NaN.
    intention_diff = (intention_df['After'] - intention_df['Before']).dropna()
    operation_diff = (operation_df['After'] - operation_df['Before']).dropna()

    # Perform statistical tests (two independent samples: intention vs operation)
    t, t_pval = ttest_ind(intention_diff, operation_diff)
    u, u_pval = ranksums(intention_diff, operation_diff)

    # Report results
    print(f"Results for {metric}:")
    print(f"Mean improvement for intention group: {intention_diff.mean()}")
    print(f"Mean improvement for operation group: {operation_diff.mean()}")
    print(f"t-test p-value: {t_pval}")
    print(f"Wilcoxon rank-sum p-value: {u_pval}")
