In [6]:
import pandas as pd
import numpy as np
from scipy import stats

In [7]:
# Define function to remove outliers using z-score
def remove_outliers(df, threshold=3):
    """Drop rows in which any numeric column is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Only numeric columns are scored; non-numeric columns
        are ignored for scoring but are kept in the returned frame.
    threshold : float, default 3
        A row is dropped when the absolute z-score of any numeric cell in
        it is greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, with their original
        index labels.

    Notes
    -----
    A column with zero variance (or a NaN cell) has an undefined z-score.
    Such cells are treated as "not an outlier" instead of causing the row
    to be discarded.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        # No rows or no numeric columns: nothing can be scored.
        return df.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)

    # Constant columns divide 0 by 0 inside zscore; the resulting NaN is
    # expected here, so silence the floating-point warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = np.abs(stats.zscore(values, nan_policy='omit'))
        # NaN >= threshold is False, so undefined scores never flag a row.
        is_outlier = (z >= threshold).any(axis=1)

    return df[~is_outlier]

In [8]:
# Define function to clean data
def clean_data(df):
    """Return a cleaned version of a Before/After metric table.

    Cleaning consists of two steps:

    1. drop every row that contains a missing value;
    2. drop the rows that ``remove_outliers`` rejects.

    No zero-filling is performed: rows with missing values are removed in
    step 1, so there is nothing left to fill. Because 'Before' and 'After'
    are columns of the same frame, they always have the same number of
    rows after cleaning.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that must contain the columns 'Before' and 'After'.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, with their original index labels.

    Raises
    ------
    KeyError
        If 'Before' or 'After' is not a column of ``df``.
    """
    missing = [col for col in ('Before', 'After') if col not in df.columns]
    if missing:
        raise KeyError(
            f"clean_data expects 'Before' and 'After' columns; missing: {missing}"
        )

    # Remove null values
    df = df.dropna()

    # Remove outliers
    df = remove_outliers(df)

    return df


In [9]:
# Clean every metric CSV in both study folders and save the result next to it
for folder in ['Intention', 'Operation']:
    for metric in ['LCOM', 'CBO', 'RFC', 'DIT', 'NOC', 'WMC', 'LOC']:
        # Raw input for this folder/metric pair
        file_path = f"{folder}/{metric}.csv"

        # Read the raw table and clean it in one step
        df = clean_data(pd.read_csv(file_path))

        # Write the cleaned table as "<metric>_cleaned.csv", without the index column
        df.to_csv(f"{folder}/{metric}_cleaned.csv", index=False)

In [None]:
# Mean Before, Mean After, and p-value calculation

import pandas as pd
from scipy.stats import ttest_rel

# Metrics to summarise, in the order they will appear in the results tables
metrics = ['LOC', 'CBO', 'NOC', 'DIT', 'RFC', 'LCOM', 'WMC']

# Per-metric accumulators (means and p-values) for the Intention data
means_before, means_after, p_values = [], [], []

# Per-metric accumulators (means and p-values) for the Operation data
means_before_op, means_after_op, p_values_op = [], [], []

# Loop over each metric
for metric in metrics:
    # Load the cleaned CSV files for the current metric. The cleaning step
    # writes them as "<folder>/<metric>_cleaned.csv" relative to the
    # notebook's working directory, so read them from the same place
    # instead of a machine-specific absolute path.
    df = pd.read_csv(f'Intention/{metric}_cleaned.csv')
    df_op = pd.read_csv(f'Operation/{metric}_cleaned.csv')

    # Calculate mean for "Before" and "After" columns of each data set
    mean_before = df['Before'].mean()
    mean_after = df['After'].mean()

    mean_before_op = df_op['Before'].mean()
    mean_after_op = df_op['After'].mean()

    # Calculate p-value using a paired (related-samples) t-test;
    # ttest_rel is two-sided by default
    _, p_value = ttest_rel(df['Before'], df['After'])
    _, p_value_op = ttest_rel(df_op['Before'], df_op['After'])

    # Append means and p-value to their respective lists
    means_before.append(mean_before)
    means_after.append(mean_after)
    p_values.append(p_value)

    means_before_op.append(mean_before_op)
    means_after_op.append(mean_after_op)
    p_values_op.append(p_value_op)
    

# Assemble one summary table per data set (one row per metric)
results_df = pd.DataFrame({
    'Metric': metrics,
    'Mean Before': means_before,
    'Mean After': means_after,
    'P-value': p_values,
})

results_df_op = pd.DataFrame({
    'Metric': metrics,
    'Mean Before': means_before_op,
    'Mean After': means_after_op,
    'P-value': p_values_op,
})

# Render p-values as fixed-point strings with 8 decimal places
for frame in (results_df, results_df_op):
    frame['P-value'] = frame['P-value'].apply(lambda x: f'{x:.8f}')

# Print the results dataframe
print("Intention")
print(results_df)
print("\nOperation")