Unique rows in the jNose data

In [None]:
import pandas as pd

# Read the test-smell dataset exported to the Colab workspace.
SOURCE_CSV = '/content/TS_Dataset.csv'
UNIQUE_CSV = '/content/unique_rows.csv'

data = pd.read_csv(SOURCE_CSV)

# Rows that are identical in every column are collapsed to their first
# occurrence; the original row order is otherwise kept.
unique_rows = data.drop_duplicates()
unique_rows_count = unique_rows.shape[0]
print(f"Number of unique rows: {unique_rows_count}")

# Persist the de-duplicated table (without the pandas index) for later cells.
unique_rows.to_csv(UNIQUE_CSV, index=False)
print("\nThe unique rows have been saved as 'unique_rows.csv'.")


Mapping - Smelly Flaky

In [None]:
import pandas as pd

# Inputs: the de-duplicated test-smell rows and the flaky-test list.
_smells = pd.read_csv("/content/unique_rows.csv")
_flaky = pd.read_csv("/content/flaky.csv")

print("Initial row counts:")
print(f"_smells rows: {len(_smells)}")
print(f"_flaky rows: {len(_flaky)}")

# Step 1: the flaky list only carries fully-qualified test names, so take the
# text after the last '.' as the bare method name (a missing name stays NaN).
_flaky['methodName'] = _flaky['Fully-Qualified Test Name (packageName.ClassName.methodName)'].str.split('.').str[-1]

# Step 2: keep one row (the first) per method name on each side.
# Rows with a missing key are removed first: pandas.merge pairs NaN keys with
# each other, which would otherwise link a smell row that has no method to a
# flaky row that has no name and add a bogus record to the mapped dataset.
# NOTE(review): the key is the bare method name only, so equally named methods
# in different classes/projects are not told apart, and only the first smell
# recorded for a method survives the de-duplication - confirm this is intended.
_smells_unique = (
    _smells.dropna(subset=['testSmellMethod'])
    .drop_duplicates(subset=['testSmellMethod'])
)
_flaky_unique = (
    _flaky.dropna(subset=['methodName'])
    .drop_duplicates(subset=['methodName'])
)

print("\nUnique row counts after deduplication:")
print(f"_smells_unique rows: {len(_smells_unique)}")
print(f"_flaky_unique rows: {len(_flaky_unique)}")

# Step 3: Merge datasets where testSmellMethod matches methodName
# (inner join: only methods present in both tables are kept).
merged_df = pd.merge(
    _smells_unique,
    _flaky_unique,
    left_on='testSmellMethod',
    right_on='methodName',
    how='inner',
)

# Columns retained in the curated output, in this order.
curated_columns = [
    'projectName', 'name', 'pathFile', 'productionFile', 'loc', 'qtdMethods',
    'testSmellName', 'testSmellMethod', 'testSmellLineBegin', 'testSmellLineEnd',
    'methodNameFullHash', 'methodCode', 'Project URL', 'Module Path',
    'Fully-Qualified Test Name (packageName.ClassName.methodName)', 'Category', 'Status'
]

# reindex (rather than plain column selection) yields an all-NaN column for
# any curated name that is absent from the merge result instead of raising.
mapped_dataset = merged_df.reindex(columns=curated_columns)

mapped_dataset.to_csv('/content/TSFK_dataset.csv', index=False)

print("\nFinal Mapped Dataset Info:")
print(f"Total rows: {len(mapped_dataset)}")
print(f"Columns: {list(mapped_dataset.columns)}")
print("\nFirst 5 rows of the mapped dataset:")
print(mapped_dataset.head())

Prevalence of smells

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

# Load the de-duplicated test-smell rows.
data = pd.read_csv('/content/unique_rows.csv')

missing_values = data['testSmellName'].isnull().sum()
print(f"Number of missing/null values in 'testSmellName': {missing_values}")

# dropna=False keeps missing names as their own bucket, so the counts add up
# to the number of rows in the dataset (checked by the two prints below).
test_smell_frequency = data['testSmellName'].value_counts(dropna=False)

print("\nFrequency of each test smell (including missing/null values):")
print(test_smell_frequency)

total_count = test_smell_frequency.sum()
print(f"\nTotal count of test smells: {total_count}")
print(f"Total rows in the dataset: {len(data)}")

# Output file names are defined once so that the confirmation messages below
# always report the files that were actually written.
csv_name = 'TS_frequency.csv'
chart_name = 'TS_frequency_chart.png'
csv_path = f'/content/{csv_name}'
chart_path = f'/content/{chart_name}'

test_smell_frequency.to_csv(csv_path, header=True)

# Bar chart of the frequencies, drawn on the figure created here.
plt.figure(figsize=(6, 4))
test_smell_frequency.plot(kind='bar', color='gray', width=0.8)
plt.title('Frequency of Test Smells', fontsize=12, fontweight='bold')
plt.xlabel('Test Smell Name', fontsize=10)
plt.ylabel('Frequency', fontsize=10)
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(fontsize=9)
plt.tight_layout()

# Save before plt.show() so the file is written from the fully drawn figure.
plt.savefig(chart_path, dpi=300, bbox_inches='tight', facecolor='white')

plt.show()

print(f"\nThe frequency results have been saved as '{csv_name}'.")
print(f"The bar chart has been saved as '{chart_name}'.")

# Trigger browser downloads of both artefacts (Colab helper).
files.download(csv_path)
files.download(chart_path)


Find Occurrences of Test Smell in Projects

In [None]:
import pandas as pd

# Load the de-duplicated test-smell rows.
data = pd.read_csv('/content/unique_rows.csv')

unique_projects = data['projectName'].nunique()
print(f"Number of unique projects: {unique_projects}")

# Project x smell count matrix: count each smell name inside every project
# (missing names get their own column because of dropna=False), then spread
# the smell names into columns, filling absent combinations with 0.
project_smell_frequencies = data.groupby('projectName')['testSmellName'].value_counts(dropna=False).unstack(fill_value=0)

print("\nUnique test smells and their frequencies for each project:")
print(project_smell_frequencies)

project_smell_frequencies.to_csv('/content/project_VS_smell_frequencies.csv')

# The message names the file that is actually written above.
print("\nThe project smell frequencies have been saved as 'project_VS_smell_frequencies.csv'.")


#Take the rows for these 20 projects from pr-data

Find Occurrences of Flaky Tests in Projects




In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the flaky-test list.
data = pd.read_csv('/content/flaky.csv')

# Number of rows for every (project URL, category) pair, as a long table.
pair_sizes = data.groupby(['Project URL', 'Category']).size()
category_analysis = pair_sizes.reset_index(name='count')

# Wide form: one row per project URL, one column per category; pairs that
# never occur become 0.
flakiness_matrix = category_analysis.pivot(
    index='Project URL',
    columns='Category',
    values='count',
).fillna(0)

print("\nFlakiness types for each project and their corresponding counts in matrix format:")
print(flakiness_matrix)

flakiness_matrix.to_csv('/content/ALL_project_flakiness_matrix.csv')

# category_analysis holds one row per (project, category) pair, so this counts
# in how many projects each category occurs - not how many flaky tests it has.
# NOTE(review): confirm that a per-project count is what the chart should show.
flakiness_counts = category_analysis['Category'].value_counts()

plt.figure(figsize=(10, 6))
axes = flakiness_counts.plot.bar(color='skyblue')
axes.set_title('Frequency of Each Flakiness Type')
axes.set_xlabel('Flakiness Type')
axes.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


#all flakiness and 20 projects

In [None]:
import pandas as pd

# NOTE(review): this file name differs from the
# '/content/ALL_project_flakiness_matrix.csv' written by the all-projects
# cell; presumably it was exported/renamed by hand - confirm.
data = pd.read_csv('/content/Dataset - ALL_project_flakiness_matrix.csv')

# The 20 projects to retain.
projects_to_keep = [
    "Mapper", "Sentinel", "adyen-java-api-library", "aem-core-wcm-components",
    "airlift", "alibabacloud-tairjedis-sdk", "asset-share-commons", "asyncload",
    "druid", "easy-retry", "fastjson2", "ignite-3", "innodb-java-reader",
    "jackson-dataformats-binary", "jetcache", "mercury", "metrics", "tamper",
    "testable-mock", "wasp"
]

# Project name = last path segment of the URL. A trailing '/' is stripped
# first (as the 20-project flakiness cell does) so that '.../druid/' still
# yields 'druid', and the .str accessor leaves a missing URL as NaN instead
# of raising AttributeError.
data['Project Name'] = data['Project URL'].str.rstrip('/').str.split('/').str[-1]

filtered_data = data[data['Project Name'].isin(projects_to_keep)]

filtered_data.to_csv('/content/filtered_projects.csv', index=False)

print("\nFiltered Projects Data:")
print(filtered_data)
print("\nThe filtered projects data has been saved as 'filtered_projects.csv'.")


#for 20 projects

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the flaky-test list.
data = pd.read_csv('/content/flaky.csv')

# Project name = final path segment of the URL, ignoring a trailing '/'.
data['Project Name'] = data['Project URL'].map(
    lambda url: url.rstrip('/').rsplit('/', 1)[-1]
)

# The 20 projects under study.
selected_projects = {
    "Mapper",
    "Sentinel",
    "adyen-java-api-library",
    "aem-core-wcm-components",
    "airlift",
    "alibabacloud-tairjedis-sdk",
    "asset-share-commons",
    "asyncload",
    "druid",
    "easy-retry",
    "fastjson2",
    "ignite-3",
    "innodb-java-reader",
    "jackson-dataformats-binary",
    "jetcache",
    "mercury",
    "metrics",
    "tamper",
    "testable-mock",
    "wasp",
}

# Discard every row that belongs to a project outside that set.
data = data.loc[data['Project Name'].isin(selected_projects)]

# Long table: number of rows per (project, category) pair.
pair_sizes = data.groupby(['Project Name', 'Category']).size()
category_analysis = pair_sizes.reset_index(name='count')

# Wide table: projects as rows, categories as columns, 0 where a pair is absent.
flakiness_matrix = category_analysis.pivot(
    index='Project Name',
    columns='Category',
    values='count',
).fillna(0)

print("\nFlakiness types for each project and their corresponding counts in matrix format:")
print(flakiness_matrix)

flakiness_matrix.to_csv('/content/selected_project_flakiness_matrix.csv')
print("\nThe project flakiness analysis has been saved as 'selected_project_flakiness_matrix.csv'.")

# One row per (project, category) pair is counted here, i.e. the bars show in
# how many of the selected projects each category occurs.
flakiness_counts = category_analysis['Category'].value_counts()

plt.figure(figsize=(10, 6))
axes = flakiness_counts.plot.bar(color='skyblue')
axes.set_title('Frequency of Each Flakiness Type')
axes.set_xlabel('Flakiness Type')
axes.set_ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

plt.show()


**Keep only the 4 most prominent types of flakiness**





In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the flaky-test list.
data = pd.read_csv('/content/flaky.csv')

# Project name = last path segment of the URL. A trailing '/' is stripped
# first (as the 20-project flakiness cell does) so '.../druid/' still maps to
# 'druid', and the .str accessor leaves a missing URL as NaN instead of raising.
data['Project Name'] = data['Project URL'].str.rstrip('/').str.split('/').str[-1]

# Long table: number of rows per (project, category) pair.
category_analysis = data.groupby(['Project Name', 'Category']).size().reset_index(name='count')

# category_analysis has one row per (project, category) pair, so value_counts
# ranks the categories by the number of projects in which they occur; the four
# highest-ranked categories are kept.
top_4_flakiness = category_analysis['Category'].value_counts().nlargest(4).index

filtered_category_analysis = category_analysis[category_analysis['Category'].isin(top_4_flakiness)]

# Wide table restricted to those four categories; absent pairs become 0.
flakiness_matrix = filtered_category_analysis.pivot(index='Project Name', columns='Category', values='count').fillna(0)

# The message states 4 types, matching nlargest(4) above.
print("\nFlakiness types for each project and their corresponding counts (only top 4 types) in matrix format:")
print(flakiness_matrix)


flakiness_matrix.to_csv('/content/Top_4_project_flakiness_matrix.csv')


print("\nThe top 4 project flakiness analysis has been saved as 'Top_4_project_flakiness_matrix.csv'.")

# Same per-project measure as used for the ranking, limited to the top four.
flakiness_counts_top_4 = filtered_category_analysis['Category'].value_counts()

plt.figure(figsize=(10, 6))
flakiness_counts_top_4.plot(kind='bar', color='skyblue')
plt.title('Top 4 Frequency of Flakiness Types')
plt.xlabel('Flakiness Type')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


# TS FK correlation

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Combined table holding test-smell columns and flakiness-type columns.
# NOTE(review): presumably one row per project - confirm against the file.
data = pd.read_csv('/content/20_TSFK_COMBO.csv')

# Test-smell columns.
smell_columns = [
    "Assertion Roulette", "Conditional Test Logic", "Constructor Initialization",
    "Duplicate Assert", "Eager Test", "EmptyTest", "Exception Catching Throwing",
    "General Fixture", "IgnoredTest", "Lazy Test", "Magic Number Test",
    "Mystery Guest", "Print Statement", "Redundant Assertion", "Resource Optimism",
    "Sensitive Equality", "Sleepy Test", "Unknown Test", "Verbose Test"
]

# Flakiness-type columns (the four categories kept by the top-4 analysis).
flakiness_type_columns = ["ID", "NOD", "OD", "OD-Vic"]

# Restrict to the columns of interest; a missing column raises KeyError here.
filtered_data = data[smell_columns + flakiness_type_columns]

# Pairwise Spearman rank correlation between all selected columns.
correlation_matrix = filtered_data.corr(method='spearman')

correlation_matrix.to_csv('/content/spearman_correlation_matrix.csv')

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# The title names what is actually correlated: test smells and flakiness types.
plt.title('Spearman Correlation Heatmap between Test Smells and Flakiness Types', fontsize=16)
plt.tight_layout()
plt.show()

print("\nSpearman correlation matrix has been saved as 'spearman_correlation_matrix.csv'.")


#Filtered Heatmap

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load a previously saved correlation matrix; its first column is the index.
# NOTE(review): this file name differs from the
# '/content/spearman_correlation_matrix.csv' written by the correlation cell;
# presumably it was renamed/uploaded by hand - confirm.
df = pd.read_csv('/content/spearman_correlation_matrix_TSFK.csv', index_col=0)

# Columns of the sub-matrix: the four flakiness types.
columns_to_keep = ['ID', 'NOD', 'OD', 'OD-Vic']

# Rows of the sub-matrix: the test smells.
index_to_keep = [
    'Assertion Roulette', 'Conditional Test Logic', 'Constructor Initialization',
    'Duplicate Assert', 'Eager Test', 'EmptyTest', 'Exception Catching Throwing',
    'General Fixture', 'IgnoredTest', 'Lazy Test', 'Magic Number Test', 'Mystery Guest',
    'Print Statement', 'Redundant Assertion', 'Resource Optimism', 'Sensitive Equality',
    'Sleepy Test', 'Unknown Test', 'Verbose Test'
]

# Keep only the smell-vs-flakiness block; a missing label raises KeyError.
filtered_df = df.loc[index_to_keep, columns_to_keep]

plt.figure(figsize=(10, 8))
sns.heatmap(filtered_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title("Filtered Correlation Heatmap")
# Axis labels describe what the axes hold: flakiness types on x, smells on y.
plt.xlabel('Flakiness Types')
plt.ylabel('Test Smells')
# Keep the long smell names on the y axis inside the figure.
plt.tight_layout()
plt.show()

# Save the filtered data to a new CSV (optional)
filtered_df.to_csv('/content/filtered_heatmap.csv', index=True)


# Significance

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import t
import seaborn as sns
import matplotlib.pyplot as plt

# Hard-coded correlation coefficients: one entry per test smell for each of
# the four flakiness types (ID, NOD, OD, OD-Vic).
# NOTE(review): presumably copied from the saved Spearman correlation matrix -
# confirm the values are current before relying on the result.
data = {
    'Smells Name': ['Assertion Roulette', 'Conditional Test Logic', 'Constructor Initialization',
                    'Duplicate Assert', 'Eager Test', 'EmptyTest', 'Exception Catching Throwing',
                    'General Fixture', 'IgnoredTest', 'Lazy Test', 'Magic Number Test',
                    'Mystery Guest', 'Print Statement', 'Redundant Assertion', 'Resource Optimism',
                    'Sensitive Equality', 'Sleepy Test', 'Unknown Test', 'Verbose Test'],
    'ID': [0.3837945947, 0.2869365078, 0.3198117193, 0.2617129525, 0.4066156391, 0.1478101162,
           0.3523641011, 0.4859859818, 0.2465690068, 0.3778286912, 0.4047169576, 0.2503716789,
           0.1725349117, 0.2175405406, 0.3058848929, 0.4440452583, -0.03132093892, 0.4589919716,
           0.2523770663],
    'NOD': [0.3712414186, 0.4709077868, 0.3848436998, 0.3434998942, 0.3695873995, 0.2505408773,
            0.2908361592, 0.09756566537, 0.262434638, 0.3127277996, 0.393970485, 0.1103735496,
            0.2961916451, 0.5430240888, 0.09134282393, 0.5096051749, 0.4536745075, 0.1786836534,
            0.438103865],
    'OD': [0.2971779072, 0.4906726393, 0.3116918607, 0.3526087765, 0.3065008816, 0.3458628886,
           0.5005632066, 0.2314075986, 0.3421774429, 0.2838937243, 0.4014508571, 0.1152560345,
           0.4402765073, 0.6018400053, 0.09055305203, 0.3921198188, 0.4253088895, 0.2773155047,
           0.3349776998],
    'OD-Vic': [-0.1278086926, 0.06388582158, 0.01090465857, -0.0378948328, -0.08917623154,
               0.3659288565, 0.2024359594, 0.04165073453, 0.009057580871, -0.08047611139,
               -0.06781685731, 0.154407501, 0.3457175445, 0.07714206152, 0.1253058177,
               -0.1745708541, 0.09869286212, 0.03533903602, -0.04831709259]
}

# Smells become the row index; the four flakiness types remain as columns.
df = pd.DataFrame(data)
df.set_index('Smells Name', inplace=True)

# Sample size used for the significance test (degrees of freedom = n - 2).
# NOTE(review): the surrounding analysis refers to 20 projects; if the
# correlations were computed over 20 observations, n should be 20 - confirm.
n = 30
def calculate_p_value(rho, n):
    """Return the two-sided p-value of a correlation coefficient.

    The coefficient is converted to a t statistic,
    ``t = rho * sqrt((n - 2) / (1 - rho**2))``, which is compared against a
    Student t distribution with ``n - 2`` degrees of freedom.

    Args:
        rho: Correlation coefficient (scalar or NumPy array-like). Values are
            clipped to [-0.999, 0.999] so the denominator ``1 - rho**2`` never
            reaches zero.
        n: Number of observations the coefficient was computed from.

    Returns:
        The two-sided p-value(s), with the same shape as ``rho``.
    """
    rho = np.clip(rho, -0.999, 0.999)
    dof = n - 2
    t_stat = rho * np.sqrt(dof / (1 - rho**2))
    # The survival function gives the upper tail directly; computing it as
    # 1 - cdf rounds to exactly 0 once cdf is within machine epsilon of 1.
    p_val = 2 * t.sf(np.abs(t_stat), df=dof)
    return p_val

# p-value of every coefficient, laid out exactly like df
# (smells as rows, flakiness types as columns).
p_values = pd.DataFrame(
    {
        column: [calculate_p_value(rho, n) for rho in df[column]]
        for column in df.columns
    },
    index=df.index,
)

# 1 where the coefficient is significant at the 5% level, 0 otherwise.
significance_df = p_values.lt(0.05).astype(int)

plt.figure(figsize=(10, 8))
axes = sns.heatmap(
    significance_df,
    annot=True,
    cmap='binary',
    fmt='d',
    linewidths=0.5,
    cbar=False,
)
axes.set_title(f"Significance Heatmap (1 = Significant, 0 = Not Significant, n = {n})")
axes.set_xlabel('Metrics')
axes.set_ylabel('Test Smells')
plt.tight_layout()
plt.show()

# Written relative to the current working directory.
significance_df.to_csv('20VS4_significance_heatmap.csv', index=True)
print("Significance matrix saved to '20VS4_significance_heatmap.csv'")