In [4]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('discussion_experience_author.csv')

# Drop rows with any empty values
df = df.dropna()

# Convert 'start_date' and 'end_date' to datetime format
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])

df['pr_count'] = df['PR_count']
df['confusion'] = df['Confusion']
df['non_confusion'] = df['Non_confusion']


# Calculate the difference in years between 'end_date' and 'start_date'
df['years_difference'] = (df['end_date'] - df['start_date']).dt.total_seconds() / (365.25 * 24 * 3600)

# Create a new DataFrame with only the 'years_difference' column
final_df = df[['years_difference', 'pr_count', 'confusion','non_confusion']]

# Display the resulting DataFrame
print(final_df)


       years_difference  pr_count confusion non_confusion
1              8.199789       210         4             3
3              0.000000         1         0             1
4             10.110495       445         4            14
5              0.558318         2         2             0
6              2.625128        11         2             0
...                 ...       ...       ...           ...
30704          0.268196         4         0             2
30706          2.472621        22         1             2
30707          0.000000         1         0             3
30708          0.446480         5         2             2
30709          0.000000         1         0             2

[24745 rows x 4 columns]


In [17]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Sample DataFrame setup (replace with your actual DataFrame)
# df = pd.read_csv('your_file.csv')  # Uncomment and use this if loading from a CSV
df = final_df

# Convert relevant columns to numeric, handling non-numeric values as NaN
df['years_difference'] = pd.to_numeric(df['years_difference'], errors='coerce')
df['confusion'] = pd.to_numeric(df['confusion'], errors='coerce')

# Drop rows with NaN values
df = df.dropna(subset=['years_difference', 'confusion'])

# Create low and high experience groups based on the mean of 'years_difference'
low_experience = df[df['years_difference'] <= 2.0]
high_experience = df[df['years_difference'] > 2.0]

# Perform the Mann-Whitney U test on confusion values between low and high experience groups
stat, p_value = mannwhitneyu(low_experience['confusion'], high_experience['confusion'], alternative='two-sided')

# Calculate median confusion values for interpretation
median_confusion_low = low_experience['confusion'].median()
median_confusion_high = high_experience['confusion'].median()

# Display the results
print("Mann-Whitney U Test between low and high experience confusion values")
print("U statistic:", stat)
print("p-value:", p_value)
print("Median confusion in low experience group:", median_confusion_low)
print("Median confusion in high experience group:", median_confusion_high)

# Interpretation based on p-value
if p_value < 0.05:  # commonly used significance level
    # Is higher experience associated with lower confusion? (i)
    if median_confusion_high < median_confusion_low:
        print("i) Higher experience developers have a lower rate of confusion: Yes")
    else:
        print("i) Higher experience developers have a lower rate of confusion: No")

    # Is lower experience associated with higher confusion? (ii)
    if median_confusion_low > median_confusion_high:
        print("ii) Lower experience developers have a higher rate of confusion: Yes")
    else:
        print("ii) Lower experience developers have a higher rate of confusion: No")
else:
    print("Fail to reject the null hypothesis: No significant difference found in confusion between low and high experience groups.")


Mann-Whitney U Test between low and high experience confusion values
U statistic: 54907916.0
p-value: 0.08286630836399064
Median confusion in low experience group: 1.0
Median confusion in high experience group: 1.0
Fail to reject the null hypothesis: No significant difference found in confusion between low and high experience groups.


In [18]:
import pandas as pd
from scipy.stats import mannwhitneyu

# Sample DataFrame setup (replace with your actual DataFrame)
# df = pd.read_csv('your_file.csv')  # Uncomment and use this if loading from a CSV
df = final_df

# Convert relevant columns to numeric, handling non-numeric values as NaN
df['years_difference'] = pd.to_numeric(df['years_difference'], errors='coerce')
df['confusion'] = pd.to_numeric(df['confusion'], errors='coerce')
df['pr_count'] = pd.to_numeric(df['pr_count'], errors='coerce')

# Drop rows with NaN values
df = df.dropna(subset=['pr_count', 'confusion'])

# Calculate the mean of 'pr_count' to split the data into two groups
mean_pr_count = df['pr_count'].mean()

# Create low and high pr_count groups based on the mean of 'pr_count'
low_pr_count = df[df['pr_count'] <= mean_pr_count]
high_pr_count = df[df['pr_count'] > mean_pr_count]

# Perform the Mann-Whitney U test on confusion values between low and high pr_count groups
stat, p_value = mannwhitneyu(low_pr_count['confusion'], high_pr_count['confusion'], alternative='two-sided')

# Calculate median confusion values for interpretation
median_confusion_low_pr = low_pr_count['confusion'].median()
median_confusion_high_pr = high_pr_count['confusion'].median()

# Display the results
print("Mann-Whitney U Test between low and high pr_count confusion values")
print("U statistic:", stat)
print("p-value:", p_value)
print("Median confusion in low pr_count group:", median_confusion_low_pr)
print("Median confusion in high pr_count group:", median_confusion_high_pr)

# Interpretation based on p-value
if p_value < 0.05:  # commonly used significance level
    # Is higher pr_count associated with lower confusion? (i)
    if median_confusion_high_pr < median_confusion_low_pr:
        print("i) More PR count developers have a lower rate of confusion: Yes")
    else:
        print("i) More PR count developers have a lower rate of confusion: No")

    # Is lower pr_count associated with higher confusion? (ii)
    if median_confusion_low_pr > median_confusion_high_pr:
        print("ii) Less PR count developers have a higher rate of confusion: Yes")
    else:
        print("ii) Less PR count developers have a higher rate of confusion: No")
else:
    print("Fail to reject the null hypothesis: No significant difference found in confusion between low and high pr_count groups.")


Mann-Whitney U Test between low and high pr_count confusion values
U statistic: 40737703.0
p-value: 0.00016798896227836416
Median confusion in low pr_count group: 1.0
Median confusion in high pr_count group: 1.0
i) More PR count developers have a lower rate of confusion: No
ii) Less PR count developers have a higher rate of confusion: No


In [25]:
# Sample DataFrame setup (replace with your actual DataFrame)
# df = pd.read_csv('your_file.csv')  # Uncomment and use this if loading from a CSV

# Filter the high experience group (years_difference > 2.0)
high_experience = df[df['years_difference'] > 2.0]

high_experience.sort_values(by='years_difference', ascending=False)

# Sort the high experience group by any column to get the top 100 rows (e.g., by years_difference)
top_100_high_experience = high_experience.head(1000)

# Check how many rows have 'confusion' > 'non_confusion'
top_100_high_experience['higher_confusion'] = top_100_high_experience['confusion'] > top_100_high_experience['non_confusion']

# Count the number of rows where confusion is higher than non_confusion
count_higher_confusion = top_100_high_experience['higher_confusion'].sum()

# Display the result
print(f"Number of rows where confusion > non_confusion in top 100 high experience developers: {count_higher_confusion}")


Number of rows where confusion > non_confusion in top 100 high experience developers: 152


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_high_experience['higher_confusion'] = top_100_high_experience['confusion'] > top_100_high_experience['non_confusion']


In [27]:

top_100_high_experience.sort_values(by='years_difference', ascending=False)

Unnamed: 0,years_difference,pr_count,confusion,non_confusion,higher_confusion
4704,13.191168,2343,1.0,0.0,True
4710,13.100734,324,0.0,2.0,False
1082,12.574651,715,7.0,48.0,False
1358,11.893573,34,2.0,7.0,False
1096,11.795157,271,4.0,21.0,False
...,...,...,...,...,...
3282,2.010086,689,1.0,3.0,False
847,2.006939,2,1.0,2.0,False
3206,2.006410,189,0.0,1.0,False
4615,2.000896,76,6.0,6.0,False


In [22]:
# Sample DataFrame setup (replace with your actual DataFrame)
# df = pd.read_csv('your_file.csv')  # Uncomment and use this if loading from a CSV

# Define the threshold for 'pr_count'. For example, consider pr_count > mean(pr_count) as 'high' experience
mean_pr_count = df['pr_count'].mean()

# Filter the high pr_count group (pr_count > mean_pr_count)
high_pr_count = df[df['pr_count'] > mean_pr_count]

# Sort the high pr_count group by any column to get the top 100 rows (e.g., by pr_count)
top_100_high_pr_count = high_pr_count.head(1000)

# Check how many rows have 'confusion' > 'non_confusion'
top_100_high_pr_count['higher_confusion'] = top_100_high_pr_count['confusion'] > top_100_high_pr_count['non_confusion']

# Count the number of rows where confusion is higher than non_confusion
count_higher_confusion_pr = top_100_high_pr_count['higher_confusion'].sum()

# Display the result
print(f"Number of rows where confusion > non_confusion in top 100 high pr_count developers: {count_higher_confusion_pr}")


Number of rows where confusion > non_confusion in top 100 high pr_count developers: 165


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100_high_pr_count['higher_confusion'] = top_100_high_pr_count['confusion'] > top_100_high_pr_count['non_confusion']


In [28]:
top_100_high_pr_count
top_100_high_pr_count.sort_values(by='pr_count', ascending=False)

Unnamed: 0,years_difference,pr_count,confusion,non_confusion,higher_confusion
1849,6.389574,4741,789.0,2328.0,False
3211,6.324232,3846,1.0,3.0,False
473,7.957849,2982,0.0,11.0,False
1385,10.991050,2965,21.0,41.0,False
6033,5.182372,2825,0.0,1.0,False
...,...,...,...,...,...
3237,2.375732,42,14.0,12.0,True
4242,1.806930,42,0.0,1.0,False
4248,1.580598,42,1.0,0.0,True
1280,5.002336,42,1.0,11.0,False
