Frequency distribution of First-Last Distance.

In [7]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the word-distance table and pull out the column to be plotted
df = pd.read_csv('word_distances.csv')
distances = df['first-last dist']

# Draw a 30-bin histogram of the distances on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
ax.hist(distances, bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel('First-Last Distance')
ax.set_ylabel('Frequency')

# Write the figure to a PNG file, then close it
fig.savefig('dist_distribution.png')
plt.close(fig)

Precision at different First-Last Distances.

In [10]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the judgement table
df = pd.read_csv('judgement.csv')

# Order the rows by ascending 'first-last dist'
df_sorted = df.sort_values('first-last dist')

# Precision helper used for each window of rows
def calculate_precision(sub_df):
    """Return the fraction of rows in ``sub_df`` whose 'change' value is 'yes'.

    Parameters
    ----------
    sub_df : DataFrame
        Rows to evaluate; must contain a 'change' column.

    Returns
    -------
    number
        Count of rows equal to 'yes' divided by the number of rows,
        or 0 when ``sub_df`` has no rows.
    """
    row_count = len(sub_df)
    if row_count > 0:
        hits = (sub_df['change'] == 'yes').sum()
        return hits / row_count
    # No rows: report 0 rather than dividing by zero
    return 0

# Number of consecutive sorted rows that make up one window
window_size = 50

# One entry per window: its smallest distance and its precision
distances = []
precisions = []

# Walk the sorted rows in steps of window_size; the slice simply stops at the
# end of the frame, so the final window may hold fewer than window_size rows
for offset in range(0, len(df_sorted), window_size):
    window = df_sorted.iloc[offset:offset + window_size]
    distances.append(window['first-last dist'].min())
    precisions.append(calculate_precision(window))

# Line plot of precision against each window's minimum distance
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
ax.plot(distances, precisions, marker='o', linestyle='-', color='b')
ax.set_xlabel('Minimum First-Last Distance in Window')
ax.set_ylabel('Precision')
fig.savefig('precision.png')
plt.close(fig)

Distribution of change times.

In [9]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the judgement table
df = pd.read_csv('judgement.csv')

# 2. Keep an independent copy of only the rows whose 'change' value is 'yes'
is_change = df['change'] == 'yes'
df_change = df.loc[is_change].copy()

# 3. Parse the 'time' column to extract year(s) and store them in a list
def parse_years(time_str):
    """Convert one 'time' cell into a list of integer years.

    Parameters
    ----------
    time_str : str, number, or missing value
        Either a single year ('1985') or several years separated by '/'
        ('1990/2000'). Non-string values are accepted as well, because a
        CSV column can come back from pandas as numbers or as NaN for
        blank cells.

    Returns
    -------
    list of int
        The years in the order they appear. A missing value (None, NaN,
        pd.NA) or a blank string yields an empty list.

    Raises
    ------
    ValueError
        If a non-empty piece of the string is not a valid integer literal.
    """
    if not isinstance(time_str, str):
        # Missing cell: contribute no years instead of failing on `'/' in <float>`
        if pd.isna(time_str):
            return []
        # Numeric cell (e.g. 1985 or 1985.0): a single year
        return [int(time_str)]

    # Split on every '/', not just the first, so any number of years is accepted;
    # empty pieces (blank string, trailing '/') are skipped
    return [int(piece) for piece in time_str.split('/') if piece.strip()]

# Store the parsed list of years for every row in a new 'years' column
df_change['years'] = df_change['time'].apply(parse_years)

# Gather the years from all rows into one flat list
all_years = []
for row_years in df_change['years']:
    all_years.extend(row_years)

# 4. Histogram of the years with bin edges every 5 years, starting at the
# earliest year; the stop value of max + 10 keeps the latest year inside a bin
first_edge = min(all_years)
edge_stop = max(all_years) + 10
fig, ax = plt.subplots(figsize=(6, 4), dpi=200)
ax.hist(all_years, bins=range(first_edge, edge_stop, 5), edgecolor='black')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
fig.savefig('time_distribution.png')
plt.close(fig)