In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Load CSV (first column = correct text, second column = user-typed text).
# dtype=str keeps numeric-looking entries as text, and keep_default_na=False
# stops pandas from turning empty cells or words such as "NA", "null" or "nan"
# into float NaN, which has no .replace() and would break the comparison below.
df = pd.read_csv("typing_data.csv", header=None, dtype=str, keep_default_na=False)

# List accumulating one record (dict) per detected error; analyze_errors() appends to it.
error_data = []


def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN) to ""."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def analyze_errors(correct, typed):
    """Compare two texts index by index (ignoring spaces) and record mismatches.

    The comparison is index-aligned, not an edit-distance alignment: a single
    dropped or added character shifts everything after it, so the following
    characters are reported as substitutions.

    Every error is appended to the module-level ``error_data`` list as a dict
    with the keys ``expected``, ``typed``, ``position`` (0-based index into the
    space-stripped text) and ``error_type``.

    Args:
        correct: The reference text. Missing values (None/NaN) count as empty;
            other non-string values are converted with ``str()``.
        typed: The text the user typed; coerced the same way.

    Returns:
        None. Results are appended to ``error_data``.
    """
    correct = _as_text(correct).replace(" ", "")
    typed = _as_text(typed).replace(" ", "")

    min_length = min(len(correct), len(typed))

    # Overlapping part: same index, different character -> substitution.
    for i, (expected_char, typed_char) in enumerate(zip(correct, typed)):
        if expected_char != typed_char:
            error_data.append({
                "expected": expected_char,
                "typed": typed_char,
                "position": i,
                "error_type": "substitution",
            })

    # Reference text is longer: the tail was never typed.
    for i in range(min_length, len(correct)):
        error_data.append({
            "expected": correct[i],
            "typed": "_MISSING_",
            "position": i,
            "error_type": "deletion",
        })

    # Typed text is longer: the tail is surplus input.
    for i in range(min_length, len(typed)):
        error_data.append({
            "expected": "_EXTRA_",
            "typed": typed[i],
            "position": i,
            "error_type": "insertion",
        })

# Process each row in the CSV (column 0 = correct text, column 1 = typed text).
# analyze_errors() records only the absolute character index of each error, so
# the records it appends for a row are tagged here with that index divided by
# the row's (space-stripped) text length. The location analysis further down
# needs this relative position to compare texts of different lengths.
for correct_text, typed_text in zip(df[0], df[1]):
    first_new_record = len(error_data)
    analyze_errors(correct_text, typed_text)

    # Longer of the two space-stripped texts; missing cells count as empty.
    text_length = max(
        (
            len(str(text).replace(" ", ""))
            for text in (correct_text, typed_text)
            if not pd.isna(text)
        ),
        default=0,
    )
    # Every recorded position is an index into one of the two stripped texts,
    # so it is < text_length and this never divides by zero.
    for record in error_data[first_new_record:]:
        record["relative_position"] = record["position"] / text_length

# Convert to DataFrame
error_df = pd.DataFrame(error_data)

if error_df.empty:
    # Without any records the 'error_type' / 'position' columns do not exist,
    # so there is nothing to count or plot.
    print("No typing errors found - nothing to analyze.")
else:
    # Error Type Frequency (Substitution, Insertion, Deletion)
    error_type_counts = error_df['error_type'].value_counts()

    # Plotting Error Type Frequencies
    plt.figure(figsize=(8, 6))
    error_type_counts.plot(kind='bar', color=['lightblue', 'lightgreen', 'salmon'])
    plt.title('Error Type Frequency')
    plt.xlabel('Error Type')
    plt.ylabel('Frequency')
    plt.xticks(rotation=0)
    plt.show()

    # Suggestion 1: Focus on the most frequent error types.
    # value_counts() is sorted by frequency, so the most common type prints first.
    type_suggestions = {
        "substitution": ("Substitution", "Pay attention to key positions and common typos like character pairs."),
        "deletion": ("Deletion", "Slow down a bit and focus on not missing characters."),
        "insertion": ("Insertion", "Ensure you're typing at the correct pace and avoiding extra characters."),
    }
    print("Most Frequent Error Types:")
    for error_type, count in error_type_counts.items():
        if error_type in type_suggestions:
            label, advice = type_suggestions[error_type]
            print(f"\n- {label} Errors: {count} mistakes.")
            print(f"  Suggestion: {advice}")

    # Error Location Analysis
    # Split each text into thirds (beginning, middle, end) using the position
    # of the error relative to the length of the text it occurred in.
    beginning_threshold = 1 / 3
    middle_threshold = 2 / 3
    relative_position = error_df['relative_position']

    beginning_errors = error_df[relative_position < beginning_threshold]
    middle_errors = error_df[(relative_position >= beginning_threshold) & (relative_position < middle_threshold)]
    end_errors = error_df[relative_position >= middle_threshold]

    # Plotting error location analysis
    location_names = ['Beginning', 'Middle', 'End']
    location_counts = [len(beginning_errors), len(middle_errors), len(end_errors)]
    plt.figure(figsize=(8, 6))
    plt.bar(location_names, location_counts, color='lightcoral')
    plt.title('Error Location Frequency')
    plt.xlabel('Text Location')
    plt.ylabel('Error Frequency')
    plt.show()

    # Suggestion 2: Focus on specific locations where you make most errors.
    location_suggestions = {
        'Beginning': (
            "\nYou make most errors at the beginning of the text.",
            "Suggestion: Practice focusing on accuracy in the first few characters.",
        ),
        'Middle': (
            "\nYou make most errors in the middle of the text.",
            "Suggestion: Ensure you're not rushing through the middle of the text.",
        ),
        'End': (
            "\nYou make most errors towards the end of the text.",
            "Suggestion: Slow down toward the end, as errors tend to increase when you're nearing completion.",
        ),
    }
    # list.index() returns the first maximum, so a tie is reported for the
    # earliest tied location rather than defaulting to "end".
    worst_location = location_names[location_counts.index(max(location_counts))]
    for line in location_suggestions[worst_location]:
        print(line)

# Suggestion 3: Analyze if typing speed affects errors (this assumes you have timing data, e.g., time per character/word)
# If no timing data is available, we skip this suggestion, but if timing data were available, you'd use it to correlate errors with typing speed.


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Load CSV (first column = correct text, second column = user-typed text).
# dtype=str keeps numeric-looking entries as text, and keep_default_na=False
# stops pandas from turning empty cells or words such as "NA", "null" or "nan"
# into float NaN, which has no .replace() and would break the comparison below.
df = pd.read_csv("typing_data.csv", header=None, dtype=str, keep_default_na=False)

# List accumulating one record (dict) per detected error; analyze_errors() appends to it.
error_data = []


def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN) to ""."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def analyze_errors(correct, typed):
    """Compare two texts index by index (ignoring spaces) and record mismatches.

    The comparison is index-aligned, not an edit-distance alignment: a single
    dropped or added character shifts everything after it, so the following
    characters are reported as substitutions.

    Every error is appended to the module-level ``error_data`` list as a dict
    with the keys ``expected``, ``typed``, ``prev_char``, ``next_char`` and
    ``error_type``. The neighbouring characters come from the reference text
    for substitutions and deletions and from the typed text for insertions;
    ``"START"`` / ``"END"`` mark the text boundaries.

    Args:
        correct: The reference text. Missing values (None/NaN) count as empty;
            other non-string values are converted with ``str()``.
        typed: The text the user typed; coerced the same way.

    Returns:
        None. Results are appended to ``error_data``.
    """
    correct = _as_text(correct).replace(" ", "")
    typed = _as_text(typed).replace(" ", "")

    min_length = min(len(correct), len(typed))
    last_correct = len(correct) - 1
    last_typed = len(typed) - 1

    # Overlapping part: same index, different character -> substitution.
    for i, (expected_char, typed_char) in enumerate(zip(correct, typed)):
        if expected_char != typed_char:
            error_data.append({
                "expected": expected_char,
                "typed": typed_char,
                "prev_char": correct[i - 1] if i > 0 else "START",
                "next_char": correct[i + 1] if i < last_correct else "END",
                "error_type": "substitution",
            })

    # Reference text is longer: the tail was never typed.
    for i in range(min_length, len(correct)):
        error_data.append({
            "expected": correct[i],
            "typed": "_MISSING_",
            "prev_char": correct[i - 1] if i > 0 else "START",
            "next_char": correct[i + 1] if i < last_correct else "END",
            "error_type": "deletion",
        })

    # Typed text is longer: the tail is surplus input.
    for i in range(min_length, len(typed)):
        error_data.append({
            "expected": "_EXTRA_",
            "typed": typed[i],
            "prev_char": typed[i - 1] if i > 0 else "START",
            "next_char": typed[i + 1] if i < last_typed else "END",
            "error_type": "insertion",
        })

# Process each row in the CSV (column 0 = correct text, column 1 = typed text).
for correct_text, typed_text in zip(df[0], df[1]):
    analyze_errors(correct_text, typed_text)

# Convert to DataFrame
error_df = pd.DataFrame(error_data)

# Display top 5 errors in a user-friendly way
print("Top 5 Common Typing Errors:")
if error_df.empty:
    # With no records the 'expected' / 'typed' columns do not exist and
    # groupby() would raise a KeyError.
    print("No typing errors found.")
else:
    # Find most common mistakes: count each (expected, typed) pair, keep the 5 largest.
    mistake_counts = error_df.groupby(["expected", "typed"]).size().reset_index(name="count")
    mistake_counts = mistake_counts.sort_values(by="count", ascending=False).head(5)

    for expected_char, typed_char, times in zip(
        mistake_counts["expected"], mistake_counts["typed"], mistake_counts["count"]
    ):
        if expected_char == "_EXTRA_":
            print(f"Oops! You typed an extra character '{typed_char}' instead of leaving it blank. This happened {times} times.")
        elif typed_char == "_MISSING_":
            # Deletions carry the "_MISSING_" marker in 'typed'; the character
            # that was skipped is the one in 'expected'.
            print(f"Oops! You missed the character '{expected_char}' which was expected. This happened {times} times.")
        else:
            print(f"Oops! You mistyped '{typed_char}' instead of '{expected_char}'. This happened {times} times.")


In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt

# Load CSV (first column = correct text, second column = user-typed text).
# dtype=str keeps numeric-looking entries as text, and keep_default_na=False
# stops pandas from turning empty cells or words such as "NA", "null" or "nan"
# into float NaN, which has no .replace() and would break the comparison below.
df = pd.read_csv("typing_data.csv", header=None, dtype=str, keep_default_na=False)

# List accumulating one record (dict) per detected error; analyze_errors() appends to it.
error_data = []


def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN) to ""."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def analyze_errors(correct, typed):
    """Compare two texts index by index (ignoring spaces) and record mismatches.

    The comparison is index-aligned, not an edit-distance alignment: a single
    dropped or added character shifts everything after it, so the following
    characters are reported as substitutions.

    Every error is appended to the module-level ``error_data`` list as a dict
    with the keys ``expected``, ``typed``, ``prev_char``, ``next_char`` and
    ``error_type``. The neighbouring characters come from the reference text
    for substitutions and deletions and from the typed text for insertions;
    ``"START"`` / ``"END"`` mark the text boundaries.

    Args:
        correct: The reference text. Missing values (None/NaN) count as empty;
            other non-string values are converted with ``str()``.
        typed: The text the user typed; coerced the same way.

    Returns:
        None. Results are appended to ``error_data``.
    """
    correct = _as_text(correct).replace(" ", "")
    typed = _as_text(typed).replace(" ", "")

    min_length = min(len(correct), len(typed))
    last_correct = len(correct) - 1
    last_typed = len(typed) - 1

    # Overlapping part: same index, different character -> substitution.
    for i, (expected_char, typed_char) in enumerate(zip(correct, typed)):
        if expected_char != typed_char:
            error_data.append({
                "expected": expected_char,
                "typed": typed_char,
                "prev_char": correct[i - 1] if i > 0 else "START",
                "next_char": correct[i + 1] if i < last_correct else "END",
                "error_type": "substitution",
            })

    # Reference text is longer: the tail was never typed.
    for i in range(min_length, len(correct)):
        error_data.append({
            "expected": correct[i],
            "typed": "_MISSING_",
            "prev_char": correct[i - 1] if i > 0 else "START",
            "next_char": correct[i + 1] if i < last_correct else "END",
            "error_type": "deletion",
        })

    # Typed text is longer: the tail is surplus input.
    for i in range(min_length, len(typed)):
        error_data.append({
            "expected": "_EXTRA_",
            "typed": typed[i],
            "prev_char": typed[i - 1] if i > 0 else "START",
            "next_char": typed[i + 1] if i < last_typed else "END",
            "error_type": "insertion",
        })

# Process each row in the CSV (column 0 = correct text, column 1 = typed text).
for correct_text, typed_text in zip(df[0], df[1]):
    analyze_errors(correct_text, typed_text)

# Convert to DataFrame
error_df = pd.DataFrame(error_data)

# Display top 5 errors in a user-friendly way
print("Top 5 Common Typing Errors:")
if error_df.empty:
    # With no records the 'expected' / 'typed' columns do not exist and
    # groupby() would raise a KeyError.
    print("No typing errors found.")
else:
    # Find most common mistakes: count each (expected, typed) pair, keep the 5 largest.
    mistake_counts = error_df.groupby(["expected", "typed"]).size().reset_index(name="count")
    mistake_counts = mistake_counts.sort_values(by="count", ascending=False).head(5)

    for expected_char, typed_char, times in zip(
        mistake_counts["expected"], mistake_counts["typed"], mistake_counts["count"]
    ):
        if expected_char == "_EXTRA_":
            print(f"Oops! You typed an extra character '{typed_char}' instead of leaving it blank. This happened {times} times.")
        elif typed_char == "_MISSING_":
            # Deletions carry the "_MISSING_" marker in 'typed'; the character
            # that was skipped is the one in 'expected'.
            print(f"Oops! You missed the character '{expected_char}' which was expected. This happened {times} times.")
        else:
            print(f"Oops! You mistyped '{typed_char}' instead of '{expected_char}'. This happened {times} times.")



# Start the position-based analysis from an empty list. The records collected
# earlier in this cell use a different schema (prev_char/next_char, no
# position) and would otherwise be counted a second time below.
error_data = []


def _as_text(value):
    """Return *value* as a string, mapping missing cells (None/NaN) to ""."""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


# Function to compare two strings character by character (ignoring spaces)
def analyze_errors(correct, typed):
    """Compare two texts index by index (ignoring spaces) and record mismatches.

    The comparison is index-aligned, not an edit-distance alignment: a single
    dropped or added character shifts everything after it, so the following
    characters are reported as substitutions.

    Every error is appended to the module-level ``error_data`` list as a dict
    with the keys ``expected``, ``typed``, ``position`` (0-based index into the
    space-stripped text) and ``error_type``.

    Args:
        correct: The reference text. Missing values (None/NaN) count as empty;
            other non-string values are converted with ``str()``.
        typed: The text the user typed; coerced the same way.

    Returns:
        None. Results are appended to ``error_data``.
    """
    correct = _as_text(correct).replace(" ", "")
    typed = _as_text(typed).replace(" ", "")

    min_length = min(len(correct), len(typed))

    # Overlapping part: same index, different character -> substitution.
    for i, (expected_char, typed_char) in enumerate(zip(correct, typed)):
        if expected_char != typed_char:
            error_data.append({
                "expected": expected_char,
                "typed": typed_char,
                "position": i,
                "error_type": "substitution",
            })

    # Reference text is longer: the tail was never typed.
    for i in range(min_length, len(correct)):
        error_data.append({
            "expected": correct[i],
            "typed": "_MISSING_",
            "position": i,
            "error_type": "deletion",
        })

    # Typed text is longer: the tail is surplus input.
    for i in range(min_length, len(typed)):
        error_data.append({
            "expected": "_EXTRA_",
            "typed": typed[i],
            "position": i,
            "error_type": "insertion",
        })

# Process each row in the CSV (column 0 = correct text, column 1 = typed text).
# analyze_errors() records only the absolute character index of each error, so
# the records it appends for a row are tagged here with that index divided by
# the row's (space-stripped) text length. The location analysis further down
# needs this relative position to compare texts of different lengths.
for correct_text, typed_text in zip(df[0], df[1]):
    first_new_record = len(error_data)
    analyze_errors(correct_text, typed_text)

    # Longer of the two space-stripped texts; missing cells count as empty.
    text_length = max(
        (
            len(str(text).replace(" ", ""))
            for text in (correct_text, typed_text)
            if not pd.isna(text)
        ),
        default=0,
    )
    # Every recorded position is an index into one of the two stripped texts,
    # so it is < text_length and this never divides by zero.
    for record in error_data[first_new_record:]:
        record["relative_position"] = record["position"] / text_length

# Convert to DataFrame
error_df = pd.DataFrame(error_data)

if error_df.empty:
    # Without any records the 'error_type' / 'position' columns do not exist,
    # so there is nothing to count or plot.
    print("No typing errors found - nothing to analyze.")
else:
    # Error Type Frequency (Substitution, Insertion, Deletion)
    error_type_counts = error_df['error_type'].value_counts()

    # Plotting Error Type Frequencies
    plt.figure(figsize=(8, 6))
    error_type_counts.plot(kind='bar', color=['lightblue', 'lightgreen', 'salmon'])
    plt.title('Error Type Frequency')
    plt.xlabel('Error Type')
    plt.ylabel('Frequency')
    plt.xticks(rotation=0)
    plt.show()

    # Suggestion 1: Focus on the most frequent error types.
    # value_counts() is sorted by frequency, so the most common type prints first.
    type_suggestions = {
        "substitution": ("Substitution", "Pay attention to key positions and common typos like character pairs."),
        "deletion": ("Deletion", "Slow down a bit and focus on not missing characters."),
        "insertion": ("Insertion", "Ensure you're typing at the correct pace and avoiding extra characters."),
    }
    print("Most Frequent Error Types:")
    for error_type, count in error_type_counts.items():
        if error_type in type_suggestions:
            label, advice = type_suggestions[error_type]
            print(f"\n- {label} Errors: {count} mistakes.")
            print(f"  Suggestion: {advice}")

    # Error Location Analysis
    # Split each text into thirds (beginning, middle, end) using the position
    # of the error relative to the length of the text it occurred in.
    beginning_threshold = 1 / 3
    middle_threshold = 2 / 3
    relative_position = error_df['relative_position']

    beginning_errors = error_df[relative_position < beginning_threshold]
    middle_errors = error_df[(relative_position >= beginning_threshold) & (relative_position < middle_threshold)]
    end_errors = error_df[relative_position >= middle_threshold]

    # Plotting error location analysis
    location_names = ['Beginning', 'Middle', 'End']
    location_counts = [len(beginning_errors), len(middle_errors), len(end_errors)]
    plt.figure(figsize=(8, 6))
    plt.bar(location_names, location_counts, color='lightcoral')
    plt.title('Error Location Frequency')
    plt.xlabel('Text Location')
    plt.ylabel('Error Frequency')
    plt.show()

    # Suggestion 2: Focus on specific locations where you make most errors.
    location_suggestions = {
        'Beginning': (
            "\nYou make most errors at the beginning of the text.",
            "Suggestion: Practice focusing on accuracy in the first few characters.",
        ),
        'Middle': (
            "\nYou make most errors in the middle of the text.",
            "Suggestion: Ensure you're not rushing through the middle of the text.",
        ),
        'End': (
            "\nYou make most errors towards the end of the text.",
            "Suggestion: Slow down toward the end, as errors tend to increase when you're nearing completion.",
        ),
    }
    # list.index() returns the first maximum, so a tie is reported for the
    # earliest tied location rather than defaulting to "end".
    worst_location = location_names[location_counts.index(max(location_counts))]
    for line in location_suggestions[worst_location]:
        print(line)

# Suggestion 3: Analyze if typing speed affects errors (this assumes you have timing data, e.g., time per character/word)
# If no timing data is available, we skip this suggestion, but if timing data were available, you'd use it to correlate errors with typing speed.
