### Monitor Data Quality Trends Over Time

**Task 1**: Create a Trends Analysis Report

**Objective**: Understand long-term data quality trends.

**Steps**:
1. Use historical data (or simulate data) to analyze how data quality has changed over time.
2. Calculate trends for the KPIs defined earlier using statistical measures or visual charts.
3. Write a report summarizing your findings, noting any persistent issues or improvements.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Simulate 12 months of data quality KPI scores
np.random.seed(42)

months = pd.date_range(start='2024-01-01', periods=12, freq='M')
accuracy = np.clip(np.random.normal(loc=85, scale=5, size=12), 70, 100)
completeness = np.clip(np.random.normal(loc=80, scale=7, size=12), 60, 100)
timeliness = np.clip(np.random.normal(loc=75, scale=10, size=12), 50, 100)

kpi_trends = pd.DataFrame({
    'Month': months,
    'Accuracy Rate': accuracy,
    'Completeness Score': completeness,
    'Timeliness Index': timeliness
})

print(kpi_trends)
# Set style
plt.style.use('seaborn-whitegrid')

# Plot the KPI trends
plt.figure(figsize=(10, 6))
plt.plot(kpi_trends['Month'], kpi_trends['Accuracy Rate'], label='Accuracy Rate', marker='o')
plt.plot(kpi_trends['Month'], kpi_trends['Completeness Score'], label='Completeness Score', marker='s')
plt.plot(kpi_trends['Month'], kpi_trends['Timeliness Index'], label='Timeliness Index', marker='^')

plt.title('Data Quality KPI Trends Over Time')
plt.xlabel('Month')
plt.ylabel('Percentage (%)')
plt.ylim(50, 100)
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



        Month  Accuracy Rate  Completeness Score  Timeliness Index
0  2024-01-31      87.483571           81.693736         69.556173
1  2024-02-29      84.308678           66.607038         76.109226
2  2024-03-31      88.238443           67.925575         63.490064
3  2024-04-30      92.615149           76.063987         78.756980
4  2024-05-31      83.829233           72.910182         68.993613
5  2024-06-30      83.829315           82.199731         72.083063
6  2024-07-31      92.896064           73.643831         68.982934
7  2024-08-31      88.837174           70.113874         93.522782
8  2024-09-30      82.652628           90.259541         74.865028
9  2024-10-31      87.712800           78.419566         64.422891
10 2024-11-30      82.682912           80.472697         83.225449
11 2024-12-31      82.671351           70.026763         62.791564


OSError: 'seaborn-whitegrid' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

**Task 2**: Evaluate Continuous Improvement Measures

**Objective**: Implement strategic changes based on trend analysis.

**Steps**:
1. Identify patterns or recurring issues from your trend analysis report.
2. Propose three continuous improvement strategies to address these issues.
3. Plan how to implement these strategies and measure their effectiveness over the next cycle.

In [None]:
# Write your code from here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Simulate historical KPI data for 12 months
np.random.seed(42)
months = pd.date_range(start='2024-01-01', periods=12, freq='M')

# Base KPI generation
accuracy = np.clip(np.random.normal(loc=85, scale=3, size=12), 75, 95)
completeness = np.clip(np.random.normal(loc=78, scale=5, size=12), 65, 90)
timeliness = np.clip(np.random.normal(loc=70, scale=8, size=12), 50, 90)

# Create initial KPI DataFrame
kpis = pd.DataFrame({
    'Month': months,
    'Accuracy Rate': accuracy,
    'Completeness Score': completeness,
    'Timeliness Index': timeliness
})

# 2. Plot original trends
plt.figure(figsize=(10, 6))
plt.plot(kpis['Month'], kpis['Accuracy Rate'], label='Accuracy Rate', marker='o')
plt.plot(kpis['Month'], kpis['Completeness Score'], label='Completeness Score', marker='s')
plt.plot(kpis['Month'], kpis['Timeliness Index'], label='Timeliness Index', marker='^')
plt.title('Original Data Quality KPI Trends (2024)')
plt.ylabel('Percentage (%)')
plt.ylim(50, 100)
plt.xlabel('Month')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Identify issues based on thresholds
issues = {
    'Low Accuracy Months': kpis[kpis['Accuracy Rate'] < 80]['Month'].dt.strftime('%B').tolist(),
    'Low Completeness Months': kpis[kpis['Completeness Score'] < 75]['Month'].dt.strftime('%B').tolist(),
    'Low Timeliness Months': kpis[kpis['Timeliness Index'] < 65]['Month'].dt.strftime('%B').tolist()
}

print("=== Data Quality Issues Identified ===")
for issue, months in issues.items():
    print(f"{issue}: {', '.join(months)}")

# 4. Simulate effect of improvement strategies in next 6 months
future_months = pd.date_range(start='2025-01-01', periods=6, freq='M')
accuracy_improved = np.clip(np.random.normal(loc=88, scale=2, size=6), 80, 95)
completeness_improved = np.clip(np.random.normal(loc=90, scale=2, size=6), 85, 95)
timeliness_improved = np.clip(np.random.normal(loc=85, scale=3, size=6), 80, 95)

future_kpis = pd.DataFrame({
    'Month': future_months,
    'Accuracy Rate': accuracy_improved,
    'Completeness Score': completeness_improved,
    'Timeliness Index': timeliness_improved
})

# 5. Combine both DataFrames
combined_kpis = pd.concat([kpis, future_kpis], ignore_index=True)

# 6. Visualize trend post-strategy
plt.figure(figsize=(10, 6))
plt.plot(combined_kpis['Month'], combined_kpis['Accuracy Rate'], label='Accuracy Rate', marker='o')
plt.plot(combined_kpis['Month'], combined_kpis['Completeness Score'], label='Completeness Score', marker='s')
plt.plot(combined_kpis['Month'], combined_kpis['Timeliness Index'], label='Timeliness Index', marker='^')
plt.title('KPI Trends Before and After Improvement Strategies')
plt.ylabel('Percentage (%)')
plt.ylim(50, 100)
plt.xlabel('Month')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 7. Summary statistics before and after
print("\n=== KPI Summary ===")
before_avg = kpis.mean(numeric_only=True).round(2)
after_avg = future_kpis.mean(numeric_only=True).round(2)
summary_df = pd.DataFrame({'Before Strategies': before_avg, 'After Strategies': after_avg})
print(summary_df)

