<a href="https://colab.research.google.com/github/OnzyBoy/Pandas_CDAV/blob/main/Kenya_Literacy_Trends_and_Insights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt #Visualisations
import seaborn as sns #Heatmap

In [None]:
# Load the cleaned literacy data and count records per literacy indicator (men/women),
# first across all survey years and then separately for each survey year.
df = pd.read_csv("literacy_levels_cleaned.csv")
gender_lit_type_count = df.value_counts("Indicator")

# Column used to select the rows belonging to one survey year
survey_year = df["SurveyYear"]

# 2003
yr_2003 = df.loc[survey_year == 2003]
yr_2003_lit_count = yr_2003.value_counts("Indicator")

# 2008
yr_2008 = df.loc[survey_year == 2008]
yr_2008_lit_count = yr_2008.value_counts("Indicator")

# 2014
yr_2014 = df.loc[survey_year == 2014]
yr_2014_lit_count = yr_2014.value_counts("Indicator")

# 2015
yr_2015 = df.loc[survey_year == 2015]
yr_2015_lit_count = yr_2015.value_counts("Indicator")

# 2020
yr_2020 = df.loc[survey_year == 2020]
yr_2020_lit_count = yr_2020.value_counts("Indicator")

# 2022
yr_2022 = df.loc[survey_year == 2022]
yr_2022_lit_count = yr_2022.value_counts("Indicator")

# Last expression of the cell: shown as the notebook output
yr_2022_lit_count


FileNotFoundError: [Errno 2] No such file or directory: 'literacy_levels_cleaned.csv'

In [None]:
# Print the per-indicator record counts (men/women) for every survey year.
# The per-year count Series are produced by the previous cell.
counts_by_year = {
    2003: yr_2003_lit_count,
    2008: yr_2008_lit_count,
    2014: yr_2014_lit_count,
    2015: yr_2015_lit_count,
    2020: yr_2020_lit_count,
    2022: yr_2022_lit_count,
}
for year, counts in counts_by_year.items():
    # The data set is about literacy, so the heading says "literacy test"
    print(f"Men and Women who took a literacy test in {year}: \n{counts} \n")

# Author's observation: in 2015 and 2020 only women took the literacy tests (peak in 2015).

# Percentage share of each literacy category across all survey years
gender_lit_type_pct = (gender_lit_type_count / gender_lit_type_count.sum()) * 100
print(gender_lit_type_pct)

# Author's observation: each survey collects the same number of entries per category.


In [None]:
# Bar chart: number of records in each literacy category (all survey years together).
fig, axis = plt.subplots(figsize=(10, 6))
bars = gender_lit_type_count.plot(
    kind="bar", color="skyblue", edgecolor="black", ax=axis
)

axis.set_title("Distribution of Literacy Categories (Men & Women)", fontsize=14)
axis.set_ylabel("Count", fontsize=12)
axis.set_xlabel("Literacy Category", fontsize=12)
axis.tick_params(axis="x", labelrotation=90)

# Write each count just above its bar (bars sit at x positions 0, 1, 2, ...)
for position, count in enumerate(gender_lit_type_count):
    axis.text(
        position, count + 0.5, str(count), ha="center", va="bottom", fontsize=9
    )

fig.tight_layout()
plt.show()



In [None]:
# Number of records for every (survey year, indicator) pair;
# combinations that never occur are filled with 0.
year_indicator_sizes = df.groupby(["SurveyYear", "Indicator"]).size()
lit_counts_per_year = year_indicator_sizes.unstack(fill_value=0)

# Grouped bar chart: one group per literacy category, one bar per survey year
ax = lit_counts_per_year.transpose().plot(kind="bar", figsize=(16, 8), width=0.8)

ax.set_title("Distribution of Literacy Categories by Year", fontsize=16)
ax.set_xlabel("Literacy Category", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.tick_params(axis="x", labelrotation=90)
ax.legend(title="Survey Year")
ax.grid(axis="y", linestyle="--", alpha=0.7)

# Label every bar with its count (one container per survey year)
for year_bars in ax.containers:
    ax.bar_label(year_bars, fmt='%d', label_type='edge', fontsize=8)

ax.figure.tight_layout()
plt.show()

In [None]:
# Line chart of the mean "Value" of the two headline indicators
# ('Men who are literate' / 'Women who are literate') per survey year.

# Keep only the two headline indicators
lit_df = df[df["Indicator"].isin(["Men who are literate", "Women who are literate"])]

# Mean value per survey year, one column per indicator.
# A year in which an indicator has no rows becomes NaN in that column.
avg_lit = lit_df.groupby(["SurveyYear", "Indicator"])["Value"].mean().unstack()

# Plot trend
ax = avg_lit.plot(kind="line", marker="o", figsize=(10, 6))
plt.title("Literacy Levels Over Time (Men vs Women)", fontsize=16)
plt.ylabel("Average Literacy (%)", fontsize=12)
plt.xlabel("Survey Year", fontsize=12)
plt.grid(True, linestyle="--", alpha=0.7)
plt.legend(title="Gender")

# Add a value label above each plotted point.
# The y values are read from avg_lit rather than from the drawn line so that
# missing (NaN) points can be detected and skipped; labelling them would emit
# warnings and place meaningless text on the chart.
# Assumes the lines in ax.lines are in the same order as avg_lit.columns.
for line, indicator in zip(ax.lines, avg_lit.columns):
    for x, y in zip(line.get_xdata(), avg_lit[indicator].to_numpy()):
        if pd.isna(y):
            continue
        ax.text(x, y + 0.3, f"{y:.1f}", ha='center', fontsize=9)

plt.show()


NameError: name 'df' is not defined

In [None]:
# Split the records into men-only and women-only subsets (all survey years) and
# count, per location, how many records each indicator has.

# \b is a word boundary, so "Men" does not match inside "Women"
is_men_indicator = df["Indicator"].str.contains(r"\bMen\b", case=False, regex=True)
is_women_indicator = df["Indicator"].str.contains(r"\bWomen\b", case=False, regex=True)

men_df = df[is_men_indicator]
women_df = df[is_women_indicator]

# Indicator counts within each location, women then men
location_women = women_df.groupby("Location")["Indicator"].value_counts()
location_men = men_df.groupby("Location")["Indicator"].value_counts()

# Men are printed in full; women are shown as the cell's output
print(location_men.to_string())
location_women


In [None]:
# Mean literacy value per location, highest first (shown as the cell's output)
mean_by_location = df.groupby("Location")["Value"].mean()
mean_by_location.sort_values(ascending=False)

NameError: name 'df' is not defined

In [None]:
# Mean literacy value for each survey year
values_by_year = df.groupby("SurveyYear")["Value"]
avg_per_year = values_by_year.mean()
print("\nAverage literacy per year:\n", avg_per_year)

In [None]:
# # # Top 3 regions
# top_3 = df.groupby(["Location","SurveyYear"])["Value"].mean().sort_values(ascending=False).head(3)
# print("Top 3 regions:\n", top_3)

# # Bottom 3 regions
# bottom_3 = df.groupby(["Location","SurveyYear"])["Value"].mean().sort_values().head(3)
# print("\nBottom 3 regions:\n", bottom_3)

In [None]:
# Top 3 and bottom 3 locations by mean literacy value within each survey year.
#
# The per-(year, location) mean table is computed once and reused. Rankings use
# sort_values + groupby().head(3) rather than groupby().apply(lambda ...):
# calling apply on a frame that still contains the grouping column is
# deprecated in pandas 2.2+, and once the grouping column is excluded from the
# groups, `.apply(nsmallest).reset_index(drop=True)` loses the SurveyYear column.

# Average literacy per location per year (columns: SurveyYear, Location, Value)
avg_per_year = df.groupby(["SurveyYear", "Location"])["Value"].mean().reset_index()

# Top 3 regions per survey year: within each year, highest values first
top_3_per_year = (
    avg_per_year.sort_values(["SurveyYear", "Value"], ascending=[True, False])
    .groupby("SurveyYear")
    .head(3)
)

print("Top 3 per survey year : ")
# Printed as a Series indexed by (SurveyYear, Location)
print(top_3_per_year.set_index(["SurveyYear", "Location"])["Value"])

# Bottom 3 regions per survey year: within each year, lowest values first
sorted_df = avg_per_year.sort_values(["SurveyYear", "Value"], ascending=[True, True])
bottom_3_per_year_no_lambda = sorted_df.groupby("SurveyYear").head(3)

# Same rows with a fresh 0..n-1 index. The "_lambda" name is kept only so that
# existing references to it keep working; no lambda is involved any more.
bottom_3_per_year_lambda = bottom_3_per_year_no_lambda.reset_index(drop=True)

print("Bottom 3 per survey year: ")
print(bottom_3_per_year_lambda)

NameError: name 'df' is not defined

In [None]:
# Gender literacy gap: mean value per survey year and gender.
# The gender is the leading "Men" / "Women" word of the indicator text;
# indicators starting with neither are left missing.
indicator_gender = df["Indicator"].str.extract(r'^(Men|Women)', expand=False)
df["Gender"] = indicator_gender

value_by_year_gender = df.groupby(["SurveyYear", "Gender"])["Value"]
gender_avg = value_by_year_gender.mean()
print(gender_avg)


In [None]:
# Mean value of every indicator within each survey year, printed in full
value_by_year_indicator = df.groupby(["SurveyYear", "Indicator"])["Value"]
indicator_avg = value_by_year_indicator.mean()
print(indicator_avg.to_string())


In [None]:
# Location vs Value: correlation between "record belongs to location X" and
# the literacy value, for every location.

# One indicator column per location, plus the literacy value alongside
location_dummies = pd.get_dummies(df["Location"])
df_encoded = location_dummies.assign(Value=df["Value"])

# Correlation of every column with "Value", strongest positive first
# (the "Value" row itself is included)
value_correlations = df_encoded.corr()["Value"]
correlation_matrix = value_correlations.sort_values(ascending=False)
print(correlation_matrix)


In [None]:
# Literacy vs time: does the yearly mean value move with the survey year?

# One row per survey year with its mean value (columns: SurveyYear, Value)
yearly_avg = df.groupby("SurveyYear", as_index=False)["Value"].mean()

# Correlation between the survey year and that year's mean value
survey_years = yearly_avg["SurveyYear"]
yearly_values = yearly_avg["Value"]
year_corr = survey_years.corr(yearly_values)
print("Correlation between Year and Literacy Value:", year_corr)

print(yearly_avg)


In [None]:
# Heatmap of the correlation between survey year and yearly mean value
corr_matrix = yearly_avg.corr()

fig, heatmap_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    corr_matrix,
    annot=True,  # write the coefficient inside each cell
    cmap="coolwarm",
    fmt=".2f",
    cbar=True,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap: Year vs Literacy Value", fontsize=14)
plt.show()


NameError: name 'yearly_avg' is not defined

In [None]:
# Literacy vs gender: yearly mean value per gender and the correlation between
# the men's and women's yearly means.

# Gender is the leading "Men" / "Women" word of the indicator text. The
# vectorised extract leaves indicators that start with neither word (or are
# missing) as NaN, so they drop out of the group-by instead of being counted
# as "Women".
df['Gender'] = df['Indicator'].str.extract(r'^(Men|Women)', expand=False)

# Average literacy by gender per survey year (rows: SurveyYear, columns: Men / Women)
gender_avg = df.groupby(["SurveyYear", "Gender"])["Value"].mean().unstack()

print("\nAverage Literacy by Gender per Year:\n", gender_avg)

# Correlation between men and women literacy across years
# (Series.corr only uses years where both genders have a value)
gender_corr = gender_avg["Men"].corr(gender_avg["Women"])
print("\nCorrelation between Men's and Women's literacy levels:", gender_corr)


In [None]:
# Preview the first five rows of the data (shown as the cell's output)
df.head(n=5)

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds text
# columns (e.g. Indicator, Location), and on pandas >= 2.0 DataFrame.corr()
# raises ValueError unless non-numeric columns are excluded explicitly.
df.corr(numeric_only=True)