# The Basics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Never truncate DataFrame output: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Load the workbook, using its first column as the DataFrame index.
df = pd.read_excel(
    'cleandataiqr2911.xlsx',
    index_col=0,
)

In [3]:
# Columns that identify a group of respondents in the pivots below.
identifier_columns = ['Country', 'ASBH02A']

# Background / demographic questionnaire items.
demographic_info_columns = [
    'ASBH02A', 'ASBH02B', 'ASBH03A', 'ASBH04',
    'ASBH15A', 'ASBH15B', 'ASBH16', 'ASBH17A', 'ASBH17B',
    'ASBH18AA', 'ASBH18AB',
    'ASBG01', 'ASBG03',
    'ASDAGE', 'MINAGEARRIVAL',
]

# ASBG10A..ASBG10F: agreement items about school (being in school, safety, belonging, ...).
positive_feelings_in_school = [f'ASBG10{suffix}' for suffix in 'ABCDEF']

# ASBG11A..ASBG11J: "how often" items about negative experiences at school.
negativeexperience_in_school_columns = [f'ASBG11{suffix}' for suffix in 'ABCDEFGHIJ']

# Five assessment families (REA, LIT, INF, IIE, RSI), each with columns numbered 01-05.
assessment_score_columns = [
    f'ASR{family}{number:02d}'
    for family in ('REA', 'LIT', 'INF', 'IIE', 'RSI')
    for number in range(1, 6)
]
# averages = ['reading_avg', 'literary_purpose_avg', 'informational_purpose_avg','interpreting_process_avg', 'straightforward_process_avg','avgscore']

 1: Agree a lot; 2: Agree a little; 3: Disagree a little; 4: Disagree a lot\
 
ASBG10A	GEN\AGREE\BEING IN SCHOOL\
ASBG10B	GEN\AGREE\SAFE AT SCHOOL\
ASBG10C	GEN\AGREE\BELONG AT SCHOOL\
ASBG10D	GEN\AGREE\TEACHERS ARE FAIR\
ASBG10E	GEN\AGREE\PROUD TO GO TO SCHOOL\
ASBG10F	GEN\AGREE\FRIENDS AT SCHOOL

In [4]:
# Inspect the distribution of MINAGEARRIVAL, counting missing values (NaN) as their own row.
df['MINAGEARRIVAL'].value_counts(dropna=False)

NaN    38469
0.0     1108
3.0      852
6.0      480
8.0      249
Name: MINAGEARRIVAL, dtype: int64

In [5]:
# Replace missing arrival ages with the sentinel -1.0 so they form an explicit group
# in later pivots (presumably these are respondents born in the country — TODO confirm).
df['MINAGEARRIVAL'] = df['MINAGEARRIVAL'].fillna(-1.0)

In [6]:
# Re-check the counts: the former NaN rows should now appear under -1.0.
df['MINAGEARRIVAL'].value_counts(dropna=False)

-1.0    38469
 0.0     1108
 3.0      852
 6.0      480
 8.0      249
Name: MINAGEARRIVAL, dtype: int64

# Feelings in School

## Feelings Heat Map

In [None]:
# Keep only the identifier columns plus the six "feelings in school" items.
df_feelings = df.loc[:, [*identifier_columns, *positive_feelings_in_school]]

In [None]:
# Mean response to each feelings item for every (Country, ASBH02A) combination.
pivot_feelings = df_feelings.pivot_table(
    index=identifier_columns,
    values=positive_feelings_in_school,
    aggfunc='mean',
)

In [None]:
# Row-wise mean across the six feelings items: one overall value per (Country, ASBH02A) row
pivot_feelings['MeanTotal'] = pivot_feelings[positive_feelings_in_school].mean(axis=1)

# Country names ordered from the lowest to the highest average MeanTotal
country_averages = (
    pivot_feelings['MeanTotal']
    .groupby(level='Country')
    .mean()
    .sort_values(ascending=True)
    .index
    .tolist()
)

# Display order for the ASBH02A answers within each country
asbh02a_sorter = ['Yes', 'No']

In [None]:
# Copy the pivot's (Country, ASBH02A) index into a DataFrame so its levels can be
# sorted as ordinary columns.
index_feelings = pivot_feelings.index.to_frame()

In [None]:
# Suffix the column names so they do not clash with the index level names, turn both
# columns into ordered categoricals (countries in country_averages order, ASBH02A in
# asbh02a_sorter order) and sort by country first, then by ASBH02A.
index_feelings = (
    index_feelings
    .rename(columns={'Country': 'Country_', 'ASBH02A': 'ASBH02A_'})
    .assign(
        Country_=lambda frame: pd.Categorical(frame['Country_'], categories=country_averages, ordered=True),
        ASBH02A_=lambda frame: pd.Categorical(frame['ASBH02A_'], categories=asbh02a_sorter, ordered=True),
    )
    .sort_values(by=['Country_', 'ASBH02A_'])
)

# Pull the pivot rows out in the freshly sorted index order
sorted_pivot_feelings = pivot_feelings.loc[index_feelings.index]

In [None]:
# Plot the heat map of the mean feelings responses (rows are already sorted by country
# and ASBH02A).
plt.figure(figsize=(10, 8))
sns.heatmap(sorted_pivot_feelings, annot=True, cmap='YlGnBu')
# Title corrected: this figure shows the "feelings in school" items, not the
# negative-experience items.
plt.title('Mean Feelings in School Heat Map Sorted by Country and Immigration Status')
plt.show()

## Feelings Average Distribution

In [None]:
# Per-respondent average over the six feelings items (row-wise mean).
df['feelingsavg'] = df[positive_feelings_in_school].mean(axis=1)

In [None]:
# Cut the averages into 10 bins; with an integer `bins`, pd.cut derives the bin edges
# from the observed range of the data.
df['feelingsavg_binned'] = pd.cut(df['feelingsavg'], bins=10)

In [None]:
# Count respondents per feelings-average bin for every (ASBH02A, Country) combination
contingency_feelings = pd.crosstab(
    index=[df['ASBH02A'], df['Country']],
    columns=df['feelingsavg_binned'],
)

In [None]:
# Reshape the contingency table to long form: one row per (ASBH02A, Country, bin)
contingency_feelings_reset = contingency_feelings.reset_index()
contingency_feelings_melted = (
    contingency_feelings_reset
    .melt(
        id_vars=['ASBH02A', 'Country'],
        var_name='feelingsavg_binned',
        value_name='count',
    )
    # Keep the bin labels as plain strings rather than Interval objects
    .assign(feelingsavg_binned=lambda long_df: long_df['feelingsavg_binned'].astype(str))
)

In [None]:
# Rows for respondents who answered 'Yes' to ASBH02A
df_yes = contingency_feelings_melted.loc[contingency_feelings_melted['ASBH02A'].eq('Yes')]

In [None]:
# Wide table of counts: one row per country, one column per bin
heatmap_yes = df_yes.pivot(
    index='Country',
    columns="feelingsavg_binned",
    values="count",
)

In [None]:
# Row totals (respondents per country). A "total" column left over from an earlier run
# of this cell is excluded before summing, so re-running the cell does not count the old
# total as data and halve every percentage.
heatmap_yes["total"] = heatmap_yes.drop(columns="total", errors="ignore").sum(axis=1)
# Keep the totals in their own variable; they are the divisor below
total_column_yes = heatmap_yes['total']

# Express each bin as a percentage of its country's total, then drop the helper column
df_percentage_yes = heatmap_yes.div(total_column_yes, axis=0) * 100
df_percentage_yes = df_percentage_yes.drop("total", axis = 1)
# Back to long form: one row per (Country, bin) holding its percentage
df_percentage_yes_reset = df_percentage_yes.reset_index()
df_percentage__yes_melted = df_percentage_yes_reset.melt(id_vars='Country', var_name='feelingsavg_binned', value_name='percentage')

In [None]:
# Make sure the bin labels are plain strings (not Interval objects) before they are
# turned into an ordered categorical.
df_percentage__yes_melted['feelingsavg_binned'] = df_percentage__yes_melted['feelingsavg_binned'].astype(str)

In [None]:
# Give the bin labels their numeric order. The labels are read from the bins that pd.cut
# actually produced for df['feelingsavg_binned'] instead of being typed in by hand:
# pd.Categorical silently turns any value that is not listed in `categories` into NaN,
# so hand-written labels break as soon as the data range (and with it the bin edges) changes.
feelings_bin_labels = [str(interval) for interval in df['feelingsavg_binned'].cat.categories]
df_percentage__yes_melted['feelingsavg_binned'] = pd.Categorical(
    df_percentage__yes_melted['feelingsavg_binned'],
    categories=feelings_bin_labels,
    ordered=True,
)

In [None]:
# Final layout for plotting: bins as rows, countries as columns, percentages as values
heatmap_percent_yes = df_percentage__yes_melted.pivot(
    index="feelingsavg_binned",
    columns="Country",
    values="percentage",
)

In [None]:
# Rows for respondents who answered 'No' to ASBH02A, without the now-constant ASBH02A column
df_no = (
    contingency_feelings_melted
    .loc[contingency_feelings_melted['ASBH02A'].eq('No')]
    .drop(columns='ASBH02A')
)
# Wide table of counts: one row per country, one column per bin
heatmap_no = df_no.pivot(
    index='Country',
    columns="feelingsavg_binned",
    values="count",
)

In [None]:
# Row totals (respondents per country). A "total" column left over from an earlier run
# of this cell is excluded before summing, so re-running the cell does not count the old
# total as data and halve every percentage.
heatmap_no["total"] = heatmap_no.drop(columns="total", errors="ignore").sum(axis=1)
# Keep the totals in their own variable; they are the divisor below
total_column_no = heatmap_no['total']

# Express each bin as a percentage of its country's total, then drop the helper column
df_percentage_no = heatmap_no.div(total_column_no, axis=0) * 100
df_percentage_no = df_percentage_no.drop("total", axis=1)
# Back to long form: one row per (Country, bin) holding its percentage
df_percentage_no_reset = df_percentage_no.reset_index()
df_percentage_no_melted = df_percentage_no_reset.melt(id_vars='Country', var_name='feelingsavg_binned', value_name='percentage')


In [None]:
# Make sure the bin labels are plain strings (not Interval objects)
df_percentage_no_melted['feelingsavg_binned'] = df_percentage_no_melted['feelingsavg_binned'].astype(str)

# Give the bin labels their numeric order. The labels are read from the bins that pd.cut
# actually produced for df['feelingsavg_binned'] instead of being typed in by hand:
# pd.Categorical silently turns any value that is not listed in `categories` into NaN,
# so hand-written labels break as soon as the data range (and with it the bin edges) changes.
feelings_bin_labels = [str(interval) for interval in df['feelingsavg_binned'].cat.categories]
df_percentage_no_melted['feelingsavg_binned'] = pd.Categorical(
    df_percentage_no_melted['feelingsavg_binned'],
    categories=feelings_bin_labels,
    ordered=True,
)

# Final layout for plotting: bins as rows, countries as columns, percentages as values
heatmap_percent_no = df_percentage_no_melted.pivot(index="feelingsavg_binned", columns="Country", values="percentage")


In [None]:
# Two panels side by side: percentage of respondents per feelings bin, by country
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Each panel pairs a percentage table with its title
panels = [
    (heatmap_percent_yes, 'Born in country'),
    (heatmap_percent_no, 'Not born in country'),
]
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    sns.heatmap(panel_data, ax=panel_ax, cmap='YlOrRd', annot=True, cbar=False)
    panel_ax.set_title(panel_title)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

In [None]:
# Percentage-point gap per bin and country: "Yes" table minus "No" table
diff_data = heatmap_percent_yes.sub(heatmap_percent_no)

# Diverging colour map centred on zero, so the sign of the gap is visible at a glance
sns.heatmap(diff_data, annot=True, center=0, cmap="coolwarm")

## Minimum age of arrival

Now I want to see how age of arrival affects the average feelings score. The `MINAGEARRIVAL` column was cleaned at the top of the notebook: missing values (students who were born in the country) are coded as -1.0.

In [None]:
# Mean feelings average per country (rows) and MINAGEARRIVAL value (columns);
# dropna=False keeps entries even when all of their values are NaN.
pivot_minage_feelings = df.pivot_table(
    index=['Country'],
    columns=['MINAGEARRIVAL'],
    values='feelingsavg',
    dropna=False,
)

In [None]:
# Display the country-by-arrival-age table of mean feelings averages.
pivot_minage_feelings

In [None]:
def process_pivot_table(pivot_table, group_by_column):
    """
    Reorder the rows of a pivot table so that the groups with the highest overall
    mean come first.

    For every distinct value of the index level ``group_by_column`` the column means
    are computed and then averaged across columns into a single score per group.
    Rows are returned group by group in descending order of that score; groups whose
    score is NaN are placed last. Rows belonging to the same group keep their
    original relative order.

    Parameters:
    pivot_table (pd.DataFrame): Table whose index (single- or multi-level) contains
        a level named ``group_by_column``.
    group_by_column (str): The name of the index level to group by.

    Returns:
    pd.DataFrame: The rows and columns of ``pivot_table``, reordered. No row is added
        or dropped, even when the index contains repeated labels.
    """
    # One score per group: mean of each column within the group, then mean across columns
    group_scores = pivot_table.groupby(level=group_by_column).mean().mean(axis=1)

    # Group labels from best to worst score; NaN scores go last and ties keep their order
    ranked_groups = group_scores.sort_values(ascending=False, kind="stable").index

    # Translate every row's group label into its rank (0 = first group to show)
    row_groups = pivot_table.index.get_level_values(group_by_column)
    rank_codes = pd.Categorical(
        row_groups, categories=ranked_groups, ordered=True
    ).codes.astype("int64")
    # A code of -1 marks a label with no rank (e.g. a missing label); send those rows last
    rank_codes[rank_codes < 0] = len(ranked_groups)

    # Reorder by position rather than by label: label-based selection would repeat rows
    # whenever the index holds duplicate labels. The stable sort preserves the original
    # order of rows inside each group.
    return pivot_table.iloc[rank_codes.argsort(kind="stable")]


In [None]:
# Order the countries by their overall mean across the arrival-age columns.
sorted_minage_feelings = process_pivot_table(pivot_minage_feelings,group_by_column='Country')

In [None]:
# Draw the sorted country-by-arrival-age table as an annotated heat map
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(sorted_minage_feelings, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Mean score in School Heat Map Sorted by Country and age of arrival in the country')
plt.show()

# Experience in School

## Experience in School Heatmap

In [7]:
# Keep only the identifier columns plus the ten negative-experience items.
df_experience = df.loc[:, [*identifier_columns, *negativeexperience_in_school_columns]]

In [None]:
# Mean response to each negative-experience item for every (Country, ASBH02A) combination.
pivot_experience = df_experience.pivot_table(
    index=identifier_columns,
    values=negativeexperience_in_school_columns,
    aggfunc='mean',
)

In [None]:
# Display the per-(Country, ASBH02A) item means.
pivot_experience

In [None]:
# Row-wise mean across the ten experience items: one overall value per (Country, ASBH02A) row
pivot_experience['MeanTotal'] = pivot_experience[negativeexperience_in_school_columns].mean(axis=1)

# Country names ordered from the highest to the lowest average MeanTotal
country_averages = (
    pivot_experience['MeanTotal']
    .groupby(level='Country')
    .mean()
    .sort_values(ascending=False)
    .index
    .tolist()
)

# Display order for the ASBH02A answers within each country
asbh02a_sorter = ['Yes', 'No']

In [None]:
# Copy the pivot's (Country, ASBH02A) index into a DataFrame so its levels can be
# sorted as ordinary columns.
index_experience = pivot_experience.index.to_frame()

In [None]:
# Suffix the column names so they do not clash with the index level names, turn both
# columns into ordered categoricals (countries in country_averages order, ASBH02A in
# asbh02a_sorter order) and sort by country first, then by ASBH02A.
index_experience = (
    index_experience
    .rename(columns={'Country': 'Country_', 'ASBH02A': 'ASBH02A_'})
    .assign(
        Country_=lambda frame: pd.Categorical(frame['Country_'], categories=country_averages, ordered=True),
        ASBH02A_=lambda frame: pd.Categorical(frame['ASBH02A_'], categories=asbh02a_sorter, ordered=True),
    )
    .sort_values(by=['Country_', 'ASBH02A_'])
)

# Pull the pivot rows out in the freshly sorted index order
sorted_pivot_df = pivot_experience.loc[index_experience.index]

In [None]:
# Draw the sorted (Country, ASBH02A) experience means as an annotated heat map
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(sorted_pivot_df, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Mean Experience in School Heat Map Sorted by Country and Immigration Status')
plt.show()

 1: At least once a week; 2: Once or twice a month; 3: A few times a year; 4: Never
 
ASBG11A	GEN\HOW OFTEN\MADE FUN OF\
ASBG11B	GEN\HOW OFTEN\LEFT OUT OF GAMES\
ASBG11C	GEN\HOW OFTEN\SPREADING LIES ABOUT ME\
ASBG11D	GEN\HOW OFTEN\STEALING STH FROM ME\
ASBG11E	GEN\HOW OFTEN\DAMAGED STH OF MINE\
ASBG11F	GEN\HOW OFTEN\HIT OR HURT ME\
ASBG11G	GEN\HOW OFTEN\MADE ME DO\
ASBG11H	GEN\HOW OFTEN\NASTY OR HURTFUL MESSAGES\
ASBG11I	GEN\HOW OFTEN\NASTY OR HURTFUL INFO\
ASBG11J	GEN\HOW OFTEN\THREATENED ME\

Darker colours are better: 4 means "Never", so a higher mean indicates fewer negative experiences.

## Experience Average Distribution

In [8]:
# Per-respondent average over the ten negative-experience items (row-wise mean).
df['experienceavg'] = df[negativeexperience_in_school_columns].mean(axis=1)

In [None]:
# Cut the averages into 10 bins; with an integer `bins`, pd.cut derives the bin edges
# from the observed range of the data.
df['experienceavg_binned'] = pd.cut(df['experienceavg'], bins=10)

In [None]:
# Count respondents per experience-average bin for every (ASBH02A, Country) combination
contingency_experience = pd.crosstab(
    index=[df['ASBH02A'], df['Country']],
    columns=df['experienceavg_binned'],
)

In [None]:
# Reshape the contingency table to long form: one row per (ASBH02A, Country, bin)
contingency_experience_reset = contingency_experience.reset_index()
contingency_experience_melted = (
    contingency_experience_reset
    .melt(
        id_vars=['ASBH02A', 'Country'],
        var_name='experienceavg_binned',
        value_name='count',
    )
    # Keep the bin labels as plain strings rather than Interval objects
    .assign(experienceavg_binned=lambda long_df: long_df['experienceavg_binned'].astype(str))
)

In [None]:
def _percent_by_country(counts_long, born_in_country, bin_column, bin_labels):
    """
    Build a bin-by-country table of percentages for one ASBH02A answer.

    Parameters:
    counts_long (pd.DataFrame): Long-form counts with the columns 'ASBH02A',
        'Country', ``bin_column`` and 'count'.
    born_in_country (str): The ASBH02A answer to keep ('Yes' or 'No').
    bin_column (str): Name of the column that holds the bin labels.
    bin_labels (list of str): All bin labels in display order.

    Returns:
    pd.DataFrame: Rows are the bins (as an ordered categorical), columns are the
        countries, and each value is the share (in percent) of that country's
        respondents falling into the bin.
    """
    # Wide table of counts: one row per country, one column per bin
    counts_wide = (
        counts_long.loc[counts_long['ASBH02A'] == born_in_country]
        .pivot(index='Country', columns=bin_column, values='count')
    )
    # Divide every row by its own total so that each country sums to 100
    percent_wide = counts_wide.div(counts_wide.sum(axis=1), axis=0) * 100
    # Back to long form so the bin labels can be given an explicit order
    percent_long = percent_wide.reset_index().melt(
        id_vars='Country', var_name=bin_column, value_name='percentage'
    )
    percent_long[bin_column] = pd.Categorical(
        percent_long[bin_column].astype(str), categories=bin_labels, ordered=True
    )
    # Final layout for plotting: bins as rows, countries as columns
    return percent_long.pivot(index=bin_column, columns='Country', values='percentage')


# Read the bin labels from the bins pd.cut actually produced rather than typing them in:
# pd.Categorical silently turns any label missing from `categories` into NaN, so
# hand-written labels break as soon as the data range (and with it the bin edges) changes.
experience_bin_labels = [str(interval) for interval in df['experienceavg_binned'].cat.categories]

# Percentage tables for the two ASBH02A answers
heatmap_percent_yes = _percent_by_country(
    contingency_experience_melted, 'Yes', 'experienceavg_binned', experience_bin_labels
)
heatmap_percent_no = _percent_by_country(
    contingency_experience_melted, 'No', 'experienceavg_binned', experience_bin_labels
)


In [None]:
# Two panels side by side: percentage of respondents per experience bin, by country
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Each panel pairs a percentage table with its title
panels = [
    (heatmap_percent_yes, 'Born in country'),
    (heatmap_percent_no, 'Not born in country'),
]
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    sns.heatmap(panel_data, ax=panel_ax, cmap='YlOrRd', annot=True, cbar=False)
    panel_ax.set_title(panel_title)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

In [None]:
# Percentage-point gap per bin and country: "Yes" table minus "No" table
diff_data = heatmap_percent_yes.sub(heatmap_percent_no)

# Diverging colour map centred on zero, so the sign of the gap is visible at a glance
sns.heatmap(diff_data, annot=True, center=0, cmap="coolwarm")

## Minimum age of arrival

Now I want to see how age of arrival affects the average experience score. The `MINAGEARRIVAL` column was cleaned at the top of the notebook: missing values (students who were born in the country) are coded as -1.0.

In [9]:
# Mean experience average per country (rows) and MINAGEARRIVAL value (columns);
# dropna=False keeps entries even when all of their values are NaN.
pivot_minage_experience = df.pivot_table(
    index=['Country'],
    columns=['MINAGEARRIVAL'],
    values='experienceavg',
    dropna=False,
)

In [10]:
# Display the country-by-arrival-age table of mean experience averages.
pivot_minage_experience

MINAGEARRIVAL,-1.0,0.0,3.0,6.0,8.0
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Austria,3.427577,3.279088,3.215143,3.30058,3.10298
Egypt,3.195749,3.125283,3.009965,3.203571,3.070602
France,3.629238,3.617415,3.549436,3.528798,3.619111
Germany,3.512689,3.402297,3.29558,3.326879,3.461111
Iran,,,,,
Jordan,3.492736,3.476802,3.440322,3.442966,3.376955
Netherlands,3.460112,3.3625,3.34375,3.387654,3.59
Sweden,3.544466,3.458985,3.395793,3.317535,3.232733
Turkey,,,,,


In [11]:
# Average the pivot rows per country, keeping one column per arrival-age value
country_mean_scores = pivot_minage_experience.groupby(level='Country').mean()

In [12]:
# Display the per-country means.
country_mean_scores

MINAGEARRIVAL,-1.0,0.0,3.0,6.0,8.0
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Austria,3.427577,3.279088,3.215143,3.30058,3.10298
Egypt,3.195749,3.125283,3.009965,3.203571,3.070602
France,3.629238,3.617415,3.549436,3.528798,3.619111
Germany,3.512689,3.402297,3.29558,3.326879,3.461111
Iran,,,,,
Jordan,3.492736,3.476802,3.440322,3.442966,3.376955
Netherlands,3.460112,3.3625,3.34375,3.387654,3.59
Sweden,3.544466,3.458985,3.395793,3.317535,3.232733
Turkey,,,,,


In [13]:
# Add a MeanTotal column: the row-wise mean across the arrival-age columns for each country
country_mean_scores['MeanTotal'] = country_mean_scores.mean(axis=1)

In [14]:
# Display the per-country means together with the new MeanTotal column.
country_mean_scores

MINAGEARRIVAL,-1.0,0.0,3.0,6.0,8.0,MeanTotal
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Austria,3.427577,3.279088,3.215143,3.30058,3.10298,3.265073
Egypt,3.195749,3.125283,3.009965,3.203571,3.070602,3.121034
France,3.629238,3.617415,3.549436,3.528798,3.619111,3.5888
Germany,3.512689,3.402297,3.29558,3.326879,3.461111,3.399711
Iran,,,,,,
Jordan,3.492736,3.476802,3.440322,3.442966,3.376955,3.445956
Netherlands,3.460112,3.3625,3.34375,3.387654,3.59,3.428803
Sweden,3.544466,3.458985,3.395793,3.317535,3.232733,3.389902
Turkey,,,,,,


In [15]:
# Country labels ordered by MeanTotal, highest first
sorted_countries_means = country_mean_scores.sort_values(by='MeanTotal', ascending=False).index

In [16]:
# Display the resulting country order.
sorted_countries_means

Index(['France', 'Jordan', 'Netherlands', 'Germany', 'Sweden', 'Austria',
       'Egypt', 'Iran', 'Turkey'],
      dtype='object', name='Country')

In [None]:


# Order the countries by their overall mean across the arrival-age columns (highest
# first). This reuses process_pivot_table, defined earlier in the notebook, instead of
# repeating its index-sorting steps inline.
sorted_pivot_df = process_pivot_table(pivot_minage_experience, group_by_column='Country')

In [None]:
# Draw the sorted country-by-arrival-age table as an annotated heat map
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(sorted_pivot_df, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title('Mean score in School Heat Map Sorted by Country and age of arrival in the country')
plt.show()

Next steps: comparisons with benchmarks — is there a through line?
I should compare reading scores with the benchmark, and can also compare them with previous years.
If I get data from different dates, I can practise working with dates and times. Also change the ASBH02A values to immigrant / non-immigrant.