In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
df= pd.read_excel('cleandata99.xlsx', index_col=0)

I want to check all the unique values for each of the columns in my data set so that they are uniform (not mixed, strings with floats) and to remove any values that are not useful. 

In [3]:
# Apply unique() to each column
unique_values = df.apply(lambda col: col.unique())

print(unique_values)

Country          [Turkey, Austria, Germany, Egypt, France, Iran...
IDSTUD           [50010601, 50010602, 50010603, 50010604, 50010...
ASBH02A                                            [1, 2, Yes, No]
ASBH02B                                  [nan, 2.0, 1.0, 3.0, 4.0]
ASBH03A                                    [1, 2, 9, Yes, No, nan]
ASBH04           [1, 2, 9, 4, 3, Always, Never, Almost always, ...
ASBH15A          [99, 5, 2, 4, 7, 1, 6, 3, 8, 10, 9, <Master’s ...
ASBH15B          [7, 3, 99, 6, 4, 5, 8, 2, 1, 10, 9, <Master’s ...
ASBH16           [6, 3, 5, 4, 9, 2, 1, Finish <Short-cycle tert...
ASBH17A          [99, 1, 3, 4, 10, 12, 11, 8, 2, 7, 6, 5, 9, Co...
ASBH17B          [9, 2, 99, 5, 3, 6, 12, 4, 10, 8, 11, 7, 1, Co...
ASBH18AA                                [9, 1, 2, 6, Yes, No, nan]
ASBH18AB                                [1, 9, 2, 6, Yes, No, nan]
ASBG01                             [2, 1, nan, Girl, Boy, <Other>]
ASBG03           [1, 3, 2, 9, 4, nan, I always speak <language

I am grouping the columns into 5 groups for better understanding of the data. These are: identifier_columns, demographic_info_columns, positive_feelings_in_school, negativeexperience_in_school_columns, assessment_score_columns.

In [4]:
identifier_columns = ['Country','ASBH02A']
demographic_info_columns = ['ASBH02B', 'ASBH03A', 'ASBH04', 'ASBH15A', 'ASBH15B', 'ASBH16', 'ASBH17A', 'ASBH17B', 'ASBH18AA', 'ASBH18AB', 'ASBG01', 'ASBG03', 'ASDAGE','MINAGEARRIVAL' ]
positive_feelings_in_school = ['ASBG10A', 'ASBG10B','ASBG10C', 'ASBG10D', 'ASBG10E', 'ASBG10F']
negativeexperience_in_school_columns = ['ASBG11A', 'ASBG11B', 'ASBG11C', 'ASBG11D', 'ASBG11E', 'ASBG11F', 'ASBG11G', 'ASBG11H', 'ASBG11I', 'ASBG11J']
assessment_score_columns = ['ASRREA01', 'ASRREA02', 'ASRREA03', 'ASRREA04', 'ASRREA05', 'ASRLIT01', 'ASRLIT02', 'ASRLIT03', 'ASRLIT04', 'ASRLIT05', 'ASRINF01', 'ASRINF02', 'ASRINF03', 'ASRINF04', 'ASRINF05', 'ASRIIE01', 'ASRIIE02', 'ASRIIE03', 'ASRIIE04', 'ASRIIE05', 'ASRRSI01', 'ASRRSI02', 'ASRRSI03', 'ASRRSI04', 'ASRRSI05']

All the negativeexperience_in_school_columns are rated on the same scale. So I want to remove all the string values and assign them their associated integer value. 

In [5]:
ordinal_mapping_frequency = {
    'At least once a week': 1,
    'Once or twice a month': 2,
    'A few times a year': 3,
    'Never': 4
}

In [7]:
for column in negativeexperience_in_school_columns:
    df[column]= df[column].map(ordinal_mapping_frequency)

In [None]:
pd.set_option('display.max_colwidth', None)  # Optional: Set max column width
pd.set_option('display.expand_frame_repr', False)  # Optional: Do not wrap line

In [8]:
# Apply unique() to each column
unique_values_school_experience = df[negativeexperience_in_school_columns].apply(lambda col: col.unique())

print(unique_values_school_experience)

   ASBG11A  ASBG11B  ASBG11C  ASBG11D  ASBG11E  ASBG11F  ASBG11G  ASBG11H  \
0      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN   
1      4.0      4.0      3.0      2.0      4.0      4.0      4.0      3.0   
2      2.0      2.0      1.0      4.0      3.0      3.0      2.0      4.0   
3      3.0      3.0      2.0      3.0      2.0      2.0      3.0      1.0   
4      1.0      1.0      4.0      1.0      1.0      1.0      1.0      2.0   

   ASBG11I  ASBG11J  
0      NaN      NaN  
1      4.0      4.0  
2      1.0      3.0  
3      3.0      1.0  
4      2.0      2.0  


All the positive_feelings_in_school are rated on the same scale. So I want to remove all the string values and assign them their associated integer value.

In [9]:
ordinal_mapping_agreement = { 
    'Agree a lot': 1,
    'Agree a little': 2,
    'Disagree a little': 3,
    'Disagree a lot': 4
}

In [10]:
for column in positive_feelings_in_school:
    df[column]= df[column].map(ordinal_mapping_agreement)

In [11]:
# Apply unique() to each column
unique_values_school_feeling = df[positive_feelings_in_school].apply(lambda col: col.unique())

print(unique_values_school_feeling)

   ASBG10A  ASBG10B  ASBG10C  ASBG10D  ASBG10E  ASBG10F
0      NaN      NaN      NaN      NaN      NaN      NaN
1      2.0      3.0      2.0      2.0      3.0      1.0
2      1.0      2.0      1.0      1.0      1.0      2.0
3      3.0      1.0      3.0      3.0      2.0      3.0
4      4.0      4.0      4.0      4.0      4.0      4.0


## Not sure I want to do the below

This is to assign the 'ASBH02A', 1.0s with Yes meaning 'born in country'.

In [12]:
# Mapping dictionary
update_dict2 = {1.0: 'Yes',2.0:'No' }

# Updating the column using map
df['ASBH02A'] = df['ASBH02A'].map(update_dict2).fillna(df['ASBH02A'])
df['ASBH02A'].value_counts()

Yes    38475
No      2890
Name: ASBH02A, dtype: int64

In [13]:
df.to_excel('cleandata109.xlsx')

In [None]:
# Updating the column using map
df['ASBH03A'] = df['ASBH03A'].map(update_dict2).fillna(df['ASBH03A'])
df['ASBH03A'].value_counts()