In [1]:
import pandas as pd
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv('Data_cleaned.csv')

In [3]:
data.head()

Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I worry when I am at school]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I get very angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I lose my temper]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I hit out when I am angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night
0,"No, I am at home",Yes,4,Year 5,Boy,Toast,2,2,7 days,7 days,...,Never,Never,Never,Never,Sometimes,Always,Never,9,Yes,8.5
1,"No, I am at home",Yes,4,Year 4,Girl,"Healthy cereal e.g. porridge, weetabix, readyb...",1,2,7 days,7 days,...,Never,Never,Never,Sometimes,Never,Sometimes,Never,Yes,Yes,10.0
2,"No, I am at home",Yes,5,Year 5,Boy,Chocolate finger,2 Or More Fruit and Veg,2,7 days,7 days,...,Never,Never,Never,Sometimes,Never,Sometimes,Never,Yes,Yes,8.0
3,"No, I am at home",Yes,4,Year 4,Boy,Toast,2 Or More Fruit and Veg,2,7 days,1-2 days,...,Never,Never,Never,Never,Never,Sometimes,Never,Yes,Yes,12.0
4,"No, I am at home",Yes,5,Year 4,Girl,"Healthy cereal e.g. porridge, weetabix, readyb...",2 Or More Fruit and Veg,1,7 days,1-2 days,...,Never,Never,Never,Never,Never,Always,Never,Yes,Yes,11.0


### Encode the categorical columns

In [4]:
# Define columns related to "Me and My Feelings" (Q24)
me_and_my_feelings_columns = [
    col for col in data.columns if "24. Remember, there are no right or wrong answers" in col
]



In [5]:
# Custom label encoding for "Me and My Feelings"
for column in me_and_my_feelings_columns:
    if "I am calm" in column:  # Reverse scoring for "I am calm"
        reverse_mapping = {"Never": 2, "Sometimes": 1, "Always": 0}
        data[column] = data[column].map(reverse_mapping)
    else:  # Standard scoring for other items
        standard_mapping = {"Never": 0, "Sometimes": 1, "Always": 2}
        data[column] = data[column].map(standard_mapping)



In [6]:
# Standard label encoding for other categorical variables
# Identify all remaining categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns.difference(me_and_my_feelings_columns)



In [7]:
# Apply standard label encoding
label_encoder = LabelEncoder()
for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column].astype(str))



In [8]:
# Save the encoded dataset to a new CSV file
data.to_csv('Data_Encoded.csv', index=False)

print(f"Encoded dataset saved")


Encoded dataset saved


### Calculating Emotional Difficulties subscale and Behavioural Difficulties subscale

In [9]:
# Define emotional difficulty columns from "Me and My Feelings" Questionnaire
emotional_difficulty_columns = [
    col for col in me_and_my_feelings_columns if any(keyword in col for keyword in [
        "I feel lonely", "I cry a lot", "I worry a lot", "I feel nobody likes me", "I am unhappy"
    ])
]

In [10]:
# Define behavioral difficulty columns from "Me and My Feelings" Questionnaire
behavioral_difficulty_columns = [
    col for col in me_and_my_feelings_columns if any(keyword in col for keyword in [
        "I lose my temper", "I do things to hurt people", "I hit out when I am angry",
        "I get very angry", "I break things on purpose", "I worry when I am at school", "I am shy"
    ])
]



In [11]:
# Calculate emotional difficulty subscale
data['emotional difficulty subscale'] = data[emotional_difficulty_columns].sum(axis=1)



In [12]:
# Calculate behavioral difficulty subscale
data['behavioral difficulty subscale'] = data[behavioral_difficulty_columns].sum(axis=1)



In [13]:
# Verify the new columns in the dataset
new_columns_preview = data[['emotional difficulty subscale', 'behavioral difficulty subscale']].head()

In [14]:
new_columns_preview

Unnamed: 0,emotional difficulty subscale,behavioral difficulty subscale
0,1,2
1,3,2
2,0,2
3,2,1
4,1,0


### Score categorization (cut-offs):

In [15]:
# Categorize 'emotional difficulty subscale'
def categorize_emotional_difficulty(score):
    if score <= 9:
        return 'expected'
    elif score in [10, 11]:
        return 'borderline difficulty'
    else:
        return 'significant difficulty'

In [16]:
# Categorize 'behavioral difficulty subscale'
def categorize_behavioral_difficulty(score):
    if score <= 5:
        return 'expected'
    elif score == 6:
        return 'borderline difficulty'
    else:
        return 'significant difficulty'

In [17]:
# Apply categorization to the dataset
data['emotional difficulty category'] = data['emotional difficulty subscale'].apply(categorize_emotional_difficulty)
data['behavioral difficulty category'] = data['behavioral difficulty subscale'].apply(categorize_behavioral_difficulty)


In [18]:
# Verify the new categorized columns
categorized_columns_preview = data[['emotional difficulty category', 'behavioral difficulty category']].head()

In [19]:
categorized_columns_preview

Unnamed: 0,emotional difficulty category,behavioral difficulty category
0,expected,expected
1,expected,expected
2,expected,expected
3,expected,expected
4,expected,expected


In [20]:
data.head()

Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night,emotional difficulty subscale,behavioral difficulty subscale,emotional difficulty category,behavioral difficulty category
0,1,1,3,2,0,122,2,2,4,4,...,1,0,0,0,1,8.5,1,2,expected,expected
1,1,1,3,1,1,55,0,2,4,4,...,0,1,0,2,1,10.0,3,2,expected,expected
2,1,1,4,2,0,21,3,2,4,4,...,0,1,0,2,1,8.0,0,2,expected,expected
3,1,1,3,1,0,122,3,2,4,1,...,0,1,0,2,1,12.0,2,1,expected,expected
4,1,1,4,1,1,55,3,1,4,1,...,0,0,0,2,1,11.0,1,0,expected,expected


In [21]:
# Save the dataset
data.to_csv('difficulties_categorized.csv')

#### Drop emotional difficulty subscale and behavioral difficulty subscale columns

In [22]:
# Drop the specified columns
data = data.drop(columns=['emotional difficulty subscale', 'behavioral difficulty subscale'], errors='ignore')

# Verify the updated dataset
data.head()


Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I lose my temper]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I hit out when I am angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night,emotional difficulty category,behavioral difficulty category
0,1,1,3,2,0,122,2,2,4,4,...,0,0,1,0,0,0,1,8.5,expected,expected
1,1,1,3,1,1,55,0,2,4,4,...,0,1,0,1,0,2,1,10.0,expected,expected
2,1,1,4,2,0,21,3,2,4,4,...,0,1,0,1,0,2,1,8.0,expected,expected
3,1,1,3,1,0,122,3,2,4,1,...,0,0,0,1,0,2,1,12.0,expected,expected
4,1,1,4,1,1,55,3,1,4,1,...,0,0,0,0,0,2,1,11.0,expected,expected


#### Label encode the columns  'emotional difficulty category'	'behavioral difficulty category'

In [23]:
# from sklearn.preprocessing import LabelEncoder

# # Label encode the specified columns
# category_columns = ['emotional difficulty category', 'behavioral difficulty category']

# # Initialize the LabelEncoder
# label_encoder = LabelEncoder()

# # Apply label encoding
# for column in category_columns:
#     data[column] = label_encoder.fit_transform(data[column].astype(str))


In [24]:
# Define the mapping for the categories
category_mapping = {
    'expected': 0,
    'borderline difficulty': 1,
    'significant difficulty': 2
}

# List of columns to apply the mapping to
category_columns = ['emotional difficulty category', 'behavioral difficulty category']

# Apply the mapping to both columns
for column in category_columns:
    data[column] = data[column].map(category_mapping)

In [25]:
# Verify the encoding
data.head()

Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I lose my temper]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I hit out when I am angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night,emotional difficulty category,behavioral difficulty category
0,1,1,3,2,0,122,2,2,4,4,...,0,0,1,0,0,0,1,8.5,0,0
1,1,1,3,1,1,55,0,2,4,4,...,0,1,0,1,0,2,1,10.0,0,0
2,1,1,4,2,0,21,3,2,4,4,...,0,1,0,1,0,2,1,8.0,0,0
3,1,1,3,1,0,122,3,2,4,1,...,0,0,0,1,0,2,1,12.0,0,0
4,1,1,4,1,1,55,3,1,4,1,...,0,0,0,0,0,2,1,11.0,0,0


#### Create datasets for Emotional difficulty and Behavioral difficulty

In [26]:
# Create Emotional Difficulty DataFrame by excluding 'behavioral difficulty category'
emotional_difficulty_df = data.drop(columns=['behavioral difficulty category'], errors='ignore')

# Create Behavioral Difficulty DataFrame by excluding 'emotional difficulty category'
behavioral_difficulty_df = data.drop(columns=['emotional difficulty category'], errors='ignore')

In [27]:
# Verify the new datasets
emotional_difficulty_df.head()

Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I get very angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I lose my temper]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I hit out when I am angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night,emotional difficulty category
0,1,1,3,2,0,122,2,2,4,4,...,0,0,0,1,0,0,0,1,8.5,0
1,1,1,3,1,1,55,0,2,4,4,...,0,0,1,0,1,0,2,1,10.0,0
2,1,1,4,2,0,21,3,2,4,4,...,0,0,1,0,1,0,2,1,8.0,0
3,1,1,3,1,0,122,3,2,4,1,...,0,0,0,0,1,0,2,1,12.0,0
4,1,1,4,1,1,55,3,1,4,1,...,0,0,0,0,0,0,2,1,11.0,0


In [28]:
# Verify the new datasets
behavioral_difficulty_df.head()

Unnamed: 0,Are you still going to school?,Do you have any other children living in your house with you?,How many people live in your home with you (including adults)?,What year are you in now?,Gender,1. What did you eat for breakfast YESTERDAY?,2. Did you eat any fruit and vegetables YESTERDAY?,3. How many times did you brush your teeth YESTERDAY?,"6. In the last 7 days, how many days did you do sports or exercise for at least 1 hour in total. This includes doing any activities (including online activities) or playing sports where your heart beat faster, you breathed faster and you felt warmer?","7. In the last 7 days, how many days did you watch TV/play online games/use the internet etc. for 2 or more hours a day (in total)?",...,"24. Remember, there are no right or wrong answers, just pick which is right for you. [I get very angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I lose my temper]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I hit out when I am angry]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I do things to hurt people]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I am calm]","24. Remember, there are no right or wrong answers, just pick which is right for you. [I break things on purpose]","25. Are you able to keep in touch with your family that you don't live with? (grand parents, Uncle, Aunt, Cousins, etc)",26. Are you able to keep in touch with your friends?,hours of sleep had last night,behavioral difficulty category
0,1,1,3,2,0,122,2,2,4,4,...,0,0,0,1,0,0,0,1,8.5,0
1,1,1,3,1,1,55,0,2,4,4,...,0,0,1,0,1,0,2,1,10.0,0
2,1,1,4,2,0,21,3,2,4,4,...,0,0,1,0,1,0,2,1,8.0,0
3,1,1,3,1,0,122,3,2,4,1,...,0,0,0,0,1,0,2,1,12.0,0
4,1,1,4,1,1,55,3,1,4,1,...,0,0,0,0,0,0,2,1,11.0,0


In [29]:
# Save the dataframe for building a machine learning model for each.
emotional_difficulty_df.to_csv('emotional_difficulty.csv', index = False)

In [30]:
behavioral_difficulty_df.to_csv('behavioral_difficulty.csv', index = False)

In [31]:
# End