## Setup and data loading 


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import copy





In [2]:
# Column names applied to the CSV (the first three lines of the file are skipped).
colnames = [
    "Tijdstempel", "programme", "mlcourse", "ircourse", "statcourse", "dbcourse",
    "gender", "chatGPT", "birthday", "studentestimate",
    "stand", "stress", "sporthours", "random", "bedtime", "goodday1", "goodday2",
]
dataset = pd.read_csv("./dataset/ODI-2024.csv", skiprows=3, names=colnames)
# The timestamp column is dropped right after loading.
dataset = dataset.drop(columns="Tijdstempel")

# Cast the answer columns below to categoricals (dbcourse keeps its parsed dtype).
for categorical_column in ("mlcourse", "ircourse", "statcourse", "gender", "chatGPT", "stand"):
    dataset[categorical_column] = dataset[categorical_column].astype("category")

# Independent snapshot that debugger() compares the working frame against.
backup = copy.deepcopy(dataset)

def debugger(key,key2=None):
    """Print column ``key`` of the global ``dataset``, one value per line.

    When ``key2`` is truthy, every line instead pairs the value taken from
    ``backup[key2]`` with the value from ``dataset[key]`` in the form
    ``original => edited``. The two columns are paired positionally by ``zip``.
    """
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        if not key2:
            for value in dataset[key]:
                print(f"{value}")
            return
        for current, previous in zip(dataset[key], backup[key2]):
            print(f"{previous} => {current}")

#dataset.head()
# Row count right after loading; later cells print `original_number - len(dataset)`
# to report how many rows have been dropped so far.
original_number=len(dataset)


## Data cleaning

In [3]:
#program generalize
def generalizeProgramme(x):
    """Map a free-text study-programme answer to a canonical programme label.

    The answer is lower-cased and padded with one space on each side before
    matching, so the whole-word rules (" ai ", " cs ") also fire when the
    abbreviation is the first or last word. Passing text that is already
    lower-cased and padded gives the same result. Rules are tried in the order
    written below and the first hit wins.

    Parameters
    ----------
    x : str
        Programme answer.

    Returns
    -------
    str or None
        The canonical label, or None when no rule matches or when ``x`` is not
        a string (for example a missing value).
    """
    if not isinstance(x, str):
        # Missing answers arrive as float NaN; substring tests on them would raise.
        return None
    x = f" {x.lower()} "

    if "bioinformatics" in x or "bisb" in x:
        return "bioinformatics and systems biology"
    if "econometrics" in x:
        return "econometrics and data science"
    if "fintech" in x:
        return "financial technology"
    if "computational science" in x:
        return "computational science"
    if "human language technology" in x:
        return "human language technology"
    if "business analytics" in x:
        return "business analytics"
    if "computational finance" in x:
        return "computational finance"
    if "big data" in x:
        if "engineering" in x:
            return "big data engineering"
        return "big data"
    if "political data journalism" in x:
        return "political data journalism"
    if "quantitative risk management" in x:
        return "quantitative risk management"
    if "software engineering" in x:
        return "software engineering"
    # NOTE(review): "cls", "ba" and "mpa" are plain substring tests, so they also
    # match inside longer words (e.g. "ba" inside "mba"); the labels they map to
    # should be confirmed against the raw answers.
    if "cls" in x:
        return "critical language scholarship"
    if " ai " in x or "artificial intelligence" in x:
        if "health" in x:
            return "ai for health"
        return "artificial intelligence"
    if " cs " in x or "computer science" in x:
        return "computer science"
    if "ba" in x:
        return "bachelor of arts"
    if "mpa" in x:
        return "public administration"
    return None
    

# Lower-case and space-pad every answer, map it to its canonical label, label the
# answers no rule recognised as 'unknown', and store the result as a categorical.
programme_labels = (
    dataset["programme"]
    .map(lambda raw: generalizeProgramme(f" {raw.lower()} "))
    .fillna('unknown')
    .astype("category")
)
dataset["programme"] = programme_labels
# Refresh the snapshot that debugger() compares against.
backup = copy.deepcopy(dataset)
dataset.head()
rows_dropped = original_number-len(dataset)
print("dropped: ",rows_dropped)


dropped:  0


In [4]:
#stress
# Deep copy of the frame taken before the stress clean-up.
# NOTE(review): `tmp` is not referenced anywhere else in the visible notebook.
tmp = copy.deepcopy(dataset)
def stress_outlier(dataset, column='stress', whisker=1.5):
    """Replace IQR outliers in ``column`` with the mean of the non-outlier values.

    A value is an outlier when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``. Missing values are neither outliers nor part of the
    mean, so they are left untouched.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding a numeric ``column``. It is modified in place.
    column : str, default 'stress'
        Name of the column to clean.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the bounds.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, returned for convenience.
    """
    values = dataset[column]

    # Interquartile range and the derived acceptance bounds
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    is_inlier = (values >= lower_bound) & (values <= upper_bound)

    # Mean of the values that stay within the bounds
    replacement = values[is_inlier].mean()

    # A positional boolean mask is used rather than index labels, so rows that
    # merely share an index label with an outlier are not overwritten.
    dataset.loc[is_outlier.to_numpy(), column] = replacement

    return dataset
    
def categorize_stress_level(stress):
    """Bucket a numeric stress score into 'high', 'medium' or 'low'.

    Scores of 80 and above are 'high', scores from 40 up to (not including) 80
    are 'medium', and everything below 40 is 'low'. A missing score yields NaN
    rather than falling through to 'low'.
    """
    if pd.isna(stress):
        return np.nan
    if stress >= 80:
        return 'high'
    if stress >= 40:
        return 'medium'
    return 'low'
# Replace the stress outliers with the non-outlier mean, then label every row
# with its stress bucket.
dataset = stress_outlier(dataset)
stress_labels = dataset['stress'].apply(categorize_stress_level)
dataset['stress_level'] = stress_labels
#debugger('stress','stress')

rows_dropped = original_number-len(dataset)
print("dropped: ",rows_dropped)


dropped:  0


In [5]:
#print(dataset.columns)
#sporthours
# Keep the first run of digits found in each answer; answers without any digit
# become 0. The pattern is a raw string because '\d' in a plain string literal
# is an invalid escape sequence.
dataset['sporthours'] = dataset['sporthours'].astype(str).str.extract(r'(\d+)').astype(float).fillna(0).astype(int)
#----
# Calculate Z-score
mean = dataset['sporthours'].mean()
std = dataset['sporthours'].std()
dataset['z_score'] = (dataset['sporthours'] - mean) / std

# Replace extreme outliers (|z| > threshold) with max(mean - 1, 0).
# The replacement is generally fractional, so the column is cast to float
# explicitly instead of relying on an implicit int -> float upcast on assignment.
threshold = 3
dataset['sporthours'] = dataset['sporthours'].astype(float)
dataset.loc[dataset['z_score'].abs() > threshold, 'sporthours'] = max(mean - 1,0)

# Drop the helper column now that the outliers have been replaced
dataset.drop(columns=['z_score'], inplace=True)
#-----

#student estimate
# Non-numeric answers become 0; every estimate is then rounded to a multiple of 20.
dataset['studentestimate'] = pd.to_numeric(dataset['studentestimate'], errors='coerce').fillna(0)
dataset['studentestimate'] = dataset['studentestimate'].apply(lambda x: round(x / 20) * 20)
outlier_threshold = 600
# Keep only the rows whose estimate does not exceed the threshold. `.copy()` makes
# the result an independent frame, so the column assignments below do not write
# into a slice of the previous frame.
dataset = dataset[dataset['studentestimate'] <= outlier_threshold].copy()
dataset['random'] = pd.to_numeric(dataset['random'], errors='coerce').fillna(0)

# ircourses
# Rewrite the 0/1 answers as no/yes.
dataset['ircourse'] = dataset['ircourse'].str.replace('0','no').str.replace('1','yes')

# Estimate
def clean_and_convert_estimate(value):
    """Convert a free-form estimate to an integer.

    Every character other than a digit or a hyphen is removed from
    ``str(value)``. If a hyphen remains, the text is read as a ``start-end``
    range and the floor of the average of both ends is returned; otherwise the
    remaining digits are returned as one integer.

    Returns None when the cleaned text cannot be parsed, which includes empty
    text, a leading or trailing hyphen, and more than one hyphen.

    NOTE(review): decimal points are stripped as well, so "1.5" is read as 15.
    """
    cleaned_value = re.sub(r'[^\d-]+', '', str(value))

    try:
        if '-' in cleaned_value:
            # Unpacking happens inside the try block so that inputs with several
            # hyphens ("1-2-3") yield None instead of raising.
            start, end = cleaned_value.split('-')
            return (int(start) + int(end)) // 2  # Floor of the average of the range
        return int(cleaned_value)
    except ValueError:
        return None

# NOTE(review): earlier in this cell studentestimate was already coerced to numbers
# and rounded to multiples of 20, so range answers such as "200-300" never reach
# clean_and_convert_estimate here — confirm whether this call was meant to run first.
dataset['studentestimate'] = dataset['studentestimate'].map(clean_and_convert_estimate)
#debugger('studentestimate','studentestimate')
#dataset.head()
print("dropped: ",original_number-len(dataset))

dropped:  2


  dataset.loc[dataset['z_score'].abs() > threshold, 'sporthours'] = max(mean - 1,0)


In [6]:
#bedtime

def clean_time(x):
    """Extract the hour part of a free-text bedtime answer.

    Patterns are tried in order (``hh:mm``, ``hh[.mm] am/pm``, bare ``hh``) and
    the hour digits of the first pattern that matches are returned as a string,
    with '24' rewritten to '00'. Only the digits are returned, including for
    am/pm answers, so the result can always be passed to ``int()``.

    Returns NaN for a missing input and "0" when the text contains no digits.
    """
    if pd.isna(x):
        return np.nan
    x = str(x)
    # Each pattern captures only the hour in group 1.
    patterns = [
        r'(\d{1,2}):\d{2}',                # hh:mm format
        r'(\d{1,2})(?:\.\d{2})?\s*[ap]m',  # hh[.mm] am/pm format (also covers "hh am/pm")
        r'(\d{1,2})'                       # hh format
    ]
    # Search for a time value in the string using each pattern
    for pattern in patterns:
        match = re.search(pattern, x, flags=re.IGNORECASE)
        if match:
            return match.group(1).replace('24', '00')
    #return np.nan
    return "0"

# Normalise an hour (or hh:mm) string to a two-character hour string
def format_time(x):
    """Return the zero-padded hour of ``x``, shifting hours 9-12 forward by 12.

    ``x`` is split on ':'; with one or two parts the first part is the hour and
    any minute part is ignored. Hours from 9 to 12 inclusive are treated as pm
    and returned as 21-24. NaN is returned for a missing input and for text
    with more than one ':'.
    """
    if pd.isna(x):
        return np.nan
    pieces = x.split(':')
    if len(pieces) not in (1, 2):
        return np.nan
    hour = pieces[0].zfill(2)
    if 9 <= int(hour) <= 12:  # treated as a pm time
        hour = str(int(hour) + 12)
    return f"{hour}"


# Strip the am/pm markers from the raw text, then reduce each answer to its hour.
# format_time is still the hour normaliser defined above at this point; it is
# redefined further down in this cell.
bedtime_text = dataset['bedtime'].astype(str)
for marker in ('AM', 'PM', 's morgens', 'am', 'pm'):
    bedtime_text = bedtime_text.str.replace(marker, '')
dataset['bedtime'] = bedtime_text
dataset['bedtime'] = dataset['bedtime'].apply(clean_time)
dataset['bedtime'] = dataset['bedtime'].apply(format_time)
def format_time(time):
    """Return ``time`` with every '24' rewritten as '00'.

    This definition replaces the earlier ``format_time`` in the same cell.
    """
    return time.replace('24', '00') if '24' in time else time

# Second pass, using the format_time defined just above: rewrite hour '24' as '00'.
dataset['bedtime'] = dataset['bedtime'].apply(format_time)

#debugger('bedtime','bedtime')
print("dropped: ",original_number-len(dataset))



dropped:  2


In [7]:
#birthday
def year(x):#losing 2 real years rows
    """Extract a birth year between 1980 and 2003 from a free-text birthday.

    The first run of 4 to 8 digits is located and its last four digits are taken
    as the candidate. If the candidate ends in 80-99, it is rewritten as
    19xx (so a ddmmyy-style "120398" yields "1998").

    ``x`` is converted with ``str()`` first, so missing values are handled the
    same way as in ``month``.

    Returns
    -------
    str or float
        The four-digit year as a string, or NaN when no digit run is found or
        the candidate falls outside 1980-2003.
    """
    x = str(x)
    # A single pattern suffices: any standalone 4-digit number is also a run of
    # 4 to 8 digits.
    match = re.search(r'(\d{4,8})', x)
    if not match:
        return np.nan

    val = match.group(1)[-4:]
    if 80 <= int(val[-2:]) <= 99:
        val = '19' + val[-2:]

    return val if 1980 <= int(val) <= 2003 else np.nan
    
def month(x):#loses 3
    """Extract the birth month from a free-text birthday as an unpadded string.

    Resolution order:

    1. An English month abbreviation ("jan" ... "dec") found anywhere in the
       lower-cased text.
    2. A run of 6 to 8 digits. For 8 digits the candidates are characters 3-4,
       then characters 1-2; otherwise the last two, then the first two.
    3. The first ``<1-4 digits><separator><1-2 digits>`` group, with the last
       two and then the first two characters as candidates.

    The first candidate that is a number from 1 to 12 wins. The result has no
    leading zero ("3", never "03"), whichever route produced it.

    Returns
    -------
    str or float
        "1" ... "12", or NaN when no valid month can be found.
    """
    x = str(x)
    patterns = [
        r'((\d{6,8}))',
        r'(\d{1,4}[-,\/,\.,\ ]\d{1,2})'
    ]
    month_by_name = {
        "jan": "01",    "feb": "02",    "mar": "03",
        "apr": "04",    "may": "05",    "jun": "06",
        "jul": "07",    "aug": "08",    "sep": "09",
        "oct": "10",    "nov": "11",    "dec": "12"}

    lowered = x.lower()
    for abbreviation, number in month_by_name.items():
        if abbreviation in lowered:
            # Same unpadded form as the numeric routes below
            return str(int(number))

    for pattern in patterns:
        match = re.search(pattern, x, flags=re.IGNORECASE)
        if not match:
            continue
        token = match.group(1)
        if len(token) == 8:
            candidates = (token[2:4], token[0:2])
        else:
            candidates = (token[-2:], token[:2])
        for candidate in candidates:
            # Remove whichever separator was caught in the two-character slice
            digits = re.sub(r'\D', '', candidate)
            if digits and 1 <= int(digits) <= 12:
                return str(int(digits))
        # The first matching pattern decides; neither candidate was a valid month
        return np.nan

    return np.nan
    
dataset['byear'] = dataset['birthday'].apply(year)
dataset['bmonth'] = dataset['birthday'].apply(month)


#Byear data filling
dataset['byear'] = pd.to_numeric(dataset['byear'], errors='coerce')

# Median birth year per programme, rounded. `observed=True` restricts the grouping
# to the programmes that actually occur and avoids the pandas warning about the
# changing default for categorical groupers.
average_byear_by_programme = dataset.groupby('programme', observed=True)['byear'].median().round().astype('Int64')

# Fill every missing 'byear' with the rounded median of the row's own programme
# (vectorised replacement for a row-by-row loop).
programme_median_byear = dataset.groupby('programme', observed=True)['byear'].transform('median').round()
dataset['byear'] = dataset['byear'].fillna(programme_median_byear)

# Rows whose programme has no known birth year at all fall back to 2000;
# the column is then converted to int.
dataset['byear'] = pd.to_numeric(dataset['byear'], errors='coerce').fillna(2000).astype(int)

#debugger('byear','birthday')
#debugger('bmonth','birthday')
backup=copy.deepcopy(dataset)

  average_byear_by_programme = dataset.groupby('programme')['byear'].median().round().astype('Int64')


In [8]:
# Write the cleaned frame to disk, without the index column.
dataset.to_csv('./dataset/normalized.csv', index=False)


# Feature engineering

"Grouping them into higher level product categories (e.g. a Pizza Margherita and Pizza Quattro Formaggi)" (cit.)
With this code, we derive the following features:
1. "activity_intensity": buckets "sporthours" into three levels of intensity: low, moderate, and high.
2. "bedtime_segment": buckets "bedtime" into six segments: Late Night, Early Morning, Morning, Afternoon, Evening, and Night.
3. "knowledge": sums the scores (positive/negative/unknown) of mlcourse, ircourse, statcourse, and dbcourse.
4. "mental_age": combines byear, stress, and sporthours to estimate the "mental" age of a student. "mental_biological_age" is then match, younger, or older, depending on how the mental age compares with the biological age (because college kills your brain).

In [9]:
#sport hours grouping
def categorize_intensity(hours):
    """Bucket weekly sport hours into 'low', 'moderate' or 'high'.

    Fewer than 3 hours is 'low', 3 up to (not including) 7 is 'moderate', and
    7 or more is 'high'. A missing value yields NaN rather than falling through
    to 'high'.
    """
    if pd.isna(hours):
        return np.nan
    if hours < 3:
        return 'low'
    if hours < 7:
        return 'moderate'
    return 'high'
# Label every row with the intensity bucket of its sporthours value.
dataset['activity_intensity'] = dataset['sporthours'].apply(categorize_intensity)

In [10]:
#bedtime grouping
def categorize_bedtime(bedtime):
    """Map a bedtime string (hour, optionally followed by ':mm') to a day segment.

    Hours 23 and later, or earlier than 4, are 'Late Night'; 4-7 'Early Morning';
    8-11 'Morning'; 12-14 'Afternoon'; 15-19 'Evening'; the remaining hours
    (20-22) are 'Night'.
    """
    hour = int(bedtime.split(':')[0])
    if hour >= 23 or hour < 4:
        return 'Late Night'
    # Exclusive upper bound of each segment, in ascending order
    segments = (
        (8, 'Early Morning'),
        (12, 'Morning'),
        (15, 'Afternoon'),
        (20, 'Evening'),
    )
    for upper_bound, label in segments:
        if hour < upper_bound:
            return label
    return 'Night'
# Label every row with the day segment of its bedtime hour.
dataset['bedtime_segment'] = dataset['bedtime'].apply(categorize_bedtime)

In [11]:
# Knowledge in datamining helpfull courses
def map_to_score(value):
    """Score one course answer: 4 for 'yes'/'mu', 0 for 'no'/'sigma', 2 otherwise.

    NOTE(review): the previous condition tested ``== 'mu'`` twice, which suggests
    a different token was intended for one of them — confirm against the raw
    answers of the columns this is applied to.
    """
    multiplier = 2 # widens the spread of the totals for plotting purposes
    if value in ('yes', 'mu'):
        return 2 * multiplier
    if value in ('no', 'sigma'):
        return 0
    return 1 * multiplier

# Sum up, row by row, the map_to_score values of the four course columns
dataset['knowledge'] = dataset[['mlcourse', 'ircourse', 'statcourse', 'dbcourse']].apply(lambda row: sum(map(map_to_score, row)), axis=1)

In [12]:
# "Mental" age of a student, derived from byear, stress and sporthours:
#   mental_age = (age + stress/10) - sporthours/5
# NOTE(review): current_year is read from the clock, so the ages depend on the
# date the notebook is run.
current_year = pd.Timestamp.now().year
dataset['age'] = current_year - dataset['byear']
stress_years = dataset['stress'] / 10
sport_years = dataset['sporthours'] / 5
dataset['mental_age'] = ((current_year - dataset['byear']) + stress_years - sport_years).round().astype(int)

# Create the 'mental_biological_age' column based on the relationship between 'age' and 'mental_age'
def determine_mental_biological_age(row):
    """Compare a row's 'age' with its 'mental_age'.

    Returns 'match' when they are equal, 'younger' when the mental age is below
    the age, and 'older' otherwise.
    """
    biological, mental = row['age'], row['mental_age']
    if biological == mental:
        return 'match'
    return 'younger' if biological > mental else 'older'

# Label every row, then report how often each label occurs.
dataset['mental_biological_age'] = dataset.apply(determine_mental_biological_age, axis=1)
age_counts = dataset['mental_biological_age'].value_counts()

# Display the counts (0 for a label that never occurs)
for message, label in (
    ("Number of 'match':", 'match'),
    ("Number of 'younger':", 'younger'),
    ("Number of 'older':", 'older'),
):
    print(message, age_counts.get(label, 0))
print("University (*masters*) shortens your life, we knew that.")

#debugger('mental_biological_age','byear')

Number of 'match': 28
Number of 'younger': 33
Number of 'older': 182
University (*masters*) shortens your life, we knew that.


In [13]:
# Remove the columns that are no longer needed, then put the rest in a fixed order.
dataset = dataset.drop(columns=['goodday1', 'goodday2','birthday','stand','studentestimate' ])
desired_columns = [
    'programme', 'mlcourse', 'ircourse', 'statcourse', 'dbcourse', 'knowledge',
    'stress', 'stress_level', 'bedtime', 'bedtime_segment',
    'sporthours', 'activity_intensity',
    'byear', 'age', 'mental_age', 'mental_biological_age',
    'gender', 'chatGPT', 'random', 'bmonth',
]

# Reindexing on the column axis applies the desired order
dataset = dataset.reindex(columns=desired_columns)

In [14]:
# Write the feature-engineered frame to disk, without the index column.
dataset.to_csv('./dataset/feature_engineering.csv', index=False)

# Training set and testing set creation

In [15]:
# Split the data into training and testing sets (80% train, 20% test).
# The fixed random_state makes the split the same on every run.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Save the training and testing datasets to separate CSV files (index not written)
train_data.to_csv('./dataset/train_dataset.csv', index=False)
test_data.to_csv('./dataset/test_dataset.csv', index=False)

# Confirmation messages, printed once both files have been written
print("Train dataset saved as 'train_dataset.csv'")
print("Test dataset saved as 'test_dataset.csv'")

Train dataset saved as 'train_dataset.csv'
Test dataset saved as 'test_dataset.csv'
