Load all data, drop unnecessary columns

In [1]:
import pandas as pd
import re
filepath_dataset_demographic = r"C:\Users\MichalinaJanik\PycharmProjects\coding-fairness\dataset\netsense\demsurveyMergedCodedDisID.csv"

all_df = pd.read_csv(filepath_dataset_demographic)

unique_ids = all_df['egoid'].nunique()
print(f"Number of unique IDs: {unique_ids}")


Number of unique IDs: 203


In [2]:
!pip install python-Levenshtein
#Drop name,lastname columns 
pattern = r'^v\d+$'
columns_to_drop = [col for col in all_df.columns if re.match(pattern, col)]
all_df.drop(columns=columns_to_drop, inplace=True)


#Divide rows to six rows (one row - one semester)

pattern_sem = r'_[1-6]$'


variable_columns = [col for col in all_df.columns if re.search(pattern_sem, col)]
constant_columns = [col for col in all_df.columns if not re.search(pattern_sem, col)]

#Rename columns
def adjust_column_names(col_name):
    new_col_name = re.sub(r'_(?!\d+$)', '', col_name)
    return new_col_name

all_df.rename(columns={col: adjust_column_names(col) for col in all_df.columns}, inplace=True)

def get_stubnames(co):
    stub_set = set()
    pattern = re.compile(r'(.*)_\d+$')
    for col in co:
        match = pattern.match(col)
        if match:
            stub_set.add(match.group(1))
    return list(stub_set)


stubnames = get_stubnames(variable_columns)

for stub in stubnames:
    for num in range(1, 7):  # Assuming survey numbers range from 1 to 6
        col_name = f"{stub}_{num}"
        if col_name in all_df.columns:
            all_df.drop_duplicates(subset=['egoid', col_name], inplace=True)

if 'egoid' not in all_df.index.names:
    all_df.set_index('egoid', inplace=True)
    
    
try:
    all_df.reset_index(inplace=True)
    all_df = pd.wide_to_long(all_df, stubnames, i='egoid', j='SurveyNr', sep='_', suffix=r'\d+').reset_index()

    
    all_df.reset_index(drop=True,inplace=True)
except Exception as e:
    print(f'Exception while  creating one row per semester {e}')


#REMOVE DUPLICATES
import pandas as pd

def remove_duplicate_columns(df):
    new_columns = {}
    for col in df.columns:
        if col not in new_columns:
            column_data = df[col]
            if isinstance(column_data, pd.DataFrame):
                column_data = column_data.iloc[:, 0]
            new_columns[col] = column_data.squeeze() 
    return pd.DataFrame(new_columns)


try:
    all_df = remove_duplicate_columns(all_df)
except Exception as e:
    print("Error:", e)
    

# Sort by SurveyNr and reset index for clarity
all_df.sort_values(by=['SurveyNr'], inplace=True)
all_df.reset_index(drop=True, inplace=True)
    
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from Levenshtein import distance 
import matplotlib.pyplot as plt
import numpy as np


column_names =all_df.columns.tolist()
dist_array = []
n = len(column_names)
for i in range(n):
    for j in range(i + 1, n):
        dist_array.append(distance(column_names[i], column_names[j]))

dist_matrix = np.array(dist_array)
linked = linkage(dist_matrix, 'single')
max_distance = 3  
clusters = fcluster(linked, max_distance, criterion='distance')
clustered_columns = {}
for label, cluster_id in zip(column_names, clusters):
    if cluster_id not in clustered_columns:
        clustered_columns[cluster_id] = []
    clustered_columns[cluster_id].append(label)


def longest_common_substring(lst):
    if not lst:
        return ""
    shortest = min(lst, key=len)
    for length in range(len(shortest), 0, -1):
        for start in range(len(shortest) - length + 1):
            substring = shortest[start:start + length]
            if all(substring in item for item in lst):
                return substring
    return ""

# Collecting data for DataFrame
cluster_data = []
for cluster_id, cols in clustered_columns.items():
    common_substr = longest_common_substring(cols)
    unique_values = {col: all_df[col].unique().tolist() for col in cols}
    cluster_data.append({
        "Cluster ID": cluster_id,
        "Column Common Substring": common_substr,
        "Columns": cols,
        "Data Types": [all_df[col].dtype for col in cols],
        "Possible Values": unique_values
    })

# Create DataFrame from list of dictionaries
clustered_cols_df = pd.DataFrame(cluster_data)


direct_student_features_col_descs = clustered_cols_df[clustered_cols_df['Columns'].apply(lambda x: len(x) <= 6)]
rel_student_features_col_descs = clustered_cols_df[clustered_cols_df['Columns'].apply(lambda x: len(x) > 6)]


values_to_remove = ['lastpageseen','completed', 'duplicate']
values_to_unique_id= ['egoid', 'SurveyNr']


direct_student_features_col_descs = direct_student_features_col_descs[
    ~direct_student_features_col_descs['Column Common Substring'].isin(values_to_remove)]


given_student_cols = list(set(col for sublist in direct_student_features_col_descs['Columns'] for col in sublist))


student_opinion_features = all_df[given_student_cols]


if {'egoid', 'SurveyNr'}.issubset(student_opinion_features.columns):
    # Set 'egoid' and 'SurveyNr' as a MultiIndex
    student_opinion_features.set_index(['egoid', 'SurveyNr'], inplace=True)


#FILTER OUT THE COLUMNS WHERE WE HAVE LESS THAN 80%PERCENT OF DATA
missing_percentage = student_opinion_features.isnull().mean() * 100
threshold = 20
columns_to_drop = missing_percentage[
    missing_percentage >= threshold].index  # series datatype which have te columns name as the index

student_opinion_features.drop(columns=columns_to_drop, inplace=True)
student_opinion_features.reset_index(inplace=True)


student_opinion_features.drop(columns=['merge_1', 'merge_2', 'merge_4'], inplace=True)


rename_mapping = {
    'interestitems1': 'enjoy_music',
    'interestitems2': 'enjoy_movies',
    'interestitems3': 'enjoy_books',
    'interestitems4': 'enjoy_follow_sports',
    'interestitems5': 'enjoy_games',
    'interestitems6': 'enjoy_outdoor'
}

student_opinion_features.rename(columns=rename_mapping, inplace=True)

col1 = student_opinion_features.columns.values


col_unique_vals= {}
for col in col1:
    values=student_opinion_features[col].unique()
    col_unique_vals[col] = values
    print(f"{col} - {values}\n")




egoid - [26425 55464 86352 22745 78638 34853 50869 46654 73303 74916 50947 52385
 24760 40877 38893 59367 60830 28831 96600 64772 48990 61280 22540 39122
 89827 86610 57637 10353 46160 45000 88133 75261 78966 83568 58168 40997
 63817 38927 66404 18394 71700 48429 19066 36951 75468 46021 50709 45945
 80193 16933 77824 53275 62560 51722 52923 32899 40629 24543 16313 99338
 66991 46771 83184 71782 45292 23144 84625 32174 52067 63757 82060 39783
 51102 60482 11692 44956 85970 39253 19844 67947 15776 25947 71627 41393
 10060 18707 80058 14962 70889 36006 79877 37617 78091 86727 40557 38901
 76313 53884 84215 33094 52441 66754 45539 86237 93944 25171 11360 66052
 48187 65591 49597 15723 78062 44520 13896 18420 29914 38823 82282 55594
 25494 51552 58909 68494 78911 84841 71187 22931 17353 13882 15548 15071
 14106 18344 63188 88175 72746 68633 23642 46584 25323 36143 39701 24853
 65526 44947 77023 11023 35227 30076 63063 71518 86009 47230 69065 16495
 21350 94652 26127 25544 23040 97900 92782 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_opinion_features.drop(columns=columns_to_drop, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_opinion_features.drop(columns=['merge_1', 'merge_2', 'merge_4'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_opinion_features.rename(columns=rename_mapping, inplace=True)


Mapping opinions to : 

The survey answers were collected on different scales, e.g., yes/no/not sure or a 7-level scale expressing the degree of
agreement with the given question. Model can recognize three states, so all the above questions were transformed to the
three-level scale: agree (0), disagree (1), not sure (2)



• When a person has a disease that cannot be cured, do you think doctors should be allowed by law to end the
patient’s life by some painless means if the patient and his family request it? (euthanasia)  


• Do you think federal spending on social security should increase, be kept the same, or decrease? (fssocsec)  


• Do you think federal spending on welfare should increase, be kept the same, or decrease? (fswelfare)  


• Some people feel that the government should see to it that every person has a job and a good standard of living. Others think the government should just let each person get ahead on her/their own. Where would you place
yourself on the 7 point scale? (jobguar)  

• Do you think the use of marijuana should be made legal or not? (marijuana)
• Do you agree that we have gone too far in pushing equal rights in this country?   (toomucheqrights)




In [12]:
student_opinion_features.to_csv('ForAnalysis_beforeEngineering.csv', index=False)

In [2]:
from sklearn.preprocessing import MinMaxScaler

#map wellbeing
wellbeing_mapping = {
    'all the time': 6,
    'most of the time': 5,
    'more than half the time': 4,
    'some of the time': 3,
    'less than half the time': 2,
    'at no time': 0,
    '6': 6  # Keeping '6' as it is because it's a number, not a string that needs case conversion
}

def map_(df, mapping, substring, default_value=0):
    for column in df.columns:
        if substring in column:
            df[column] = df[column].str.lower().map(mapping).fillna(default_value)
    return df


student_opinion_features = map_(
    student_opinion_features,
    wellbeing_mapping,
    'wellbeing'
    )

# Filter out the wellbeing columns
wellbeing_columns = [col for col in student_opinion_features.columns if 'wellbeing' in col]

# Calculate the Wellbeing Index as the sum of all wellbeing columns
student_opinion_features['Wellbeing_Index'] = student_opinion_features[wellbeing_columns].sum(axis=1)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Reshape data for scaling (scaler requires 2D array)
student_opinion_features['Wellbeing_Index'] = scaler.fit_transform(student_opinion_features[['Wellbeing_Index']])
 
 # Drop original wellbeing columns
student_opinion_features.drop(columns=wellbeing_columns, inplace=True)

# Display the final DataFrame
print(student_opinion_features)

      egoid  SurveyNr enjoy_movies  extraversion  \
0     26425         1    Very much         4.375   
1     55464         1    Very much         3.375   
2     86352         1     Somewhat         3.500   
3     22745         1    Very much         2.625   
4     78638         1    Very much         4.375   
...     ...       ...          ...           ...   
1213  51675         6    Very much         1.875   
1214  65102         6          NaN           NaN   
1215  84472         6          NaN           NaN   
1216  26425         6     Somewhat         3.250   
1217  71518         6          NaN           NaN   

                                               abortion  openness  \
0            By law, abortion should never be permitted       4.1   
1     Law should permited in other cases, but only a...       3.4   
2                                              Not sure       3.8   
3      By law, should always be able to obtain abortion       3.5   
4     Law should permited in o

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default

Encode all column values with 'enjoy' substring

In [3]:
enjoy_mapping = {
    'very much': 4,
    'somewhat': 3,
    'not that much': 2,
    'not at all': 1,
    'not sure': 0,
    '5': 4,
    None: np.nan  # Keeping NaN as NaN, can choose to fill these later
}

student_opinion_features = map_(
    student_opinion_features,
    enjoy_mapping,
    'enjoy_'
    )

# Calculate Intensity and Variety Scores


cols_enjoy = [col for col in student_opinion_features.columns.values if 'enjoy' in col]
student_opinion_features['Intensity Score'] = student_opinion_features[cols_enjoy].sum(axis=1)
student_opinion_features['Variety Score'] = student_opinion_features[cols_enjoy].apply(lambda x: (x > 2).sum(), axis=1)

from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler and normalize the scores
scaler = MinMaxScaler(feature_range=(0, 100))
student_opinion_features[['Normalized Intensity Score', 'Normalized Variety Score']] = scaler.fit_transform(student_opinion_features[['Intensity Score', 'Variety Score']])

# Define a function to classify engagement into numerical categories
def classify_person(row):
    if row['Normalized Intensity Score'] > 50 and row['Normalized Variety Score'] > 50:
        return 4  # Highly Engaged and Diverse
    elif row['Normalized Intensity Score'] > 50:
        return 3  # Focused Enthusiast
    elif row['Normalized Variety Score'] > 50:
        return 2  # Casual Participant
    else:
        return 1  # Disengaged

student_opinion_features['Engagement Profile'] = student_opinion_features.apply(classify_person, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.lower().map(mapping).fillna(default

MAP BOOKSREAD TO A SCALE

In [4]:
#booksread 

df = student_opinion_features 
# Define bins for the categorization
# Define bins and labels (excluding -1 here)
bins = [-1, 0, 5, 15, 50, float('inf')]
codes = [0, 1, 2, 3, 4]

# Categorize and assign codes
df['reading_category'] = pd.cut(df['booksread'], bins=bins, labels=codes)

# Convert categorical column to integer, setting NaNs (which don't match any bin) to -1
df['reading_category'] = df['reading_category'].cat.add_categories([-1]).fillna(-1).astype(int)

# Print the DataFrame
print(df[['booksread', 'reading_category']])

      booksread  reading_category
0           1.0                 1
1           8.0                 2
2           4.0                 1
3          10.0                 2
4          30.0                 3
...         ...               ...
1213        5.0                 1
1214        NaN                -1
1215        NaN                -1
1216        2.0                 1
1217        NaN                -1

[1218 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reading_category'] = pd.cut(df['booksread'], bins=bins, labels=codes)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reading_category'] = df['reading_category'].cat.add_categories([-1]).fillna(-1).astype(int)


Map opinion string values to numbers

In [5]:
# Mappings for each category
mappings = {
    'homosexual': {
        'Always wrong': 0, 
        'Not wrong at all': 4, 
        'Not sure': 2, 
        'Sometimes wrong': 1, 
        'Almost always wrong': 0.5,
        np.nan: -1
    },
    'happy': {
        'Pretty happy': 3, 
        'Happy': 4, 
        'Very happy': 5, 
        'Not so happy': 2, 
        'Not sure': 1,
        'Very Happy': 5, 
        'Pretty Happy': 3, 
        'Not happy at all': 0, 
        '6': -1,  # Unclear category, treated as missing
        np.nan: -1
    },
    'political': {
        'Conservative': 2, 
        'Moderate': 3, 
        'Slightly conservative': 2.5, 
        'Not sure': 1, 
        'Slightly liberal': 3.5, 
        'Liberal': 4, 
        'Extremely conservative': 1, 
        'Extremely liberal': 5,
        np.nan: -1
    },
    'gaymarriage': {
        'Neither agree nor disagree': 2, 
        'Agree': 4, 
        'Strongly disagree': 0, 
        'Strongly agree': 5, 
        'Disagree': 1, 
        'Not Sure': 2.5,
        np.nan: -1
    },
    'fssocsec': {
        'Decrease': 1, 
        'Not sure': 2.5, 
        'Increase': 4, 
        'Be kept the same': 3,
        np.nan: -1
    },
    'health': {
        'Fair': 2, 
        'Excellent': 5, 
        'Good': 4, 
        'Poor': 1,
        np.nan: -1
    },
    'abortion': {
        "Law should permit only in rape, incest, or woman's life in danger": 3,
        'By law, abortion should never be permitted': 0,
        'By law, should always be able to obtain abortion': 5,
        'Law should be permitted in other cases, but only after need is established': 4,
        'Not sure': 2,
        np.nan: -1
    },
    'euthanasia': {
        'Yes': 5, 
        'No': 0, 
        'Not sure': 2.5,
        np.nan: -1
    },
    'marijuana': {
        'Not Legal': 0, 
        'Legal': 5, 
        'Not Sure': 2.5,
        np.nan: -1
    },
    'racediscrim': {
        'Strongly oppose': 5, 
        'Oppose': 4, 
        'Not sure': 2.5, 
        'Strongly favor': 0, 
        'Favor': 1,
        np.nan: -1
    },
    'deathpen': {
        'Favor': 5, 
        'Oppose': 0, 
        'Not sure': 2.5,
        np.nan: -1
    }
}

# Apply each mapping to the respective column in the DataFrame
for column, mapping in mappings.items():
    if column in df.columns:
        # Map the column and replace NaN values with -1
        df[column] = df[column].map(mapping).fillna(-1).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be s

In [6]:
additional_mappings = {
    'toomucheqrights': {
        'Strongly agree': 5, 
        'Agree': 4, 
        'Neither agree nor disagree': 3, 
        'Disagree': 2, 
        'Strongly diisagree': 1,
        'Not Sure': 2.5,
        np.nan: -1
    },
    'eqchances': {
        'Strongly agree': 5,
        'Agree': 4,
        'Neither agree nor disagree': 3,
        'Disagree': 2,
        'Strongly diisagree': 1,
        'Not Sure': 2.5,
        np.nan: -1
    },
    'premaritalsex': {
        'Always wrong': 0, 
        'Almost always wrong': 1, 
        'Sometimes wrong': 2, 
        'Not wrong at all': 4,
        'Not sure': 2.5,
        np.nan: -1
    },
    'lesseq': {
        'Strongly agree': 5, 
        'Agree': 4, 
        'Neither agree nor disagree': 3,
        'Disagree': 2,
        'Strongly diisagree': 1,
        'Not Sure': 2.5,
        np.nan: -1
    },
    'parentsmarriage': {
        'Alive, but divorced or living apart': 2, 
        'Alive and living together': 3, 
        'One of them deceased': 1,
        np.nan: -1
    },
    'fswelfare': {
        'Decrease': 1, 
        'Not sure': 2.5, 
        'Increase': 4, 
        'Be kept the same': 3,
        np.nan: -1
    }
}

# Assuming 'df' is your DataFrame and it's already loaded with data
for column, mapping in additional_mappings.items():
    if column in df.columns:
        df[column] = df[column].map(mapping).fillna(-1).astype(int)

# Printing out the first few rows to check the transformed data
print(df.head())

   egoid  SurveyNr  enjoy_movies  extraversion  abortion  openness  \
0  26425         1           4.0         4.375         0       4.1   
1  55464         1           4.0         3.375        -1       3.4   
2  86352         1           3.0         3.500         2       3.8   
3  22745         1           4.0         2.625         5       3.5   
4  78638         1           4.0         4.375        -1       3.4   

   premaritalsex  euthanasia  homosexual  deathpen  ...  health  weight  \
0              0           0           0         0  ...       5   140.0   
1              4           2           4         0  ...       5   145.0   
2              0           2           0         2  ...       5   198.0   
3              2           5           4         0  ...       2   115.0   
4              0           5           0         0  ...       5   120.0   

   parentsmarriage  Wellbeing_Index  Intensity Score  Variety Score  \
0                3         0.866667             21.0     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].map(mapping).fillna(-1).astype(int)
A value is trying to be s

Group MAJOR

In [7]:
major_groups = {
    'STEM': [
        'Engineering: Computer Science & Engineering', 'Engineering: Civil Engineering', 
        'Engineering: Mechanical Engineering', 'Engineering: Electrical or Electronic Engineering', 
        'Engineering: Chemical Engineering', 'Engineering: Aeronautical or Astronautical Eng.', 
        'Engineering: Other Engineering', 'Physical Science: Physics', 'Physical Science: Chemistry', 
        'Physical Science: Mathematics'
    ],
    'Health and Life Sciences': [
        'Biological Sciences: Biology (general)', 'Biological Sciences: Biochemistry or Biophysics', 
        'Biological Sciences: Environmental Science', 'Professional: Medicine, Dentistry, Veterinary Medicine', 
        'Professional: Therapy (occupational, physical, speech)'
    ],
    'Social Sciences and Education': [
        'Social Science: Psychology', 'Social Science: Sociology', 'Social Science: Political Sciences', 
        'Social Science: Economics', 'Social Science: Anthropology', 'Social Science: Public Policy', 
        'Education-related fields here if any'
    ],
    'Business and Law': [
        'Business: Marketing', 'Business: Finance', 'Business: Accounting', 'Business: Management', 
        'Business: International Business', 'Business: Business Admin (general)', 
        'Law-related fields here if any'
    ],
    'Arts, Humanities, and Others': [
        'Arts and Humanities: Music', 'Arts and Humanities: History', 'Arts and Humanities: Philosophy', 
        'Arts and Humanities: Journalism', 'Arts and Humanities: English ( language and literature)', 
        'Arts and Humanities: Art, fine and applied', 'Arts and Humanities: Language and Literature (except English)', 
        'Arts and Humanities: Theology or Religion', 'Other Fields: Other Field', 'Other Fields: Undecided'
    ]
}

# Function to recategorize based on the new broader mapping
def recategorize_broader(value, category_mapping):
    for key, values in category_mapping.items():
        if value in values:
            return key
    return 'Other'  # Catch-all for any unspecified or mismatched categories

# Apply recategorization to the 'major' column with the revised groups
df['major'] = df['major'].apply(lambda x: recategorize_broader(x, major_groups))

numeric_mapping = {
    'STEM': 1,
    'Health and Life Sciences': 2,
    'Social Sciences and Education': 3,
    'Business and Law': 4,
    'Arts, Humanities, and Others': 5,
    'Other': 6
}

# Apply numeric mapping to the 'major' column
df['major'] = df['major'].map(numeric_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['major'] = df['major'].apply(lambda x: recategorize_broader(x, major_groups))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['major'] = df['major'].map(numeric_mapping)


Occupation mom

BINNING THE MOM OCCUPATION VALUES

In [8]:
# Consolidated occupation groups
consolidated_occupation_groups = {
    'Healthcare and Science': [
        'Nurse', 'Physician', 'Therapist', 'Dentists', 'Scientific researcher', 'Clinical psychologist'
    ],
    'Education and Public Service': [
        'Teacher or administrator', 'School counselor', 'College teacher', 'Policymaker/government'
    ],
    'Professional and Technical': [
        'Engineer', 'Computer programmer or analyst', 'Lab technician', 'Lawyer'
    ],
    'Business and Commerce': [
        'Business owner', 'Accountant', 'Business executive', 'Business salesperson'
    ],
    'Arts and Trades': [
        'Writer or journalist', 'Artist', 'Interior decorator', 'Laborer', 'Skilled trades'
    ],
    'Unemployed / Homemaker': ['Unemployed', 'Homemaker']
}

# Function to recategorize based on the new consolidated mapping
def recategorize_consolidated(value, category_mapping):
    for key, values in category_mapping.items():
        if value in values:
            return key
    return 'Other'  # Default catch-all for any unspecified or mismatched categories

# Apply recategorization to the 'occupationmom' column
df['occupationmom'] = df['occupationmom'].apply(lambda x: recategorize_consolidated(x, consolidated_occupation_groups))

# Numeric mapping for the recategorized 'occupationmom' column
occupation_numeric_mapping = {
    'Healthcare and Science': 1,
    'Education and Public Service': 2,
    'Professional and Technical': 3,
    'Business and Commerce': 4,
    'Arts and Trades': 5,
    'Unemployed / Homemaker': 6,
    'Other': 7  # Numeric code for 'Other' category
}

# Apply numeric mapping to the 'occupationmom_group' column
df['occupationmom'] = df['occupationmom'].map(occupation_numeric_mapping)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['occupationmom'] = df['occupationmom'].apply(lambda x: recategorize_consolidated(x, consolidated_occupation_groups))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['occupationmom'] = df['occupationmom'].map(occupation_numeric_mapping)


DROP UNNECESSARY COLUMNS

In [9]:
import pandas as pd

# Assuming 'df' is your DataFrame

# List of column names to drop
columns_to_drop = [
    'Intensity Score', 'Variety Score', 'Normalized Intensity Score', 
    'Normalized Variety Score', 'startlanguage', 'booksread', 'sender'
]

# Drop the columns in place
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Display the DataFrame to verify the changes
print(df.head())


   egoid  SurveyNr  enjoy_movies  extraversion  abortion  openness  \
0  26425         1           4.0         4.375         0       4.1   
1  55464         1           4.0         3.375        -1       3.4   
2  86352         1           3.0         3.500         2       3.8   
3  22745         1           4.0         2.625         5       3.5   
4  78638         1           4.0         4.375        -1       3.4   

   premaritalsex  euthanasia  homosexual  deathpen  ...  enjoy_books  \
0              0           0           0         0  ...          2.0   
1              4           2           4         0  ...          4.0   
2              0           2           0         2  ...          4.0   
3              2           5           4         0  ...          4.0   
4              0           5           0         0  ...          4.0   

   conscientiousness  lesseq  neuroticism  health  weight  parentsmarriage  \
0           3.333333       3        1.750       5   140.0           

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns_to_drop, inplace=True, errors='ignore')


BINNING WEIGHTS

In [10]:
missing_weight_count = df['weight'].isna().sum()
print(f"Number of students without specified weight: {missing_weight_count}")

Number of students without specified weight: 230


In [11]:
# Define bins and labels
bins = [0, 100, 150, 200, 250, 300]
labels = [1, 2, 3, 4, 5]  # Corresponding labels for the bins

# Perform the binning
df['weight_binned'] = pd.cut(df['weight'], bins=bins, labels=labels, right=False, include_lowest=True)

# Add a new category for NaN values and map it numerically
df['weight_binned'] = df['weight_binned'].cat.add_categories(['Not Shared'])
df['weight_binned'].fillna('Not Shared', inplace=True)

# Mapping categorical data to numerical values, including 'Not Shared'


num_mapping = {str(label): label for label in labels}
num_mapping['Not Shared'] = 0  # Assign a numerical value to 'Not Shared'
# Apply the numeric mapping
df['weight_binned'] = df['weight_binned'].map(num_mapping)


# Apply the numeric mapping
df['weight_binned'] = df['weight_binned'].map(num_mapping)

df.drop(columns=['weight'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weight_binned'] = pd.cut(df['weight'], bins=bins, labels=labels, right=False, include_lowest=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weight_binned'] = df['weight_binned'].cat.add_categories(['Not Shared'])
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] 

In [12]:
from Demographic_Features.helpers import drop_missing_data

df = drop_missing_data(df, threshold=20, axis=1)
df = drop_missing_data(df, threshold=15, axis=0)
df = df[df['SurveyNr'] != 1]

In [15]:
from Demographic_Features.helpers import impute_missing_values, calculate_missing_data

numerical_cols = [col for col in df.columns if col not in ['SurveyNr', 'egoid']]

df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

df = impute_missing_values(df, numerical_cols, [])
calculate_missing_data(df)

Percentage of Missing Data per Row:
203     0.0
204     0.0
205     0.0
206     0.0
207     0.0
       ... 
1213    0.0
1214    0.0
1215    0.0
1216    0.0
1217    0.0
Length: 1015, dtype: float64

Percentage of Missing Data per Column:
egoid                  0.0
SurveyNr               0.0
enjoy_movies           0.0
extraversion           0.0
abortion               0.0
openness               0.0
premaritalsex          0.0
euthanasia             0.0
homosexual             0.0
deathpen               0.0
marijuana              0.0
gaymarriage            0.0
eqchances              0.0
enjoy_games            0.0
enjoy_outdoor          0.0
enjoy_music            0.0
fssocsec               0.0
major                  0.0
occupationmom          0.0
political              0.0
toomucheqrights        0.0
happy                  0.0
racediscrim            0.0
agreeableness          0.0
enjoy_follow_sports    0.0
fswelfare              0.0
enjoy_books            0.0
conscientiousness      0.0
lesseq 

(203     0.0
 204     0.0
 205     0.0
 206     0.0
 207     0.0
        ... 
 1213    0.0
 1214    0.0
 1215    0.0
 1216    0.0
 1217    0.0
 Length: 1015, dtype: float64,
 egoid                  0.0
 SurveyNr               0.0
 enjoy_movies           0.0
 extraversion           0.0
 abortion               0.0
 openness               0.0
 premaritalsex          0.0
 euthanasia             0.0
 homosexual             0.0
 deathpen               0.0
 marijuana              0.0
 gaymarriage            0.0
 eqchances              0.0
 enjoy_games            0.0
 enjoy_outdoor          0.0
 enjoy_music            0.0
 fssocsec               0.0
 major                  0.0
 occupationmom          0.0
 political              0.0
 toomucheqrights        0.0
 happy                  0.0
 racediscrim            0.0
 agreeableness          0.0
 enjoy_follow_sports    0.0
 fswelfare              0.0
 enjoy_books            0.0
 conscientiousness      0.0
 lesseq                 0.0
 neuroticism  

Delete highly correlated features, or the one described by the low variance.



In [16]:
from Demographic_Features.helpers import remove_highly_correlated_features

df = remove_highly_correlated_features(df, correlation_threshold=0.85)

Dropped 11 highly correlated features.


Remove low variance features

In [18]:
from Demographic_Features.helpers import remove_low_variance_features

df = remove_low_variance_features(df, threshold=0.05)

Reduced from 23 to 23 features.


Transform SKEWED DATA - NO RF deals with that

In [None]:
"""
from Demographic_Features.helpers import transform_skewed_data
numerical_cols = [col for col in df.columns if col not in ['SurveyNr', 'egoid']]
df = transform_skewed_data(df, numerical_cols)
"""

In [19]:

dirPath = r'C:/Users/Michalina/MasterThesis/PWR/coding-fairness/Demographic_Features/'
filePath = '../data/Opinion_Features_Students.csv'
df.to_csv(dirPath+filePath, index=False)

In [14]:
calculate_missing_data(df)

Percentage of Missing Data per Row:
203      0.000000
204      0.000000
205      0.000000
206      0.000000
207      0.000000
          ...    
1213     0.000000
1214    14.285714
1215    14.285714
1216     0.000000
1217    14.285714
Length: 1015, dtype: float64

Percentage of Missing Data per Column:
egoid                   0.000000
SurveyNr                0.000000
enjoy_movies            0.000000
extraversion           22.561576
abortion                0.000000
openness               22.561576
premaritalsex           0.000000
euthanasia              0.000000
homosexual              0.000000
deathpen                0.000000
marijuana               0.000000
gaymarriage             0.000000
eqchances               0.000000
enjoy_games             0.000000
enjoy_outdoor           0.000000
enjoy_music             0.000000
fssocsec                0.000000
major                   0.000000
occupationmom           0.000000
political               0.000000
toomucheqrights         0.000000
happ

(203      0.000000
 204      0.000000
 205      0.000000
 206      0.000000
 207      0.000000
           ...    
 1213     0.000000
 1214    14.285714
 1215    14.285714
 1216     0.000000
 1217    14.285714
 Length: 1015, dtype: float64,
 egoid                   0.000000
 SurveyNr                0.000000
 enjoy_movies            0.000000
 extraversion           22.561576
 abortion                0.000000
 openness               22.561576
 premaritalsex           0.000000
 euthanasia              0.000000
 homosexual              0.000000
 deathpen                0.000000
 marijuana               0.000000
 gaymarriage             0.000000
 eqchances               0.000000
 enjoy_games             0.000000
 enjoy_outdoor           0.000000
 enjoy_music             0.000000
 fssocsec                0.000000
 major                   0.000000
 occupationmom           0.000000
 political               0.000000
 toomucheqrights         0.000000
 happy                   0.000000
 racediscrim