In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
df1 = pd.read_csv(r'C:\Users\Johannes\Documents\Uni\Master\Tech_Labs\ws24-skillswap\DS\01_Notebooks\01_Synthetic_Data_Set\Skillswap_Data_Labeled.csv')

df1.head()

Unnamed: 0,First_Name,EMail,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Languages_Preferred,...,Technology,Photo,Video,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Activity_Status,Sum_Skills_Off_ab_avg
0,Maurice,maurice.zengin@hotmail.com,24,Prefer not to say,Local or Remote,52.513699,13.212129,Germany,Berlin,['Ukrainian'],...,,,,Dissatisfied,Satisfied,Very Satisfied,Neutral or Average,Satisfied,Swapper,1
1,Donatello,donatello.holt@icloud.com,30,Male,Local or Remote,52.480158,13.756927,Germany,Berlin,"['Ukrainian', 'German', 'Portuguese', 'Punjabi...",...,,9.0,,Very Dissatisfied,Very Dissatisfied,Neutral or Average,Very Satisfied,Dissatisfied,Active User,0
2,Gudula,gudula.lima@protonmail.com,27,Female,Remote Only,52.663405,13.213178,Germany,Berlin,"['English', 'German', 'Polish']",...,,,,Satisfied,Neutral or Average,Satisfied,Dissatisfied,Dissatisfied,Active User,0
3,Sunay,sunay.stey@unitybox.de,24,Male,Remote Only,52.465162,13.479279,Germany,Berlin,"['Turkish', 'Bengali', 'Mandarin Chinese', 'Po...",...,,,,Satisfied,Dissatisfied,Very Satisfied,Satisfied,Dissatisfied,Active User,0
4,Alexandros,alexandros.eberth@protonmail.com,28,Male,Remote Only,52.652361,13.144213,Germany,Berlin,"['Japanese', 'Romanian']",...,,,0.0,Dissatisfied,Neutral or Average,Very Satisfied,Very Satisfied,Neutral or Average,Inactive,0


In [3]:
# Prepare for encoding of categorical values
# Drop Domains to avoid redundancy towards Sum_Skills_Off
columns_to_drop = [
    'First_Name', 
    'EMail', 
    'Languages_Preferred',  
    ]
df2 = df1.drop(columns=columns_to_drop).copy()
df2.head()

Unnamed: 0,Age,Gender,Location_Preferences,Latitude,Longitude,Country,City,Last_Seen,Response_Time,Farming_off,...,Technology,Photo,Video,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Activity_Status,Sum_Skills_Off_ab_avg
0,24,Prefer not to say,Local or Remote,52.513699,13.212129,Germany,Berlin,Within a Week,Within a week,,...,,,,Dissatisfied,Satisfied,Very Satisfied,Neutral or Average,Satisfied,Swapper,1
1,30,Male,Local or Remote,52.480158,13.756927,Germany,Berlin,Recently,Within a week,,...,,9.0,,Very Dissatisfied,Very Dissatisfied,Neutral or Average,Very Satisfied,Dissatisfied,Active User,0
2,27,Female,Remote Only,52.663405,13.213178,Germany,Berlin,Within a Week,Within a week,,...,,,,Satisfied,Neutral or Average,Satisfied,Dissatisfied,Dissatisfied,Active User,0
3,24,Male,Remote Only,52.465162,13.479279,Germany,Berlin,Within a Week,Within a week,,...,,,,Satisfied,Dissatisfied,Very Satisfied,Satisfied,Dissatisfied,Active User,0
4,28,Male,Remote Only,52.652361,13.144213,Germany,Berlin,Over a Month Ago,Within a week,,...,,,0.0,Dissatisfied,Neutral or Average,Very Satisfied,Very Satisfied,Neutral or Average,Inactive,0


In [4]:
df_cat = df2.select_dtypes(object)
df_num = df2.select_dtypes(np.number)

In [5]:
df_num.head()

Unnamed: 0,Age,Latitude,Longitude,Farming_dem,Fishery_dem,Gardening_dem,Pet grooming_dem,Pet training_dem,Drawing_dem,Painting_dem,...,Music,Writing,Language,Health,Sports,Food,Technology,Photo,Video,Sum_Skills_Off_ab_avg
0,24,52.513699,13.212129,1,1,1,0,0,1,1,...,,,,,18.0,,,,,1
1,30,52.480158,13.756927,1,1,1,0,0,0,0,...,,,,,,,,9.0,,0
2,27,52.663405,13.213178,0,0,0,1,1,0,0,...,,,0.0,,,,,,,0
3,24,52.465162,13.479279,0,0,0,0,0,1,1,...,,,,,,,,,,0
4,28,52.652361,13.144213,0,0,0,0,0,0,0,...,,,,,,,,,0.0,0


In [6]:
df_num2 = df_num.drop(columns=["Longitude", "Latitude"])

# Step 1: Replace NaN values with "0"
# Step 2: Convert all numeric columns to integers

df_num2 = df_num2.fillna(0)

for column in df_num2.columns:
    if pd.api.types.is_numeric_dtype(df_num2[column]):
        df_num2[column] = df_num2[column].astype(int)

In [7]:
df_num_cleaned = pd.concat([df_num[["Longitude", "Latitude"]], df_num2], axis=1)
df_num_cleaned

Unnamed: 0,Longitude,Latitude,Age,Farming_dem,Fishery_dem,Gardening_dem,Pet grooming_dem,Pet training_dem,Drawing_dem,Painting_dem,...,Music,Writing,Language,Health,Sports,Food,Technology,Photo,Video,Sum_Skills_Off_ab_avg
0,13.212129,52.513699,24,1,1,1,0,0,1,1,...,0,0,0,0,18,0,0,0,0,1
1,13.756927,52.480158,30,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,9,0,0
2,13.213178,52.663405,27,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,13.479279,52.465162,24,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,13.144213,52.652361,28,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,13.508489,52.343721,27,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11,0,1
9996,13.533657,52.504061,24,0,0,0,0,0,0,0,...,0,11,0,0,0,0,0,0,0,1
9997,13.165508,52.398269,24,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9998,13.462890,52.543458,26,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
df_cat.head()

Unnamed: 0,Gender,Location_Preferences,Country,City,Last_Seen,Response_Time,Farming_off,Fishery_off,Gardening_off,Pet grooming_off,...,Animation_off,Motion graphics_off,Visual effects_off,Skill Domains,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Activity_Status
0,Prefer not to say,Local or Remote,Germany,Berlin,Within a Week,Within a week,,,,Advanced,...,,,,"['Food', 'Animal Care', 'Sports']",Dissatisfied,Satisfied,Very Satisfied,Neutral or Average,Satisfied,Swapper
1,Male,Local or Remote,Germany,Berlin,Recently,Within a week,,,,,...,,,,['Photo'],Very Dissatisfied,Very Dissatisfied,Neutral or Average,Very Satisfied,Dissatisfied,Active User
2,Female,Remote Only,Germany,Berlin,Within a Week,Within a week,,,,,...,,,,"['Photo', 'Language', 'Technology']",Satisfied,Neutral or Average,Satisfied,Dissatisfied,Dissatisfied,Active User
3,Male,Remote Only,Germany,Berlin,Within a Week,Within a week,,,,,...,,,,"['Language', 'Sports', 'Art']",Satisfied,Dissatisfied,Very Satisfied,Satisfied,Dissatisfied,Active User
4,Male,Remote Only,Germany,Berlin,Over a Month Ago,Within a week,,,,,...,Beginner,Beginner,Advanced,"['Video', 'Art', 'Writing']",Dissatisfied,Neutral or Average,Very Satisfied,Very Satisfied,Neutral or Average,Inactive


In [9]:
# One-hot encoding for columns containing nominal categorical values
selected_columns = ['Gender', 'Location_Preferences']
df_cat2 = df_cat[selected_columns].copy()

In [10]:
# One-hot encoding for columns containing nominal categorical values
df_cat2.head()

Unnamed: 0,Gender,Location_Preferences
0,Prefer not to say,Local or Remote
1,Male,Local or Remote
2,Female,Remote Only
3,Male,Remote Only
4,Male,Remote Only


In [11]:
# One-hot encoding for columns containing nominal categorical values
cat_encoder = OneHotEncoder()
cat2_1hot = cat_encoder.fit_transform(df_cat2)

In [12]:
# One-hot encoding for columns containing nominal categorical values
cat2_1hot.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 0., 0.]])

In [13]:
dense_array = cat2_1hot.toarray()
dense_array_int = dense_array.astype(int)
column_names = cat_encoder.get_feature_names_out()

df_cat2_1hot = pd.DataFrame(dense_array_int, columns=column_names)
df_cat2_1hot

Unnamed: 0,Gender_Diverse,Gender_Female,Gender_Male,Gender_Prefer not to say,Location_Preferences_Local Only,Location_Preferences_Local or Remote,Location_Preferences_Remote Only
0,0,0,0,1,0,1,0
1,0,0,1,0,0,1,0
2,0,1,0,0,0,0,1
3,0,0,1,0,0,0,1
4,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...
9995,0,0,1,0,0,0,1
9996,0,1,0,0,0,0,1
9997,0,0,1,0,1,0,0
9998,0,0,1,0,0,1,0


In [14]:
# Ordinal Encoding for columns containing ordinal categorical values
columns_to_drop = ['Country', 'City', 'Gender', 'Location_Preferences']
df_cat3 = df_cat.drop(columns=columns_to_drop).copy()

In [15]:
df_cat3.head()

Unnamed: 0,Last_Seen,Response_Time,Farming_off,Fishery_off,Gardening_off,Pet grooming_off,Pet training_off,Drawing_off,Painting_off,Sculpture_off,...,Animation_off,Motion graphics_off,Visual effects_off,Skill Domains,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Activity_Status
0,Within a Week,Within a week,,,,Advanced,Novice,,,,...,,,,"['Food', 'Animal Care', 'Sports']",Dissatisfied,Satisfied,Very Satisfied,Neutral or Average,Satisfied,Swapper
1,Recently,Within a week,,,,,,,,,...,,,,['Photo'],Very Dissatisfied,Very Dissatisfied,Neutral or Average,Very Satisfied,Dissatisfied,Active User
2,Within a Week,Within a week,,,,,,,,,...,,,,"['Photo', 'Language', 'Technology']",Satisfied,Neutral or Average,Satisfied,Dissatisfied,Dissatisfied,Active User
3,Within a Week,Within a week,,,,,,Novice,Expert,Novice,...,,,,"['Language', 'Sports', 'Art']",Satisfied,Dissatisfied,Very Satisfied,Satisfied,Dissatisfied,Active User
4,Over a Month Ago,Within a week,,,,,,Novice,Novice,Novice,...,Beginner,Beginner,Advanced,"['Video', 'Art', 'Writing']",Dissatisfied,Neutral or Average,Very Satisfied,Very Satisfied,Neutral or Average,Inactive


In [16]:
last_seen_mapping = {
    'Recently': 5,
    'Today': 4,
    'Yesterday': 3,
    'Within a Week': 2,
    'Within a Month': 1,
    'Over a Month Ago': 0
}

df_cat3['Last_Seen'] = df_cat3['Last_Seen'].replace(last_seen_mapping)

In [17]:
response_time_mapping = {
    'Within 24 hours': 3,
    'Within a week': 2,
    'Within a month': 1,
    'More than a month': 0
}

df_cat3['Response_Time'] = df_cat3['Response_Time'].replace(response_time_mapping)

In [18]:
skills_mappings = {
    np.nan: '0', 
    'Novice': 1, 
    'Beginner': 2, 
    'Intermediate': 3, 
    'Advanced' : 4, 
    'Expert' : 5,       
   
}

for column in df_cat3.columns:
    df_cat3[column] = df_cat3[column].replace(skills_mappings)
    
for column in df_cat3.columns:
    try:
        df_cat3[column] = df_cat3[column].astype(int)
    except ValueError:
        pass 

In [19]:
feedback_mappings = {
    'Very Dissatisfied': 1, 'Dissatisfied': 2, 'Neutral or Average': 3, 'Satisfied': 4, 'Very Satisfied': 5,       
   
}

for column in df_cat3.columns:
    df_cat3[column] = df_cat3[column].replace(feedback_mappings)
    
for column in df_cat3.columns:
    try:
        df_cat3[column] = df_cat3[column].astype(int)
    except ValueError:
        pass 

In [20]:
status_mappings = {
    'Swapper': 3,
    'Active User': 2,
    'Occasional User': 1,
    'Inactive': 0
}
df_cat3['Activity_Status'] = df_cat3['Activity_Status'].replace(status_mappings)

In [21]:
df_cat3.head()

Unnamed: 0,Last_Seen,Response_Time,Farming_off,Fishery_off,Gardening_off,Pet grooming_off,Pet training_off,Drawing_off,Painting_off,Sculpture_off,...,Animation_off,Motion graphics_off,Visual effects_off,Skill Domains,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Activity_Status
0,2,2,0,0,0,4,1,0,0,0,...,0,0,0,"['Food', 'Animal Care', 'Sports']",2,4,5,3,4,3
1,5,2,0,0,0,0,0,0,0,0,...,0,0,0,['Photo'],1,1,3,5,2,2
2,2,2,0,0,0,0,0,0,0,0,...,0,0,0,"['Photo', 'Language', 'Technology']",4,3,4,2,2,2
3,2,2,0,0,0,0,0,1,5,1,...,0,0,0,"['Language', 'Sports', 'Art']",4,2,5,4,2,2
4,0,2,0,0,0,0,0,1,1,1,...,2,2,4,"['Video', 'Art', 'Writing']",2,3,5,5,3,0


In [22]:
concatenated_df = pd.concat([df_cat2_1hot, df_num_cleaned, df_cat3], axis=1)

# relocate Sum_Skills_Off_ab_avg Column to the end of table for a clearer data overview
column_relocated = 'Sum_Skills_Off_ab_avg'
relocation = concatenated_df.pop(column_relocated)

new_column_index = len(concatenated_df.columns) - 1

concatenated_df.insert(new_column_index, column_relocated, relocation)

In [23]:
concatenated_df

Unnamed: 0,Gender_Diverse,Gender_Female,Gender_Male,Gender_Prefer not to say,Location_Preferences_Local Only,Location_Preferences_Local or Remote,Location_Preferences_Remote Only,Longitude,Latitude,Age,...,Motion graphics_off,Visual effects_off,Skill Domains,Usability,Speed_Performance,Feature_Satisfaction,Security_Satisfaction,Customer_Support,Sum_Skills_Off_ab_avg,Activity_Status
0,0,0,0,1,0,1,0,13.212129,52.513699,24,...,0,0,"['Food', 'Animal Care', 'Sports']",2,4,5,3,4,1,3
1,0,0,1,0,0,1,0,13.756927,52.480158,30,...,0,0,['Photo'],1,1,3,5,2,0,2
2,0,1,0,0,0,0,1,13.213178,52.663405,27,...,0,0,"['Photo', 'Language', 'Technology']",4,3,4,2,2,0,2
3,0,0,1,0,0,0,1,13.479279,52.465162,24,...,0,0,"['Language', 'Sports', 'Art']",4,2,5,4,2,0,2
4,0,0,1,0,0,0,1,13.144213,52.652361,28,...,2,4,"['Video', 'Art', 'Writing']",2,3,5,5,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,1,0,0,0,1,13.508489,52.343721,27,...,0,0,"['Art', 'Photo']",1,4,5,4,5,1,1
9996,0,1,0,0,0,0,1,13.533657,52.504061,24,...,0,0,"['Music', 'Writing']",5,1,4,2,4,1,3
9997,0,0,1,0,1,0,0,13.165508,52.398269,24,...,0,0,['Health'],4,2,2,1,4,0,1
9998,0,0,1,0,0,1,0,13.462890,52.543458,26,...,0,0,"['Agriculture', 'Animal Care']",1,4,4,2,3,1,1


In [24]:
concatenated_df.to_csv('SSW_Data_Encoded.csv', index=False)
