# Load data

In [64]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.impute import KNNImputer


# Load the CSV data
train_csv = pd.read_csv('../data/raw/train.csv')

# Load the JSON-derived groups data
train_groups_df = pd.read_pickle('../data/processed/train_groups_df.pkl')  # Assuming you saved this as a pickle

# Inspect the data
display(train_csv.head())
display(train_groups_df.head())

Unnamed: 0,user_id,target,name,sex,dob,location,location_population,location_from,location_from_population,occupation,hobbies,daily_commute,friends_number,relationship_status,education,credit_card_type
0,0,1,Halina,,1982-08-07,Piastów,22732,Piastów,22732,Teaching professionals,Fitness,46.0,196,Single,,Visa
1,1,0,Eustachy,male,1971-10-28,Sokółka,18331,Sokółka,18331,General and keyboard clerks,"LARPing,Foreign language learning,Netball",55.0,243,Single,,
2,2,1,Egon,,2000-07-05,Łaskarzew,4879,Łaskarzew,4879,Protective services workers,"Bodybuilding,Kabaddi",90.0,191,In relationship,3.0,
3,3,0,Eulalia,female,1992-06-10,Bydgoszcz,352313,Bydgoszcz,352313,Customer services clerks,Badminton,88.0,164,In relationship,3.0,Visa
4,4,0,Hilary,,1975-01-09,Osieczna,2322,Poznań,538633,Refuse workers and other elementary workers,"Fitness,Embroidery,Lacemaking",40.0,119,Married with kids,5.0,


Unnamed: 0,Unnamed: 1,group_name,date_joined,year_joined,month_joined,weekday_joined
0,0,Let's excercise together and lose a few kilo q...,2008-09-05 09:55:18.730066,2008,9,4
0,1,Strongman competition,2008-05-22 21:25:22.572365,2008,5,3
0,2,Fast food 4 life,2012-02-02 05:26:01.293628,2012,2,3
0,3,alternative medicine - Hypnosis and bioenergot...,2008-07-05 05:47:12.254848,2008,7,5
1,0,Tom Cruise group,2009-06-14 16:48:28.606142,2009,6,6


# Data engineering

### Remove irrelevant columns

In [65]:
# Drop columns with low utility for modeling
columns_to_drop = ['name']
train_csv = train_csv.drop(columns=columns_to_drop)


### Feature engineering

In [66]:
train_csv['dob'] = pd.to_datetime(train_csv['dob'], errors='coerce')  # Handle invalid dates
train_csv['age'] = datetime.now().year - train_csv['dob'].dt.year

# Remove 'dob' after transformation
train_csv = train_csv.drop(columns=['dob'])


In [67]:
# Bin locations by population size
bins = [0, 5000, 20000, 50000, 100000, np.inf]
labels = ['Rural', 'Small Town', 'Town', 'City', 'Metro']
train_csv['location_category'] = pd.cut(train_csv['location_population'], bins=bins, labels=labels)
train_csv['location_from_category'] = pd.cut(train_csv['location_from_population'], bins=bins, labels=labels)

# Remove columns after binning
train_csv = train_csv.drop(columns=['location_population', 'location_from_population', 'location', 'location_from'])

### Data from groups joined

In [68]:
# Number of groups joined by each user
group_counts = train_groups_df.groupby(level=0)['group_name'].count()
group_counts

# train_csv = train_csv.merge(group_counts, left_on='user_id', right_index=True, how='left')
# train_csv['group_count'] = pd.concat([group_counts], axis=1, keys=['group_count'], sort=True)['group_count']

0       4
1       5
10      2
100     5
1000    9
       ..
995     5
996     1
997     3
998     3
999     1
Name: group_name, Length: 3944, dtype: int64

In [69]:
# Perform one-hot encoding for the specified columns
categorical_columns = ['sex', 'occupation', 'relationship_status', 'credit_card_type']

# Use pd.get_dummies to one-hot encode the columns
train_csv = pd.get_dummies(train_csv, columns=categorical_columns, drop_first=True)

In [70]:
# Display updated DataFrame
train_csv.head()

Unnamed: 0,user_id,target,hobbies,daily_commute,friends_number,education,age,location_category,location_from_category,sex_male,...,occupation_Street and related sales and service workers,"occupation_Subsistence farmers, fishers, hunters and gatherers",occupation_Teaching professionals,relationship_status_In relationship,relationship_status_Married,relationship_status_Married with kids,relationship_status_Single,credit_card_type_Mastercard,credit_card_type_Revolut,credit_card_type_Visa
0,0,1,Fitness,46.0,196,,42.0,Town,Town,False,...,False,False,True,False,False,False,True,False,False,True
1,1,0,"LARPing,Foreign language learning,Netball",55.0,243,,53.0,Small Town,Small Town,True,...,False,False,False,False,False,False,True,False,False,False
2,2,1,"Bodybuilding,Kabaddi",90.0,191,3.0,24.0,Rural,Rural,False,...,False,False,False,True,False,False,False,False,False,False
3,3,0,Badminton,88.0,164,3.0,32.0,Metro,Metro,False,...,False,False,False,True,False,False,False,False,False,True
4,4,0,"Fitness,Embroidery,Lacemaking",40.0,119,5.0,49.0,Rural,Metro,False,...,False,False,False,False,False,True,False,False,False,False


### Missing values

In [None]:
# Initialize KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Select only numerical columns for KNN Imputation
numeric_columns = ['daily_commute', 'age']
train_csv[numeric_columns] = imputer.fit_transform(train_csv[numeric_columns])

# Create a missing indicator
train_csv['education_missing'] = train_csv['education'].isnull().astype(int)

# Impute missing values with the median
train_csv['education'] = train_csv['education'].fillna(train_csv['education'].median())


In [72]:
print(train_csv.isnull().sum())

user_id                                                                                           0
target                                                                                            0
hobbies                                                                                         680
daily_commute                                                                                     0
friends_number                                                                                    0
education                                                                                         0
age                                                                                               0
location_category                                                                                 0
location_from_category                                                                            0
sex_male                                                                                          0


In [73]:
# hobbies -> bert -> embeddings -> clustering -> insights (new columns?)

# EDA followup

In [74]:
train_csv.isnull().sum()

user_id                                                                                           0
target                                                                                            0
hobbies                                                                                         680
daily_commute                                                                                     0
friends_number                                                                                    0
education                                                                                         0
age                                                                                               0
location_category                                                                                 0
location_from_category                                                                            0
sex_male                                                                                          0


In [75]:
train_csv.head()

Unnamed: 0,user_id,target,hobbies,daily_commute,friends_number,education,age,location_category,location_from_category,sex_male,...,"occupation_Subsistence farmers, fishers, hunters and gatherers",occupation_Teaching professionals,relationship_status_In relationship,relationship_status_Married,relationship_status_Married with kids,relationship_status_Single,credit_card_type_Mastercard,credit_card_type_Revolut,credit_card_type_Visa,education_missing
0,0,1,Fitness,46.0,196,4.0,42.0,Town,Town,False,...,False,True,False,False,False,True,False,False,True,1
1,1,0,"LARPing,Foreign language learning,Netball",55.0,243,4.0,53.0,Small Town,Small Town,True,...,False,False,False,False,False,True,False,False,False,1
2,2,1,"Bodybuilding,Kabaddi",90.0,191,3.0,24.0,Rural,Rural,False,...,False,False,True,False,False,False,False,False,False,0
3,3,0,Badminton,88.0,164,3.0,32.0,Metro,Metro,False,...,False,False,True,False,False,False,False,False,True,0
4,4,0,"Fitness,Embroidery,Lacemaking",40.0,119,5.0,49.0,Rural,Metro,False,...,False,False,False,False,True,False,False,False,False,0
