In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("Cleaned_GenZ_DatingApp_Data.csv")

In [7]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler

Q1: Feature Engineering for Future Modeling 

We can use One-Hot Encoding for variables like 'Primary_App', 'Gender', and 'Reason_for_Using', converting each category into separate binary columns. For ordinal variables like 'Usage_Frequency' (Daily, Weekly, Monthly), we can use Ordinal Encoding by mapping them to numerical values. Label Encoding can be used for simple categorical columns like 'Gender', assigning unique integers to each category.

Q2: Should we normalize numerical data? Why or why not?

Yes, normalization is important for columns like 'Age' and 'Daily_Usage_Time' to ensure that features with larger scales don't dominate the model. This is especially important for algorithms like K-Nearest Neighbors or SVM. However, for models like Decision Trees or Random Forests, normalization is not necessary since they are not sensitive to feature scales.

Q3: What new features could we create to enhance predictive modeling?

We could create an Engagement Score by combining 'Daily_Usage_Time' and 'Usage_Frequency', or derive a Satisfaction Category by binning 'Satisfaction' into high, medium, and low satisfaction levels. Additionally, we could generate a Metro vs Non-Metro indicator from the 'Location' column to analyze user behavior differences based on urbanization.



Activity 1: Encode categorical variables using One-Hot Encoding or Label Encoding

In [4]:
categorical_columns = ['Gender', 'Location', 'Education', 'Occupation', 'Primary_App', 'Secondary_Apps', 
                       'Usage_Frequency', 'Reason_for_Using', 'Challenges', 'Desired_Features', 
                       'Preferred_Communication', 'Partner_Priorities']

#Apply One-Hot Encoding
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = one_hot_encoder.fit_transform(data[categorical_columns])

#Convert encoded data to DataFrame
df_encoded = pd.DataFrame(encoded_categorical, columns=one_hot_encoder.get_feature_names_out(categorical_columns))

# Display the encoded DataFrame
print(df_encoded.head())

   Gender_male  Gender_non-binary  Location_chennai  Location_delhi  \
0          0.0                1.0               0.0             0.0   
1          0.0                0.0               0.0             1.0   
2          0.0                1.0               0.0             0.0   
3          0.0                1.0               0.0             1.0   
4          1.0                0.0               0.0             1.0   

   Location_hyderabad  Location_kolkata  Location_mumbai  Location_pune  \
0                 0.0               0.0              0.0            0.0   
1                 0.0               0.0              0.0            0.0   
2                 0.0               1.0              0.0            0.0   
3                 0.0               0.0              0.0            0.0   
4                 0.0               0.0              0.0            0.0   

   Education_postgraduate  Education_undergraduate  ...  \
0                     0.0                      1.0  ...   
1   

Activity 2: Normalize numerical variables using MinMaxScaler or StandardScaler

In [10]:
numerical_columns = ['Age', 'Daily_Usage_Time', 'Satisfaction']

scaler = MinMaxScaler()
normalized_numerical = scaler.fit_transform(data[numerical_columns])

# Convert the normalized data back to a DataFrame
normalized_date = pd.DataFrame(normalized_numerical, columns=numerical_columns)

Activity 3: Create a new feature, "active app count", by summing the number of apps used per user.

In [12]:
data['Active_App_Count'] = data.apply(lambda row: 2 if row['Primary_App'] != 'None' and row['Secondary_Apps'] != 'None' else 1 if row['Primary_App'] != 'None' or row['Secondary_Apps'] != 'None' else 0, axis=1)

In [13]:
data.head()

Unnamed: 0,User_ID,Age,Gender,Location,Education,Occupation,Primary_App,Secondary_Apps,Usage_Frequency,Daily_Usage_Time,Reason_for_Using,Satisfaction,Challenges,Desired_Features,Preferred_Communication,Partner_Priorities,Active_App_Count
0,1,20,non-binary,bangalore,undergraduate,freelancer,hinge,hinge,monthly,60.0,finding a partner,4,safety concerns,audio calls,video calls,Values > Personality > Appearance,2
1,2,24,female,delhi,undergraduate,part-time job,hinge,okcupid,weekly,30.0,casual dating,5,time-wasting,video calls,text,Values > Personality > Appearance,2
2,3,24,non-binary,kolkata,undergraduate,intern,none,none,weekly,120.0,casual dating,4,safety concerns,detailed profiles,text,Values > Personality > Appearance,2
3,4,22,non-binary,delhi,graduate,full-time job,none,okcupid,daily,30.0,casual fun,3,unknown,ai recommendations,voice notes,Personality > Interests > Values,2
4,5,18,male,delhi,graduate,intern,okcupid,okcupid,weekly,120.0,casual fun,4,safety concerns,video calls,text,Appearance > Interests > Personality,2
