In [1]:
import pandas as pd
import numpy as np
import matplotlib as plot

In [2]:
data = pd.read_csv("GenZ_DatingApp_Data.csv")

In [7]:
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category')

In [9]:
data['Primary_App'] = data['Primary_App'].fillna(data['Primary_App'].mode()[0])

In [10]:
data['Secondary_Apps'] = data['Secondary_Apps'].fillna(data['Secondary_Apps'].mode()[0])

In [11]:
data['Challenges'] = data['Challenges'].fillna(data['Challenges'].mode()[0])

In [12]:
missing_values_per_column = data.isnull().sum()
print(missing_values_per_column)

User_ID                    0
Age                        0
Gender                     0
Location                   0
Education                  0
Occupation                 0
Primary_App                0
Secondary_Apps             0
Usage_Frequency            0
Daily_Usage_Time           0
Reason_for_Using           0
Satisfaction               0
Challenges                 0
Desired_Features           0
Preferred_Communication    0
Partner_Priorities         0
dtype: int64


In [19]:
# If you were a new data scientist joining this project, what would you need to understand this dataset?

# If I was a new student, these are what I would have done to understand this dataset. First, I would need an overview of the dataset, 
# a sort of description to make sense of the data presented.
# Summary of numerical data
print(data.describe().map('{:.2f}%'.format))

       User_ID      Age Satisfaction
count  500.00%  500.00%      500.00%
mean   250.50%   21.58%        2.91%
std    144.48%    2.26%        1.44%
min      1.00%   18.00%        1.00%
25%    125.75%   20.00%        2.00%
50%    250.50%   22.00%        3.00%
75%    375.25%   24.00%        4.00%
max    500.00%   25.00%        5.00%


In [18]:
# Summary of categorical data

for col in data.select_dtypes(include='category').columns:
    print(f"{col}:")
    print(data[col].value_counts(), "\n")

Gender:
Gender
Female        176
Male          171
Non-binary    153
Name: count, dtype: int64 

Location:
Location
Mumbai       82
Delhi        76
Kolkata      76
Bangalore    69
Pune         67
Chennai      66
Hyderabad    64
Name: count, dtype: int64 

Education:
Education
Graduate         169
Undergraduate    166
Postgraduate     165
Name: count, dtype: int64 

Occupation:
Occupation
Student          106
Intern           105
Freelancer       103
Full-time Job    100
Part-time Job     86
Name: count, dtype: int64 

Primary_App:
Primary_App
OkCupid    225
Hinge      106
Bumble      93
Tinder      76
Name: count, dtype: int64 

Secondary_Apps:
Secondary_Apps
Hinge      217
OkCupid     96
Bumble      95
Tinder      92
Name: count, dtype: int64 

Usage_Frequency:
Usage_Frequency
Daily      181
Weekly     161
Monthly    158
Name: count, dtype: int64 

Daily_Usage_Time:
Daily_Usage_Time
1.5 hours     109
2 hours       104
3 hours       102
1 hour         97
30 minutes     88
Name: count, 

In [31]:
# How do you write effective column descriptions? 

# This is how I'll proceed to write an effective column description. I've created a small dictionary to show all the columns

data_dictionary = pd.DataFrame({
    'Column Name': data.columns,
    'Data Type': data.dtypes.values,
    'Missing Values': data.isnull().sum().values,
    'Unique Values': [data[col].nunique() for col in data.columns],
    'Sample Values': [data[col].unique()[:5] for col in data.columns]  
})

display(data_dictionary)

Unnamed: 0,Column Name,Data Type,Missing Values,Unique Values,Sample Values
0,User_ID,int64,0,500,"[1, 2, 3, 4, 5]"
1,Age,int64,0,8,"[20, 24, 22, 18, 23]"
2,Gender,category,0,3,"['Non-binary', 'Female', 'Male'] Categories (3..."
3,Location,category,0,7,"['Bangalore', 'Delhi', 'Kolkata', 'Mumbai', 'C..."
4,Education,category,0,3,"['Undergraduate', 'Graduate', 'Postgraduate'] ..."
5,Occupation,category,0,5,"['Freelancer', 'Part-time Job', 'Intern', 'Ful..."
6,Primary_App,category,0,4,"['Hinge', 'OkCupid', 'Tinder', 'Bumble'] Categ..."
7,Secondary_Apps,category,0,4,"['Hinge', 'OkCupid', 'Tinder', 'Bumble'] Categ..."
8,Usage_Frequency,category,0,3,"['Monthly', 'Weekly', 'Daily'] Categories (3, ..."
9,Daily_Usage_Time,category,0,5,"['1 hour', '30 minutes', '2 hours', '1.5 hours..."


In [None]:
# What’s the best way to track changes in a dataset over time?

# There are various ways to tracks changes in a dataset over time. One of the way is by adding Timestamp and change logs. 
# Adding columns like 'created_at', 'updated_at', or 'deleted_at' in the dataset will help in tracking when records were added, modified, or removed. 