**Installing the Faker Library**

In [None]:
# Installing the faker library which helps in generating synthetic data
# Useful for creating synthetic datasets for testing or practice

!pip install faker

Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m109.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.4.2


**Import Other Required Libraries**

In [None]:
# Libraries
import pandas as pd           #For creating and handling dataframes
import uuid                   #For generating unique IDs (universally unique identifiers)
import random                 #For generating random numbers
from faker import Faker       #For generating fake data (names, addresses, etc.)

**Defining the Number of rows or users**

In [None]:
num_users = 100   #Number of synthetic user records to generate

#Seed Python's `random` module so that every random.choices() draw made later
#in the notebook yields the same values on each Restart & Run All.
#Note: uuid.uuid4() does not draw from this generator, so IDs still change per run.
SEED = 42
random.seed(SEED)

**Initializing Dataset with Defined Features**

Feature List:

Id: A unique integer identifying each user (taken from the `node` field of a randomly generated UUID).

Gender: A string with one of three values: "male", "female", or "na" (not available).

Subscriber: A binary (True/False) indicating subscription status.

Name: A full name generated using the Faker library.

Rating: An integer value from 1 to 5, representing user feedback or rating.

In [None]:
#Column names for the synthetic dataset, in the order they appear in the table:
#  Id         - per-user identifier (filled in from uuid values)
#  gender     - gender label of the user
#  subscriber - True/False subscription flag
#  name       - full name produced with Faker
#  rating     - randomly drawn rating
features = ["Id", "gender", "subscriber", "name", "rating"]

#Start from a DataFrame that has these columns but no rows yet
df = pd.DataFrame(columns=features)

#Show the (still empty) DataFrame
df

Unnamed: 0,Id,gender,subscriber,name,rating


**Generating unique identifiers**

In [None]:
#Build one integer ID per user from randomly generated UUIDs.
#uuid.uuid4() creates a random UUID; its .node attribute is the UUID's 48-bit
#node field as an int, so collisions are very unlikely but not impossible.
user_ids = [uuid.uuid4().node for _ in range(num_users)]
df['Id'] = user_ids

#Show the DataFrame with the 'Id' column populated
df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,,,,
1,84425986876768,,,,
2,165332488109656,,,,
3,269172384654805,,,,
4,84221574292360,,,,
...,...,...,...,...,...
95,4701392895017,,,,
96,155011044322127,,,,
97,205794487629728,,,,
98,62823151358740,,,,


**Checking if all IDs are unique**

In [None]:
#Sanity check on the generated IDs:
#count the distinct values in the 'Id' column and compare that count with the
#number of users requested. Prints True when they match, False otherwise.
distinct_ids = df['Id'].nunique()
print(distinct_ids == num_users)

True


**Generating Gender values with Weighted Random Selection**

In [None]:
#Gender labels to draw from; "na" stands for not available
genders = ["male", "female", "na"]

#Relative weights, in the same order as `genders`:
#45 for male, 45 for female and 10 for 'na'
gender_weights = (45, 45, 10)

#Draw one label per user according to those weights
df['gender'] = random.choices(genders, weights=gender_weights, k=num_users)

#Show the DataFrame with the 'gender' column populated
df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,female,,,
1,84425986876768,male,,,
2,165332488109656,na,,,
3,269172384654805,female,,,
4,84221574292360,female,,,
...,...,...,...,...,...
95,4701392895017,female,,,
96,155011044322127,female,,,
97,205794487629728,female,,,
98,62823151358740,female,,,


**Checking Distribution of Gender Values in the Dataset**

In [None]:
#Tally how many users fall into each gender category
df['gender'].value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
female,50
male,40
na,10


**Generating Subscriber Values**

In [None]:
#The two possible subscription states
choice = [True, False]

#Pick one state per user; no weights are passed, so both are equally likely
df['subscriber'] = random.choices(choice, k=num_users)

#Show the DataFrame with the 'subscriber' column populated
df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,female,False,,
1,84425986876768,male,False,,
2,165332488109656,na,False,,
3,269172384654805,female,False,,
4,84221574292360,female,True,,
...,...,...,...,...,...
95,4701392895017,female,False,,
96,155011044322127,female,True,,
97,205794487629728,female,True,,
98,62823151358740,female,False,,


**Generating User Names Based on Gender**

In [None]:
#Instantiating Faker with the "FR_FR" locale (the names it produces are French)
faker = Faker(locale="FR_FR")

#Seed Faker's shared random generator so the generated names are repeatable
#on every Restart & Run All
Faker.seed(42)

#Helper that produces a fake full name matching a gender label
def name_gen(gender):
    """Return a Faker-generated name appropriate for ``gender``.

    'male' yields a male name and 'female' a female name; any other value
    (such as 'na') yields a name from Faker's generic name generator.
    """
    #Select the matching Faker method first, then call it once
    if gender == 'male':
        make_name = faker.name_male
    elif gender == 'female':
        make_name = faker.name_female
    else:
        make_name = faker.name
    return make_name()

#Build the 'name' column by calling name_gen on each user's gender, in row order
df['name'] = list(map(name_gen, df['gender']))

#Show the DataFrame with the 'name' column populated
df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,female,False,Amélie Lévêque,
1,84425986876768,male,False,Éric Renaud,
2,165332488109656,na,False,Eugène du Georges,
3,269172384654805,female,False,Diane Guyon,
4,84221574292360,female,True,Françoise Gomes du Muller,
...,...,...,...,...,...
95,4701392895017,female,False,Marie Couturier Le Renard,
96,155011044322127,female,True,Sabine Payet,
97,205794487629728,female,True,Christiane Sauvage de Morvan,
98,62823151358740,female,False,Alexandrie Boulanger,


**Generating Weighted User Ratings**

In [None]:
#Allowed rating values
ratings = [1, 2, 3, 4, 5]

#Relative weights, one per rating: the middle values (2, 3 and 4) are each
#twice as likely as the extremes (1 and 5)
rating_weights = (5, 10, 10, 10, 5)

#Draw one rating per user according to those weights
df['rating'] = random.choices(ratings, weights=rating_weights, k=num_users)

#Show the DataFrame with the 'rating' column populated
df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,female,False,Amélie Lévêque,1
1,84425986876768,male,False,Éric Renaud,4
2,165332488109656,na,False,Eugène du Georges,2
3,269172384654805,female,False,Diane Guyon,1
4,84221574292360,female,True,Françoise Gomes du Muller,4
...,...,...,...,...,...
95,4701392895017,female,False,Marie Couturier Le Renard,4
96,155011044322127,female,True,Sabine Payet,1
97,205794487629728,female,True,Christiane Sauvage de Morvan,2
98,62823151358740,female,False,Alexandrie Boulanger,2


**Saving the DataFrame to a CSV File and Reading It Back**

In [None]:
#Write the DataFrame to disk; the row index is saved as the first CSV column
csv_path = 'dataset1.csv'
df.to_csv(csv_path)

#Load the file back, using that first column as the index again
csv_df = pd.read_csv(csv_path, index_col=0)

#Display the data as read from the CSV
csv_df

Unnamed: 0,Id,gender,subscriber,name,rating
0,180202166345049,female,False,Amélie Lévêque,1
1,84425986876768,male,False,Éric Renaud,4
2,165332488109656,na,False,Eugène du Georges,2
3,269172384654805,female,False,Diane Guyon,1
4,84221574292360,female,True,Françoise Gomes du Muller,4
...,...,...,...,...,...
95,4701392895017,female,False,Marie Couturier Le Renard,4
96,155011044322127,female,True,Sabine Payet,1
97,205794487629728,female,True,Christiane Sauvage de Morvan,2
98,62823151358740,female,False,Alexandrie Boulanger,2
