In [1]:
from faker import Faker
import pandas as pd
import numpy as np
import random

In [2]:
# Shared Faker instance; generate_sample_data draws its names, cities, dates,
# emails and free text from it.
fake = Faker()

def generate_sample_data(n_rows, seed=None):
    """Build a DataFrame of synthetic, employee-style records.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    seed : int, optional
        When given, the shared ``fake`` instance is re-seeded and the numeric
        columns are drawn from a private ``random.Random(seed)``, so repeated
        calls with the same seed draw the same pseudo-random sequence.
        When omitted (the default) nothing is seeded and the module-level
        ``random`` generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``Name``, ``Age``, ``Salary``,
        ``City``, ``Joining_Date``, ``Email`` and ``Description``.

    Notes
    -----
    Seeding mutates the state of the shared ``fake`` instance.
    ``Joining_Date`` comes from ``fake.date_this_decade()``, which presumably
    depends on the current date, so that column may still differ between
    runs made on different days -- TODO confirm.
    """
    if seed is None:
        # Unseeded: keep drawing from the global generator.
        rng = random
    else:
        fake.seed_instance(seed)
        # A private generator avoids disturbing the global `random` state.
        rng = random.Random(seed)

    data = {
        "Name": [fake.name() for _ in range(n_rows)],
        "Age": [rng.randint(18, 80) for _ in range(n_rows)],
        "Salary": [round(rng.uniform(30_000, 150_000), 2) for _ in range(n_rows)],
        "City": [fake.city() for _ in range(n_rows)],
        "Joining_Date": [fake.date_this_decade() for _ in range(n_rows)],
        "Email": [fake.email() for _ in range(n_rows)],
        "Description": [fake.text() for _ in range(n_rows)],
    }
    return pd.DataFrame(data)

In [56]:
# Build a 50-row synthetic dataset and preview its first rows.
df = generate_sample_data(n_rows=50)
df.head()
# Optional export, left disabled:
# df.to_csv('data.csv', index=False)

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description
0,Henry Miller,53,61833.45,Gilesshire,2020-02-04,robert14@example.net,Read leg happy forget degree above hundred.
1,Angela Estrada,25,128743.07,Anitaberg,2021-11-25,bmoore@example.com,Price story agency not by must. School also bl...
2,Tiffany Mendoza,73,149451.82,South Ericmouth,2020-04-09,savannahlee@example.com,Will very probably present. Again sure grow ni...
3,Linda Alexander,46,104631.86,Saramouth,2024-06-28,carneybrian@example.org,Situation floor only spring almost baby edge a...
4,Christina Mitchell,69,45145.86,Meganberg,2020-02-27,xspencer@example.com,Both collection money rule.\nStage military le...


In [61]:
# Rows whose salary is strictly above 100,000.
filtered_by_salary = df.loc[df["Salary"].gt(100_000)]
filtered_by_salary.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description
1,Angela Estrada,25,128743.07,Anitaberg,2021-11-25,bmoore@example.com,Price story agency not by must. School also bl...
2,Tiffany Mendoza,73,149451.82,South Ericmouth,2020-04-09,savannahlee@example.com,Will very probably present. Again sure grow ni...
3,Linda Alexander,46,104631.86,Saramouth,2024-06-28,carneybrian@example.org,Situation floor only spring almost baby edge a...
5,Karen Hurley,78,127454.56,Buchananhaven,2023-04-05,brandonproctor@example.net,Girl clear sell since play doctor current. Res...
7,Debbie Wolf,70,101022.7,Sanchezside,2023-12-06,nhernandez@example.net,North else our answer. Myself Republican draw ...


In [67]:
# Parse the joining dates into a datetime column so they can be compared as dates.
df["Date"] = pd.to_datetime(df["Joining_Date"])

# Keep only the rows dated strictly after 1 January 2023.
filtered_by_date = df.loc[df["Date"].gt(pd.Timestamp("2023-01-01"))]

filtered_by_date.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description,Date
3,Linda Alexander,46,104631.86,Saramouth,2024-06-28,carneybrian@example.org,Situation floor only spring almost baby edge a...,2024-06-28
5,Karen Hurley,78,127454.56,Buchananhaven,2023-04-05,brandonproctor@example.net,Girl clear sell since play doctor current. Res...,2023-04-05
6,Kirk Lopez,48,70513.3,Carterhaven,2023-12-06,ericgreen@example.org,Two sell right food. Wide analysis security me...,2023-12-06
7,Debbie Wolf,70,101022.7,Sanchezside,2023-12-06,nhernandez@example.net,North else our answer. Myself Republican draw ...,2023-12-06
9,Amanda Townsend,59,67186.28,Denisemouth,2023-10-25,lindamckinney@example.com,Open especially answer deal point attorney los...,2023-10-25


In [68]:
# Rows where the Description value is missing (NaN/None).
missing_description = df.loc[df["Description"].isna()]

missing_description.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description,Date


In [69]:
# Rows that have an Email value at all. This only checks for non-null entries;
# it does not validate the address format.
valid_email = df.loc[df["Email"].notna()]

valid_email.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description,Date
0,Henry Miller,53,61833.45,Gilesshire,2020-02-04,robert14@example.net,Read leg happy forget degree above hundred.,2020-02-04
1,Angela Estrada,25,128743.07,Anitaberg,2021-11-25,bmoore@example.com,Price story agency not by must. School also bl...,2021-11-25
2,Tiffany Mendoza,73,149451.82,South Ericmouth,2020-04-09,savannahlee@example.com,Will very probably present. Again sure grow ni...,2020-04-09
3,Linda Alexander,46,104631.86,Saramouth,2024-06-28,carneybrian@example.org,Situation floor only spring almost baby edge a...,2024-06-28
4,Christina Mitchell,69,45145.86,Meganberg,2020-02-27,xspencer@example.com,Both collection money rule.\nStage military le...,2020-02-27


In [70]:
# Filtering based on multiple conditions:
# salary strictly above 100,000 AND age strictly below 40.

high_salary_young = df.loc[df["Salary"].gt(100_000) & df["Age"].lt(40)]
high_salary_young.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description,Date
1,Angela Estrada,25,128743.07,Anitaberg,2021-11-25,bmoore@example.com,Price story agency not by must. School also bl...,2021-11-25
29,Carol Martin,32,133071.55,West Barbara,2021-11-18,blairbruce@example.net,Feeling not gas administration than technology...,2021-11-18
33,Kevin Dominguez,26,108779.96,Katherineville,2022-02-11,madisonharris@example.com,Go never stop. Local officer recent media cand...,2022-02-11
39,Kevin Harrison,27,133119.19,East Carolyn,2021-09-15,martindiane@example.com,Agency or meet long sea contain. Green public ...,2021-09-15
42,Larry Johnson,25,120445.35,East Donnaberg,2023-09-24,zachary61@example.net,Talk others activity piece only tell science.\...,2023-09-24


In [71]:
# Filter rows whose numeric value lies in a range (both bounds inclusive):
# here, ages from 25 to 40.

age_range = df.loc[(df["Age"] >= 25) & (df["Age"] <= 40)]

age_range.head()

Unnamed: 0,Name,Age,Salary,City,Joining_Date,Email,Description,Date
1,Angela Estrada,25,128743.07,Anitaberg,2021-11-25,bmoore@example.com,Price story agency not by must. School also bl...,2021-11-25
10,Mrs. Stephanie Camacho DVM,35,36686.41,Randallstad,2020-03-29,antoniosteele@example.com,Send tonight American kitchen three admit. Dif...,2020-03-29
15,Andrew Ryan,40,114884.58,North Keithberg,2024-09-17,diane72@example.org,For we cup leader. Project red one.\nLeft view...,2024-09-17
29,Carol Martin,32,133071.55,West Barbara,2021-11-18,blairbruce@example.net,Feeling not gas administration than technology...,2021-11-18
33,Kevin Dominguez,26,108779.96,Katherineville,2022-02-11,madisonharris@example.com,Go never stop. Local officer recent media cand...,2022-02-11


# Selecting some rows and some columns

In [75]:
# TODO

## Working with Missing Data in Pandas


In [77]:
# Small score table with one missing value per column, used to demonstrate
# NaN handling. Named `scores` rather than `dict` so the built-in `dict` type
# is not shadowed for the rest of the kernel session.
scores = {
    "First Score": [100, 90, np.nan, 95],
    "Second Score": [30, 45, 56, np.nan],
    "Third Score": [np.nan, 40, 80, 98],
}

# NOTE: this rebinds `df`, replacing the sample-data frame built earlier.
df = pd.DataFrame(scores)
# Boolean frame: True wherever a value is missing.
df.isnull()

Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [79]:
# Replace every missing value with 0 and preview the result.
df = df.fillna(value=0)
df.head()

Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0
