In [1]:
import pandas as pd

In [2]:
# Load the item table (columns: item_id, description, lot, price) from disk.
df = pd.read_csv(filepath_or_buffer="items.csv")

In [3]:
# Show the column labels of the loaded frame so the filters below can refer to them.
df.columns

Index(['item_id', 'description', 'lot', 'price'], dtype='object')

In [4]:
# Keep rows whose description contains both substrings "used" and "car",
# in any order and at any position.
df.loc[
    df['description'].str.contains('used')
    & df['description'].str.contains('car')
]

Unnamed: 0,item_id,description,lot,price
0,1001,used car 3 years old,A-124-X,24000
2,1003,5-year-old used car,A-039-Y,$18000
4,1005,car/10k mileage/used,B-120-Y,12k


In [5]:
# Keep rows whose description is longer than 15 characters.
# `.str.len()` is the vectorised equivalent of applying `len` row by row; a
# missing description yields NaN, which compares as False instead of raising.
df[df["description"].str.len() > 15]

Unnamed: 0,item_id,description,lot,price
0,1001,used car 3 years old,A-124-X,24000
2,1003,5-year-old used car,A-039-Y,$18000
4,1005,car/10k mileage/used,B-120-Y,12k


In [6]:
# Keep rows whose price string consists solely of numeric characters
# (so values such as "$18000" or "12k" are dropped).
# `.str.isnumeric()` already returns a boolean mask, so neither a per-row
# lambda nor a `== True` comparison is needed.
df[df["price"].str.isnumeric()]

Unnamed: 0,item_id,description,lot,price
0,1001,used car 3 years old,A-124-X,24000
1,1002,brand new car,B-102-X,36000
3,1004,used bicycle,A-200-Y,1200
5,1006,car,B-025-X,14000


In [7]:
# Keep rows whose description has fewer than one occurrence of "used",
# i.e. the substring does not appear at all.
df.loc[df["description"].str.count("used").lt(1)]

Unnamed: 0,item_id,description,lot,price
1,1002,brand new car,B-102-X,36000
5,1006,car,B-025-X,14000


In [8]:
# Load the second table used by the filtering examples below.
# NOTE(review): the file is called pricing.csv, but the cells below read
# Name / cgpa / country / code columns from it - confirm this is the intended file.
dff = pd.read_csv(filepath_or_buffer="pricing.csv")

In [9]:
# Comparison operators: keep rows whose cgpa is strictly greater than 3.00.
dff.loc[dff["cgpa"].gt(3.00)]

Unnamed: 0,Name,cgpa,country,code
0,Steve,3.3,usa,4
1,Stephen,4.0,uk,5
2,Farhana,3.9,bangladeshi,16
3,Farjana,3.7,bangladeshi,19
6,Sejuti,3.6,bangladeshi,11
10,Arya,3.2,winterfell,15


In [10]:
# Comparison operators also work on strings.
# Keep only the names that sort after 'Shakil' (lexicographic, case-sensitive
# string comparison).
dff.loc[dff["Name"].gt('Shakil')]

Unnamed: 0,Name,cgpa,country,code
0,Steve,3.3,usa,4
1,Stephen,4.0,uk,5


In [11]:
# Combining conditions with `&` (element-wise AND): both must hold for a row.
dff.loc[dff["cgpa"].gt(3.50) & dff["code"].gt(30)]

Unnamed: 0,Name,cgpa,country,code


In [12]:
# Combining conditions with `|` (element-wise OR): either one is enough.
dff.loc[dff["cgpa"].gt(3.50) | dff["code"].gt(30)]

Unnamed: 0,Name,cgpa,country,code
1,Stephen,4.0,uk,5
2,Farhana,3.9,bangladeshi,16
3,Farjana,3.7,bangladeshi,19
6,Sejuti,3.6,bangladeshi,11


In [13]:
# Membership test: keep rows whose Name is one of the listed values.
# Values that do not occur in the column are simply ignored.
names = ['Shakil', 'Mou', 'Steve']

dff.loc[dff["Name"].isin(names)]

Unnamed: 0,Name,cgpa,country,code
0,Steve,3.3,usa,4
4,Shakil,2.64,bangladeshi,22


In [14]:
# The `.str` accessor exposes string methods element-wise:
# keep rows whose Name begins with 'S'.
dff.loc[dff["Name"].str.startswith('S')]

Unnamed: 0,Name,cgpa,country,code
0,Steve,3.3,usa,4
1,Stephen,4.0,uk,5
4,Shakil,2.64,bangladeshi,22
6,Sejuti,3.6,bangladeshi,11


In [15]:
# Keep rows whose Name contains a lowercase 'j' anywhere
# (the match is case-sensitive by default).
dff.loc[dff["Name"].str.contains('j')]

Unnamed: 0,Name,cgpa,country,code
3,Farjana,3.7,bangladeshi,19
6,Sejuti,3.6,bangladeshi,11


In [16]:
# `~` (tilde) negates a boolean mask: keep the names that do NOT start with 'S'.
dff.loc[~dff["Name"].str.startswith('S')]

Unnamed: 0,Name,cgpa,country,code
2,Farhana,3.9,bangladeshi,16
3,Farjana,3.7,bangladeshi,19
5,Isabel,2.9,usa,22
7,John,2.5,winterfell,18
8,Awdrita,2.5,bangladeshi,6
9,Erin,2.62,uk,20
10,Arya,3.2,winterfell,15


In [17]:
# `DataFrame.query` takes the filter as an expression string that refers to
# columns by name: cgpa above 3.20 and country equal to "bangladeshi".
dff.query(expr='cgpa > 3.20 & country == "bangladeshi"')

Unnamed: 0,Name,cgpa,country,code
2,Farhana,3.9,bangladeshi,16
3,Farjana,3.7,bangladeshi,19
6,Sejuti,3.6,bangladeshi,11


In [18]:
# The two rows with the highest cgpa, in descending order.
dff.nlargest(n=2, columns='cgpa')

Unnamed: 0,Name,cgpa,country,code
1,Stephen,4.0,uk,5
2,Farhana,3.9,bangladeshi,16


In [19]:
# Group by

In [20]:
# Small country-level table for the group-by examples, written one record per
# row: (country, continent, population, area, population percentage of world).
df = pd.DataFrame(
    [
        ("United States", "North America", 332722557, 3796742, 4.18),
        ("Canada", "North America", 38711108, 3855100, 0.487),
        ("United Kingdom", "Europe", 67081234, 93628, 0.843),
        ("France", "Europe", 67853000, 247368, 0.853),
        ("Germany", "Europe", 83222442, 137882, 1.05),
        ("China", "Asia", 1412600000, 3705407, 17.8),
        ("Japan", "Asia", 125502000, 145937, 1.58),
        ("South Korea", "Asia", 51745000, 38690, 0.650),
    ],
    columns=[
        "country",
        "continent",
        "population",
        "area",
        "population percentage of world",
    ],
)

In [21]:
# Number of non-null entries in every remaining column, per continent.
df.groupby(by='continent').count()

Unnamed: 0_level_0,country,population,area,population percentage of world
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Asia,3,3,3,3
Europe,3,3,3,3
North America,2,2,2,2


In [22]:
# Per-continent summary with a different set of aggregations per column:
# how many countries, plus total / smallest / largest population.
agg = (
    df.groupby(by="continent")
    .agg(
        {
            "country": "count",
            "population": ["sum", "min", "max"],
        }
    )
)
agg

Unnamed: 0_level_0,country,population,population,population
Unnamed: 0_level_1,count,sum,min,max
continent,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Asia,3,1589847000,51745000,1412600000
Europe,3,218156676,67081234,83222442
North America,2,371433665,38711108,332722557


In [23]:
def process_continent(continent: pd.DataFrame, min_world_share: float = 3) -> pd.Series:
    """Summarise one continent's population and population density.

    Parameters
    ----------
    continent
        Rows belonging to a single continent. Must provide the columns
        "population", "area" and "population percentage of world".
    min_world_share
        Threshold on the summed "population percentage of world" column.
        The summary is only filled in when the sum is strictly greater than
        this value. Defaults to 3.

    Returns
    -------
    pd.Series
        float64 Series indexed by "population" and "population density"
        (total population divided by total area). Both entries are NaN when
        the threshold is not exceeded.
    """
    result = {}
    world_share = continent["population percentage of world"].sum()
    if world_share > min_world_share:
        total_population = continent["population"].sum()
        result["population"] = total_population
        result["population density"] = total_population / continent["area"].sum()
    # A fixed index keeps the output shape identical for every group; keys
    # missing from `result` come out as NaN.
    return pd.Series(result, index=["population", "population density"], dtype="float64")
# Summarise every continent with process_continent.
# The columns the function reads are selected explicitly so the grouping key
# is not part of the frames handed to it; applying to the whole frame raises a
# DeprecationWarning on pandas >= 2.2, while the result here is unchanged.
df_density = (
    df.groupby("continent")[["population", "area", "population percentage of world"]]
    .apply(process_continent)
)
df_density

Unnamed: 0_level_0,population,population density
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,1589847000.0,408.697456
Europe,,
North America,371433700.0,48.541732
