In [1]:
# Load Pandas using the name pd

import pandas as pd

# Load our dataset

In [2]:
berlin_names = pd.read_csv('./berlin_names.csv')

# Let's look at some of its properties

In [4]:
# First the dimensions: the number of rows (29078) and columns (6)
berlin_names.shape

(29078, 6)

In [6]:
# Let's see the name of the columns
berlin_names.columns

Index(['Unnamed: 0', 'vorname', 'anzahl', 'geschlecht', 'bezirke',
       'name length'],
      dtype='object')

In [7]:
# The 'Unnamed: 0' column is not interesting for us (it only repeats the row numbers). We can drop it.
# errors='ignore' keeps this cell safe to re-run: without it, a second run raises KeyError
# because the column has already been removed.
berlin_names = berlin_names.drop('Unnamed: 0', axis=1, errors='ignore')

# Notice that you need to assign the new dataframe back to the variable 'berlin_names'; otherwise berlin_names
# would still have the undesired column.

In [8]:
# Look at the first 5 rows
berlin_names.head()

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
0,Charlotte,64,w,friedrichshain-kreuzberg,9
1,Marie,51,w,friedrichshain-kreuzberg,5
2,Sophie,50,w,friedrichshain-kreuzberg,6
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4


In [9]:
# Or to the first N rows, passing a numeric argument to head method
N = 11
berlin_names.head(N)

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
0,Charlotte,64,w,friedrichshain-kreuzberg,9
1,Marie,51,w,friedrichshain-kreuzberg,5
2,Sophie,50,w,friedrichshain-kreuzberg,6
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4
5,Maria,42,w,friedrichshain-kreuzberg,5
6,Emil,41,m,friedrichshain-kreuzberg,4
7,Ali,40,m,friedrichshain-kreuzberg,3
8,Alexander,39,m,friedrichshain-kreuzberg,9
9,Anna,39,w,friedrichshain-kreuzberg,4


In [10]:
# You can look at the last 5 rows of your dataset too with the method tail:
berlin_names.tail()

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
29073,Öykü,1,w,neukoelln,4
29074,Özlem,1,w,neukoelln,5
29075,Đeni,1,w,neukoelln,4
29076,Şeyma,1,w,neukoelln,5
29077,Ștefania,1,w,neukoelln,8


In [13]:
# The types of our columns. pandas uses 'object' for strings (textual data), int64 for integers (1, 2, 3...),
# and float64 for real numbers (3.1415...)
berlin_names.dtypes

vorname        object
anzahl          int64
geschlecht     object
bezirke        object
name length     int64
dtype: object

In [15]:
# This shows the index of our data. Our data is indexed by line number, in this case a RangeIndex pandas object
berlin_names.index

RangeIndex(start=0, stop=29078, step=1)

# Some basic statistics

In [11]:
berlin_names.describe()

Unnamed: 0,anzahl,name length
count,29078.0,29078.0
mean,2.390467,5.857556
std,4.516186,1.937816
min,1.0,1.0
25%,1.0,5.0
50%,1.0,5.0
75%,2.0,7.0
max,112.0,19.0


**anzahl** is the number of persons with a given name. The mean of 2.39 means that, on average, each row (a name within one neighborhood) is shared by a little more than two persons living in Berlin.

On average, the **name length** is about 5.86 letters.

The maximum number of persons with the same name in a single neighborhood is 112, and the longest name has 19 letters.

# Accessing our data

In [17]:
# To access a single row, just pass its row number to iloc
berlin_names.iloc[0]

vorname                       Charlotte
anzahl                               64
geschlecht                            w
bezirke        friedrichshain-kreuzberg
name length                           9
Name: 0, dtype: object

In [18]:
# If you want to see all the content in a column, just pass the column name like:
berlin_names['vorname']

0               Charlotte
1                   Marie
2                  Sophie
3                   Anton
4                    Emma
5                   Maria
6                    Emil
7                     Ali
8               Alexander
9                    Anna
10                   Leon
11                    Mia
12                  Elias
13                   Karl
14                   Paul
15                  Oskar
16                  Luise
17                  Felix
18                   Ella
19                   Noah
20                Johanna
21                  Henry
22                  Jonas
23                  Paula
24               Jonathan
25             Maximilian
26                 Emilia
27                   Mira
28                  Jakob
29               Valentin
               ...       
29048               Çinar
29049              Yvaine
29050          Ömer-İlkan
29051              Zaynab
29052                Đình
29053    Zeinab-Seraphine
29054             Živorad
29055       

In [19]:
# You can access more than one row at the same time by giving a range like 0:10, meaning 'from row 0 up to, but not including, row 10':
berlin_names[0:10]

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
0,Charlotte,64,w,friedrichshain-kreuzberg,9
1,Marie,51,w,friedrichshain-kreuzberg,5
2,Sophie,50,w,friedrichshain-kreuzberg,6
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4
5,Maria,42,w,friedrichshain-kreuzberg,5
6,Emil,41,m,friedrichshain-kreuzberg,4
7,Ali,40,m,friedrichshain-kreuzberg,3
8,Alexander,39,m,friedrichshain-kreuzberg,9
9,Anna,39,w,friedrichshain-kreuzberg,4


# Filtering your data

In [22]:
# This returns a new pandas object (a Series) holding True or False for each row, depending on
# whether the row has or doesn't have the property that you are testing. In the case below, rows with
# 'anzahl' bigger than 40:
berlin_names['anzahl'] > 40

0         True
1         True
2         True
3         True
4         True
5         True
6         True
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
29048    False
29049    False
29050    False
29051    False
29052    False
29053    False
29054    False
29055    False
29056    False
29057    False
29058    False
29059    False
29060    False
29061    False
29062    False
29063    False
29064    False
29065    False
29066    False
29067    False
29068    False
29069    False
29070    False
29071    False
29072    False
29073    False
29074    False
29075    False
29076    False
29077    False
Name: anzahl, dtype: bool

In [23]:
# You can save this series and...
more_than_40 = berlin_names['anzahl'] > 40

In [24]:
# ...use it like an index in your DataFrame
berlin_names[more_than_40]

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
0,Charlotte,64,w,friedrichshain-kreuzberg,9
1,Marie,51,w,friedrichshain-kreuzberg,5
2,Sophie,50,w,friedrichshain-kreuzberg,6
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4
5,Maria,42,w,friedrichshain-kreuzberg,5
6,Emil,41,m,friedrichshain-kreuzberg,4
3520,Marie,94,w,tempelhof-schoeneberg,5
3521,Sophie,89,w,tempelhof-schoeneberg,6
3522,Maria,82,w,tempelhof-schoeneberg,5


In [26]:
# You could have filtered directly inside the []:
berlin_names[berlin_names['anzahl'] > 40]

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
0,Charlotte,64,w,friedrichshain-kreuzberg,9
1,Marie,51,w,friedrichshain-kreuzberg,5
2,Sophie,50,w,friedrichshain-kreuzberg,6
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4
5,Maria,42,w,friedrichshain-kreuzberg,5
6,Emil,41,m,friedrichshain-kreuzberg,4
3520,Marie,94,w,tempelhof-schoeneberg,5
3521,Sophie,89,w,tempelhof-schoeneberg,6
3522,Maria,82,w,tempelhof-schoeneberg,5


In [28]:
# You can apply more than one filter with &. Don't forget the parentheses around each condition!

berlin_names[(berlin_names['anzahl'] > 40) & (berlin_names['anzahl'] < 50)]

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
3,Anton,42,m,friedrichshain-kreuzberg,5
4,Emma,42,w,friedrichshain-kreuzberg,4
5,Maria,42,w,friedrichshain-kreuzberg,5
6,Emil,41,m,friedrichshain-kreuzberg,4
3529,Elias,49,m,tempelhof-schoeneberg,5
3530,Emil,49,m,tempelhof-schoeneberg,4
3531,Maximilian,49,m,tempelhof-schoeneberg,10
3532,Oskar,46,m,tempelhof-schoeneberg,5
3533,Felix,44,m,tempelhof-schoeneberg,5
3534,Noah,44,m,tempelhof-schoeneberg,4


# Counting values

With the method value_counts, pandas can count how many different names we have in each neighborhood:

In [27]:
berlin_names['bezirke'].value_counts()

mitte                         3850
tempelhof-schoeneberg         3636
charlottenburg-wilmersdorf    3550
friedrichshain-kreuzberg      3520
neukoelln                     2750
pankow                        2555
spandau                       2553
lichtenberg                   2187
reinickendorf                 1221
treptow-koepenick             1128
steglitz-zehlendorf           1089
marzahn-hellersdorf           1039
Name: bezirke, dtype: int64

# Creating a function for common dataframe operations

In [29]:
# We can create a function to reuse the query above, so that you can use it with a different number of persons:
def more_than(number_persons, names=None):
    """Return the rows whose 'anzahl' is strictly greater than ``number_persons``.

    Parameters
    ----------
    number_persons : int or float
        Threshold; only rows with ``anzahl > number_persons`` are kept.
    names : pandas.DataFrame, optional
        Dataframe with an 'anzahl' column. When omitted, the notebook-level
        ``berlin_names`` dataframe is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels.
    """
    if names is None:
        # Looked up at call time so the function sees the current dataframe.
        names = berlin_names
    # Strict '>' so the result matches the function name and the '> 40' query it reuses.
    return names[names['anzahl'] > number_persons]

In [32]:
# Names shared by more than 80 persons
more_than(80)

Unnamed: 0,vorname,anzahl,geschlecht,bezirke,name length
3520,Marie,94,w,tempelhof-schoeneberg,5
3521,Sophie,89,w,tempelhof-schoeneberg,6
3522,Maria,82,w,tempelhof-schoeneberg,5
15746,Marie,112,w,pankow,5
15747,Sophie,84,w,pankow,6
15748,Charlotte,81,w,pankow,9
18301,Marie,111,w,charlottenburg-wilmersdorf,5
18302,Sophie,105,w,charlottenburg-wilmersdorf,6


# Homework

1. How many persons with your name exist in Berlin?
2. Which neighborhood has the most different names?
3. How many persons are there in total in our dataset?
4. What is the most common name in each neighborhood?
5. How many men and women are there in our dataset?