# Import Pandas Library

In [1]:
import pandas as pd
import numpy as np

# Data Structures

## First: Series

In [2]:
lst = [1,2,3,4,5] 
s = pd.Series(lst, name="value")
s

0    1
1    2
2    3
3    4
4    5
Name: value, dtype: int64

In [3]:
# change dtype
s = pd.Series(lst, dtype= float)
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [4]:
# choose indexes
s = pd.Series(lst, index=['A', 'B', 'C', 'D', 'E' ])
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [5]:
index = [10, 12, 13, 14]
names = ['Nourah', 'Sarah', 'Ahmed', 'Lama']
s2 = pd.Series(names, index=index)
s2

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [6]:
# use dictionary 
names = {10: 'Nourah', 12: 'Sarah', 13: 'Ahmed', 14: 'Lama'}
s3 = pd.Series(names)
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [7]:
names.keys()

dict_keys([10, 12, 13, 14])

In [8]:
pd.Series(names.keys(), index=names.values())

Nourah    10
Sarah     12
Ahmed     13
Lama      14
dtype: int64

In [9]:
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [10]:
s3.values

array(['Nourah', 'Sarah', 'Ahmed', 'Lama'], dtype=object)

In [11]:
list(range(4))

[0, 1, 2, 3]

In [12]:
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [13]:
# change the indexes
s3.index = range(4)
s3[0:3]

0    Nourah
1     Sarah
2     Ahmed
dtype: object

In [14]:
# Selecting
print(s3[2]) # index = 2

Ahmed


In [15]:
# Slicing like numpy [start, end(execluded), gap]
# Note that the slice does not use the index labels as references, but the position
s3[:3] # from position 0 to 2

0    Nourah
1     Sarah
2     Ahmed
dtype: object

In [16]:
s3

0    Nourah
1     Sarah
2     Ahmed
3      Lama
dtype: object

In [17]:
s3[:-1] # from 0 to last item(execluded)

0    Nourah
1     Sarah
2     Ahmed
dtype: object

In [18]:
s3

0    Nourah
1     Sarah
2     Ahmed
3      Lama
dtype: object

In [19]:
s4 = pd.Series(['Naif','Dania'])
s3 = s3._append(s4)
s3

0    Nourah
1     Sarah
2     Ahmed
3      Lama
0      Naif
1     Dania
dtype: object

In [20]:
s3[0]

0    Nourah
0      Naif
dtype: object

In [21]:
s3.to_dict()

{0: 'Naif', 1: 'Dania', 2: 'Ahmed', 3: 'Lama'}

In [22]:
# Add elements 
s4 = pd.Series({50: 'Naif', 60: 'Dania'})
s3 = s3._append(s4)
s3

0     Nourah
1      Sarah
2      Ahmed
3       Lama
0       Naif
1      Dania
50      Naif
60     Dania
dtype: object

In [23]:
x = pd.Series(['a', 'b'])
y = pd.Series(['c', 'd'])

In [24]:
x

0    a
1    b
dtype: object

In [25]:
y

0    c
1    d
dtype: object

In [26]:
z = pd.concat([x, y])
z

0    a
1    b
0    c
1    d
dtype: object

In [27]:
z.reset_index(inplace=True, drop=True)
z

0    a
1    b
2    c
3    d
dtype: object

In [28]:
z = z.reset_index(drop=True)
z

0    a
1    b
2    c
3    d
dtype: object

In [29]:
z

0    a
1    b
2    c
3    d
dtype: object

In [30]:
z = pd.concat([x, y],ignore_index=True )
z

0    a
1    b
2    c
3    d
dtype: object

In [31]:
s3

0     Nourah
1      Sarah
2      Ahmed
3       Lama
0       Naif
1      Dania
50      Naif
60     Dania
dtype: object

In [32]:
s3[1:4]

1    Sarah
2    Ahmed
3     Lama
dtype: object

In [33]:
s3[1:4].index

Index([1, 2, 3], dtype='int64')

In [34]:
s3.drop(index=s3[1:4].index)

0     Nourah
0       Naif
50      Naif
60     Dania
dtype: object

In [35]:
s3

0     Nourah
1      Sarah
2      Ahmed
3       Lama
0       Naif
1      Dania
50      Naif
60     Dania
dtype: object

In [36]:
# delete an element 
s3 = s3.drop(s3[:1].index)

In [37]:
s3[:1]

1    Sarah
dtype: object

In [38]:
s3

1     Sarah
2     Ahmed
3      Lama
1     Dania
50     Naif
60    Dania
dtype: object

In [39]:
# drop duplicate elements
s3 = s3.drop_duplicates()
s3

1     Sarah
2     Ahmed
3      Lama
1     Dania
50     Naif
dtype: object

In [40]:
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [41]:
s4 = s.copy()
s4

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [42]:
s4 = s4+3
s4

A    4
B    5
C    6
D    7
E    8
dtype: int64

In [43]:
s4.add(s)

A     5
B     7
C     9
D    11
E    13
dtype: int64

In [44]:
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [45]:
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [46]:
s5 = pd.Series({'A': 6, 'B': 8})
s5

A    6
B    8
dtype: int64

In [47]:
s5 = s5.add(s, fill_value=0)

In [48]:
s5 # you have to save the result

A     7.0
B    10.0
C     3.0
D     4.0
E     5.0
dtype: float64

In [49]:
s4

A    4
B    5
C    6
D    7
E    8
dtype: int64

In [50]:
s4.sub(s)

A    3
B    3
C    3
D    3
E    3
dtype: int64

In [51]:
s4.mul(s)

A     4
B    10
C    18
D    28
E    40
dtype: int64

In [52]:
s4.div(s)

A    4.00
B    2.50
C    2.00
D    1.75
E    1.60
dtype: float64

## Second: DataFrame

### A- Creating a new DataFrame from the scratch

In [None]:
s4 = pd.Series({50: 'Naif', 60: 'Dania'})

In [55]:
data = {'SalesPerson': ['Kathey', 'Michael', 'William', 'Kathey', 'William', 'Kathey', 'Michael'],
        'Region': ['East', 'West', 'North', 'South', 'North', 'North', 'East'],
        'OrderAmount': [600, 700, 400, 500, 400, 700, 800],
        'Month': ['Jan', 'Feb', 'Feb', 'Mar', 'May', 'Apr', 'May'],
        'isAccepted': [True, False, False, True, True, True, False]
       }

SalesDF = pd.DataFrame(data, index= range(10, 17))
SalesDF   

Unnamed: 0,SalesPerson,Region,OrderAmount,Month,isAccepted
10,Kathey,East,600,Jan,True
11,Michael,West,700,Feb,False
12,William,North,400,Feb,False
13,Kathey,South,500,Mar,True
14,William,North,400,May,True
15,Kathey,North,700,Apr,True
16,Michael,East,800,May,False


In [59]:
SalesDF[['SalesPerson']]

Unnamed: 0,SalesPerson
10,Kathey
11,Michael
12,William
13,Kathey
14,William
15,Kathey
16,Michael


In [60]:
data = {'year': [2010, 2011, 2012, 2010, 2011, 2012, 2010, 2011, 2012],
        'team': ['FCBarcelona', 'FCBarcelona', 'FCBarcelona', 'RMadrid', 'RMadrid', 'RMadrid', 'ValenciaCF',
                 'ValenciaCF', 'ValenciaCF'],
        'wins':   [30, 28, 32, 29, 32, 26, 21, 17, 19],
        'draws':  [6, 7, 4, 5, 4, 7, 8, 10, 8],
        'losses': [2, 3, 2, 4, 2, 5, 9, 11, 11]}

football = pd.DataFrame(data)
football   

Unnamed: 0,year,team,wins,draws,losses
0,2010,FCBarcelona,30,6,2
1,2011,FCBarcelona,28,7,3
2,2012,FCBarcelona,32,4,2
3,2010,RMadrid,29,5,4
4,2011,RMadrid,32,4,2
5,2012,RMadrid,26,7,5
6,2010,ValenciaCF,21,8,9
7,2011,ValenciaCF,17,10,11
8,2012,ValenciaCF,19,8,11


### B- Reading tabular data

In [61]:
edu = pd.read_csv('Data/educ_figdp_1_Data.csv')
edu

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
379,2007,Finland,Total public expenditure on education as % of ...,5.90,
380,2008,Finland,Total public expenditure on education as % of ...,6.10,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [62]:
edu.shape

(384, 5)

In [63]:
edu.columns

Index(['TIME', 'GEO', 'INDIC_ED', 'Value', 'Flag and Footnotes'], dtype='object')

In [64]:
edu.size

1920

In [66]:
type(edu)

pandas.core.frame.DataFrame

In [65]:
edu.dtypes

TIME                   int64
GEO                   object
INDIC_ED              object
Value                 object
Flag and Footnotes    object
dtype: object

# Viewing Data

In [67]:
edu.head() #first rows that are listed

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [69]:
edu.head(300)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
295,2007,Netherlands,Total public expenditure on education as % of ...,5.32,
296,2008,Netherlands,Total public expenditure on education as % of ...,5.50,
297,2009,Netherlands,Total public expenditure on education as % of ...,5.95,
298,2010,Netherlands,Total public expenditure on education as % of ...,5.98,


In [70]:
edu.shape

(384, 5)

In [71]:
edu.tail() #last rows that are listed

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
379,2007,Finland,Total public expenditure on education as % of ...,5.9,
380,2008,Finland,Total public expenditure on education as % of ...,6.1,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,
383,2011,Finland,Total public expenditure on education as % of ...,6.76,


In [73]:
edu.columns = ['TIME', 'GEO_2', 'A', 'Value', 'Flag and Footnotes']

In [74]:
edu.head(2)

Unnamed: 0,TIME,GEO_2,A,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,


In [77]:
edu = edu.rename(columns={"GEO_2": "GEO"})

In [78]:
edu.head(2)

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,


In [80]:
edu.columns[2]

'A'

In [86]:
edu.index = range(5, 389)

In [88]:
edu

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
7,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
8,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
9,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
384,2007,Finland,Total public expenditure on education as % of ...,5.90,
385,2008,Finland,Total public expenditure on education as % of ...,6.10,
386,2009,Finland,Total public expenditure on education as % of ...,6.81,
387,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [87]:
edu.index

RangeIndex(start=5, stop=389, step=1)

In [91]:
edu.values # values of any DataFrame can be retrieved as a Python array by calling its values attribute.

array([[2000, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2001, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2002, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '5.00', 'e'],
       ...,
       [2009, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.81', nan],
       [2010, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.85', nan],
       [2011, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.76', nan]], dtype=object)

In [92]:
edu.head(2)

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,


In [93]:
edu.dtypes

TIME                   int64
GEO                   object
A                     object
Value                 object
Flag and Footnotes    object
dtype: object

In [94]:
edu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 5 to 388
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   TIME                384 non-null    int64 
 1   GEO                 384 non-null    object
 2   A                   384 non-null    object
 3   Value               384 non-null    object
 4   Flag and Footnotes  165 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.1+ KB


In [95]:
# quick statistical information
edu.describe()

Unnamed: 0,TIME
count,384.0
mean,2005.5
std,3.456556
min,2000.0
25%,2002.75
50%,2005.5
75%,2008.25
max,2011.0


In [96]:
edu.describe(include=[object])

Unnamed: 0,GEO,A,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [101]:
edu.describe(exclude="number")

Unnamed: 0,GEO,A,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [98]:
edu.T

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,379,380,381,382,383,384,385,386,387,388
TIME,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
GEO,European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),...,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland
A,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...
Value,:,:,5.00,5.03,4.95,4.92,4.91,4.92,5.04,5.38,...,6.22,6.43,6.42,6.30,6.18,5.90,6.10,6.81,6.85,6.76
Flag and Footnotes,,,e,e,e,e,e,e,e,e,...,,,,,,,,,,


# Selection

In [102]:
edu.head(2)

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,


In [103]:
edu['Value']

5         :
6         :
7      5.00
8      5.03
9      4.95
       ... 
384    5.90
385    6.10
386    6.81
387    6.85
388    6.76
Name: Value, Length: 384, dtype: object

In [104]:
type(edu['Value']) # The result will be a Series data structure, not a DataFrame, because only one column is retrieved.

pandas.core.series.Series

In [105]:
edu[['Value','GEO']]

Unnamed: 0,Value,GEO
5,:,European Union (28 countries)
6,:,European Union (28 countries)
7,5.00,European Union (28 countries)
8,5.03,European Union (28 countries)
9,4.95,European Union (28 countries)
...,...,...
384,5.90,Finland
385,6.10,Finland
386,6.81,Finland
387,6.85,Finland


In [106]:
edu[10:14] # select a subset of rows from a DataFrame

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
15,2010,European Union (28 countries),Total public expenditure on education as % of ...,5.41,e
16,2011,European Union (28 countries),Total public expenditure on education as % of ...,5.25,e
17,2000,European Union (27 countries),Total public expenditure on education as % of ...,4.91,s
18,2001,European Union (27 countries),Total public expenditure on education as % of ...,4.99,s


In [112]:
edu.head()

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
7,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
8,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
9,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [108]:
edu.iloc[ 0:3 , 0:2]

Unnamed: 0,TIME,GEO
5,2000,European Union (28 countries)
6,2001,European Union (28 countries)
7,2002,European Union (28 countries)


In [114]:
edu.loc[ [5, 21, 32] , ['Value','GEO']]  #[rows, columns]

Unnamed: 0,Value,GEO
5,:,European Union (28 countries)
21,4.95,European Union (27 countries)
32,5.06,European Union (25 countries)


In [115]:
edu.loc[90:94,:] #[rows, columns=all]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
90,2001,Belgium,Total public expenditure on education as % of ...,5.99,i
91,2002,Belgium,Total public expenditure on education as % of ...,6.09,d
92,2003,Belgium,Total public expenditure on education as % of ...,6.02,d
93,2004,Belgium,Total public expenditure on education as % of ...,5.95,d
94,2005,Belgium,Total public expenditure on education as % of ...,5.92,d


In [116]:
edu.head()

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
7,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
8,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
9,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [117]:
edu.sample(5, random_state=23) # random sample >> 23 seed for random number generator.
# seed makes the random numbers predictable

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
299,2006,Netherlands,Total public expenditure on education as % of ...,5.50,
348,2007,Romania,Total public expenditure on education as % of ...,4.25,
183,2010,Greece,Total public expenditure on education as % of ...,:,
78,2001,Euro area (13 countries),Total public expenditure on education as % of ...,4.97,s
289,2008,Malta,Total public expenditure on education as % of ...,5.72,i


# Filtering Data

In [125]:
edu.columns

Index(['TIME', 'GEO', 'A', 'Value', 'Flag and Footnotes'], dtype='object')

In [124]:
edu[edu['TIME'] == 2004]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
9,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
21,2004,European Union (27 countries),Total public expenditure on education as % of ...,4.95,e
33,2004,European Union (25 countries),Total public expenditure on education as % of ...,4.98,e
45,2004,Euro area (18 countries),Total public expenditure on education as % of ...,4.8,e
57,2004,Euro area (17 countries),Total public expenditure on education as % of ...,4.8,e
69,2004,Euro area (15 countries),Total public expenditure on education as % of ...,4.96,e
81,2004,Euro area (13 countries),Total public expenditure on education as % of ...,4.95,e
93,2004,Belgium,Total public expenditure on education as % of ...,5.95,d
105,2004,Bulgaria,Total public expenditure on education as % of ...,4.4,
117,2004,Czech Republic,Total public expenditure on education as % of ...,4.2,


In [120]:
edu['TIME'] == 2004

5      False
6      False
7      False
8      False
9       True
       ...  
384    False
385    False
386    False
387    False
388    False
Name: TIME, Length: 384, dtype: bool

In [126]:
edu[edu['TIME'] == 2011]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
16,2011,European Union (28 countries),Total public expenditure on education as % of ...,5.25,e
28,2011,European Union (27 countries),Total public expenditure on education as % of ...,5.25,e
40,2011,European Union (25 countries),Total public expenditure on education as % of ...,5.31,e
52,2011,Euro area (18 countries),Total public expenditure on education as % of ...,5.15,e
64,2011,Euro area (17 countries),Total public expenditure on education as % of ...,5.15,e
76,2011,Euro area (15 countries),Total public expenditure on education as % of ...,5.16,e
88,2011,Euro area (13 countries),Total public expenditure on education as % of ...,5.15,e
100,2011,Belgium,Total public expenditure on education as % of ...,6.55,d
112,2011,Bulgaria,Total public expenditure on education as % of ...,3.82,
124,2011,Czech Republic,Total public expenditure on education as % of ...,4.51,


In [127]:
# Another way of selection
# by applying Boolean indexing. This indexing is commonly known as a filter. 
edu[edu['Value'] > 6.5]

TypeError: '>' not supported between instances of 'str' and 'float'

# Filtering Missing and dupliacated Values

In [131]:
edu['TIME'] >=2007

5      False
6      False
7      False
8      False
9      False
       ...  
384     True
385     True
386     True
387     True
388     True
Name: TIME, Length: 384, dtype: bool

In [136]:
edu[(edu['Value'] != ':') & (edu['TIME'] >=2007)]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
12,2007,European Union (28 countries),Total public expenditure on education as % of ...,4.92,e
13,2008,European Union (28 countries),Total public expenditure on education as % of ...,5.04,e
14,2009,European Union (28 countries),Total public expenditure on education as % of ...,5.38,e
15,2010,European Union (28 countries),Total public expenditure on education as % of ...,5.41,e
16,2011,European Union (28 countries),Total public expenditure on education as % of ...,5.25,e
...,...,...,...,...,...
384,2007,Finland,Total public expenditure on education as % of ...,5.90,
385,2008,Finland,Total public expenditure on education as % of ...,6.10,
386,2009,Finland,Total public expenditure on education as % of ...,6.81,
387,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [132]:
(edu['Value'] == ':') & (edu['TIME'] >=2007)

5      False
6      False
7      False
8      False
9      False
       ...  
384    False
385    False
386    False
387    False
388    False
Length: 384, dtype: bool

In [None]:
edu[(~edu['Flag and Footnotes'].isnull()) & (edu['TIME'] == 2011) ]

In [140]:
edu[edu['Flag and Footnotes'].isnull()]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
41,2000,Euro area (18 countries),Total public expenditure on education as % of ...,:,
42,2001,Euro area (18 countries),Total public expenditure on education as % of ...,:,
53,2000,Euro area (17 countries),Total public expenditure on education as % of ...,:,
...,...,...,...,...,...
384,2007,Finland,Total public expenditure on education as % of ...,5.90,
385,2008,Finland,Total public expenditure on education as % of ...,6.10,
386,2009,Finland,Total public expenditure on education as % of ...,6.81,
387,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [142]:
edu.isnull().sum()

TIME                    0
GEO                     0
A                       0
Value                   0
Flag and Footnotes    219
dtype: int64

In [None]:
edu[edu['Value'].isnull()].head()

In [145]:
edu[edu.duplicated(['Value'])]

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
12,2007,European Union (28 countries),Total public expenditure on education as % of ...,4.92,e
17,2000,European Union (27 countries),Total public expenditure on education as % of ...,4.91,s
19,2002,European Union (27 countries),Total public expenditure on education as % of ...,5.00,e
20,2003,European Union (27 countries),Total public expenditure on education as % of ...,5.04,e
...,...,...,...,...,...
374,2009,Slovakia,Total public expenditure on education as % of ...,4.09,d
375,2010,Slovakia,Total public expenditure on education as % of ...,4.22,d
380,2003,Finland,Total public expenditure on education as % of ...,6.43,
383,2006,Finland,Total public expenditure on education as % of ...,6.18,


In [None]:
edu[edu.duplicated()]

In [None]:
edu.drop_duplicates('Value')

# Manipulating Data

In [146]:
edu.head()

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
7,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
8,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
9,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [None]:
edu['Value'].head()

In [148]:
edu['TIME_100'] = edu['TIME'] / 100 # you can apply it by one step

In [150]:
edu.head(2)

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes,TIME_100
5,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.0
6,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.01


In [None]:
def y(x):
    return x*2

In [151]:
y = lambda x: x*2
y(3)

6

In [156]:
edu['TIME'] = edu['TIME'].map(y)

In [157]:
edu.head(2)

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes,TIME_100
5,8000,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.0
6,8004,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.01


In [None]:
def z(t, v):
    if v == ':':
        return t*2
    else:
        return t

In [173]:
z = lambda t, v: t*2 if v==":" else t
z(2, ":")

4

In [175]:
edu['new'] = edu[['TIME', 'Value']].apply(lambda x:  x[0]*2 if x[1]==":" else x[0] , axis=1)

  edu['new'] = edu[['TIME', 'Value']].apply(lambda x:  x[0]*2 if x[1]==":" else x[0] , axis=1)


In [176]:
edu

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes,TIME_100,new
5,8000,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.00,16000
6,8004,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.01,16008
7,8008,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e,20.02,8008
8,8012,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e,20.03,8012
9,8016,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e,20.04,8016
...,...,...,...,...,...,...,...
384,8028,Finland,Total public expenditure on education as % of ...,5.90,,20.07,8028
385,8032,Finland,Total public expenditure on education as % of ...,6.10,,20.08,8032
386,8036,Finland,Total public expenditure on education as % of ...,6.81,,20.09,8036
387,8040,Finland,Total public expenditure on education as % of ...,6.85,,20.10,8040


In [178]:
# we can apply any function to a DataFrame or Series
edu['TIME'].apply(np.sqrt) # sqrt function from the numpy library

5      89.442719
6      89.465077
7      89.487429
8      89.509776
9      89.532117
         ...    
384    89.599107
385    89.621426
386    89.643739
387    89.666047
388    89.688349
Name: TIME, Length: 384, dtype: float64

In [None]:
def f2(x, r ):
    if x > 2002 and r == ":":
        return 1
    else:
        return 0

In [None]:
edu['new'] = edu[['TIME', 'Value']].apply(lambda c : f2(c[0],c[1]), axis=1)

In [None]:
edu.drop('TIME_1', axis=1, inplace=True)

In [None]:
edu.head(2)

In [None]:
def f2(x):
    return x**2
edu['Value'].apply(f2)

In [None]:
edu['Value'].apply(lambda d: d**2)

In [None]:
# add a new column to a DataFrame
edu['ValueNorm'] =

In [None]:
edu[['Value','TIME']].apply(lambda)

In [None]:
# add a new column to a DataFrame
edu['ValueNorm'] = edu['Value'] / edu['Value'].max()
edu.tail()

In [None]:
edu

In [None]:
# remove this column from the DataFrame
# rows(axis=0), columns(axis=1) 
# inplace = False (default), inplace=True (change original DataFrame)
edu.drop('ValueNorm', axis=1, inplace=True)
edu.head()

In [None]:
edu

In [None]:
# insert a new row
# ignore_index=True, otherwise the index 0
edu = edu._append({'TIME': 2000, 'Value': 5.00, 'GEOq': 'a'}, ignore_index=True)
edu.tail()

In [None]:
# remove row(axis=0)
# inplace = False (default), inplace=True (change original DataFrame)
edu.drop(max(edu.index), axis=0, inplace=True)
edu.tail()

In [None]:
# to clear data frame
edu.drop(edu.index, inplace=False)

# Sorting

In [179]:
edu

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes,TIME_100,new
5,8000,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.00,16000
6,8004,European Union (28 countries),Total public expenditure on education as % of ...,:,,20.01,16008
7,8008,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e,20.02,8008
8,8012,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e,20.03,8012
9,8016,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e,20.04,8016
...,...,...,...,...,...,...,...
384,8028,Finland,Total public expenditure on education as % of ...,5.90,,20.07,8028
385,8032,Finland,Total public expenditure on education as % of ...,6.10,,20.08,8032
386,8036,Finland,Total public expenditure on education as % of ...,6.81,,20.09,8036
387,8040,Finland,Total public expenditure on education as % of ...,6.85,,20.10,8040


In [183]:
edu.sort_values(by='TIME', ascending=True, inplace=True)
edu.tail()

Unnamed: 0,TIME,GEO,A,Value,Flag and Footnotes,TIME_100,new
112,8044,Bulgaria,Total public expenditure on education as % of ...,3.82,,20.11,8044
52,8044,Euro area (18 countries),Total public expenditure on education as % of ...,5.15,e,20.11,8044
148,8044,Germany (until 1990 former territory of the FRG),Total public expenditure on education as % of ...,4.98,,20.11,8044
64,8044,Euro area (17 countries),Total public expenditure on education as % of ...,5.15,e,20.11,8044
388,8044,Finland,Total public expenditure on education as % of ...,6.76,,20.11,8044


In [184]:
# to return to the original order, we can sort by an index using the sort_index and axis=0
edu.sort_index(axis=1, ascending=True, inplace=True)
edu.head()

Unnamed: 0,A,Flag and Footnotes,GEO,TIME,TIME_100,Value,new
5,Total public expenditure on education as % of ...,,European Union (28 countries),8000,20.0,:,16000
161,Total public expenditure on education as % of ...,,Ireland,8000,20.0,4.29,8000
377,Total public expenditure on education as % of ...,,Finland,8000,20.0,5.89,8000
245,Total public expenditure on education as % of ...,,Lithuania,8000,20.0,5.63,8000
101,Total public expenditure on education as % of ...,,Bulgaria,8000,20.0,3.88,8000


# Grouping Data

In [None]:
# By “group by” we are referring to a process involving one or more of the following steps:
# 1. Splitting the data into groups based on some criteria
# 2. Applying a function to each group independently
# 3. Combining the results into a data structure

In [185]:
edu

Unnamed: 0,A,Flag and Footnotes,GEO,TIME,TIME_100,Value,new
5,Total public expenditure on education as % of ...,,European Union (28 countries),8000,20.00,:,16000
161,Total public expenditure on education as % of ...,,Ireland,8000,20.00,4.29,8000
377,Total public expenditure on education as % of ...,,Finland,8000,20.00,5.89,8000
245,Total public expenditure on education as % of ...,,Lithuania,8000,20.00,5.63,8000
101,Total public expenditure on education as % of ...,,Bulgaria,8000,20.00,3.88,8000
...,...,...,...,...,...,...,...
112,Total public expenditure on education as % of ...,,Bulgaria,8044,20.11,3.82,8044
52,Total public expenditure on education as % of ...,e,Euro area (18 countries),8044,20.11,5.15,8044
148,Total public expenditure on education as % of ...,,Germany (until 1990 former territory of the FRG),8044,20.11,4.98,8044
64,Total public expenditure on education as % of ...,e,Euro area (17 countries),8044,20.11,5.15,8044


In [202]:
s = list(edu.groupby('GEO'))
s[30][1]

Unnamed: 0,A,Flag and Footnotes,GEO,TIME,TIME_100,Value,new
353,Total public expenditure on education as % of ...,,Slovenia,8000,20.0,:,16000
354,Total public expenditure on education as % of ...,,Slovenia,8004,20.01,5.86,8004
355,Total public expenditure on education as % of ...,,Slovenia,8008,20.02,5.76,8008
356,Total public expenditure on education as % of ...,,Slovenia,8012,20.03,5.80,8012
357,Total public expenditure on education as % of ...,,Slovenia,8016,20.04,5.74,8016
358,Total public expenditure on education as % of ...,,Slovenia,8020,20.05,5.73,8020
359,Total public expenditure on education as % of ...,,Slovenia,8024,20.06,5.72,8024
360,Total public expenditure on education as % of ...,,Slovenia,8028,20.07,5.15,8028
361,Total public expenditure on education as % of ...,,Slovenia,8032,20.08,5.20,8032
362,Total public expenditure on education as % of ...,,Slovenia,8036,20.09,5.69,8036


In [209]:
edu[['GEO','TIME', 'new', 'TIME_100']].groupby('GEO').mean()

Unnamed: 0_level_0,TIME,new,TIME_100
GEO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Austria,8022.0,8022.0,20.055
Belgium,8022.0,8688.666667,20.055
Bulgaria,8022.0,8022.0,20.055
Cyprus,8022.0,8022.0,20.055
Czech Republic,8022.0,8022.0,20.055
Denmark,8022.0,8022.0,20.055
Estonia,8022.0,8022.0,20.055
Euro area (13 countries),8022.0,8688.666667,20.055
Euro area (15 countries),8022.0,8688.666667,20.055
Euro area (17 countries),8022.0,9355.666667,20.055


In [None]:
# like group by in sql
group = edu[['GEO', 'Value']].groupby('GEO').mean()
group.head()

# Merging Data

In [None]:
np.random.randn(10, 4)

In [None]:
df = pd.DataFrame(np.random.randn(10, 4))

In [None]:
s = df[:3]
s

In [None]:
ss = df[3:7]
ss

In [None]:
df[7:]

In [None]:
pd.concat([s, ss], axis=1)

In [None]:
pd.merge(left, right, on="key")

# Resources
- Chapter 2, Introduction to Data Science by Laura Igual and Santi Seguí
    - https://github.com/DataScienceUB/introduction-datascience-python-book 
- pandas Documentation: https://pandas.pydata.org/