# Import Pandas Library

In [1]:
import pandas as pd
import numpy as np

# Data Structures

## First: Series

In [10]:
lst = [1,2,3,4,5] 
s = pd.Series(lst, name="value")
s

0    1
1    2
2    3
3    4
4    5
Name: value, dtype: int64

In [11]:
# change dtype
s = pd.Series(lst, dtype= float)
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [12]:
# choose indexes
s = pd.Series(lst, index=['A', 'B', 'C', 'D', 'E' ])
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [13]:
index = [10, 12, 13, 14]
names = ['Nourah', 'Sarah', 'Ahmed', 'Lama']
s2 = pd.Series(names, index=index)
s2

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [36]:
# use dictionary 
names = {10: 'Nourah', 12: 'Sarah', 13: 'Ahmed', 14: 'Lama'}
s3 = pd.Series(names)
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [22]:
names.keys()

dict_keys([10, 12, 13, 14])

In [37]:
# change the indexes
s3.index = [10, 20, 30, 40]
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
dtype: object

In [38]:
fruit1 = {'Apple': 40 , 'Banana': 50, 'Orange': 60}
ser1 = pd.Series(fruit1)

fruit2 = {'Apple': 30 , 'Strawberry': 20, 'Orange': 20}
ser2 = pd.Series(fruit2)


In [39]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
dtype: object

In [40]:
# Selecting
print(s3[20]) # index = 20

Sarah


In [41]:
# Slicing like numpy [start, end(execluded), gap]
# Note that the slice does not use the index labels as references, but the position
s3[:3] # from position 0 to 2

10    Nourah
20     Sarah
30     Ahmed
dtype: object

In [42]:
s3[:-1] # from 0 to last item(execluded)

10    Nourah
20     Sarah
30     Ahmed
dtype: object

In [43]:
# Add elements 
s4 = pd.Series({50: 'Ahmed', 60: 'Nada'})
s3 = s3._append(s4)
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [50]:
x = pd.Series(['a', 'b'])
y = pd.Series(['c', 'd'])
z = pd.concat([x, y])
z

0    a
1    b
0    c
1    d
dtype: object

In [56]:
z = pd.concat([x, y],ignore_index=True )
z

0    a
1    b
2    c
3    d
dtype: object

In [71]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [74]:
# delete an element 
# s3.drop(60)

x = s3.drop(s3.index[0: 1])
x

20    Sarah
30    Ahmed
40     Lama
50    Ahmed
60     Nada
dtype: object

In [72]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [77]:
# drop duplicate elements
s3 = s3.drop_duplicates()

In [78]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
60      Nada
dtype: object

In [79]:
s4 = s.copy()
s4

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [80]:
s4 = s4*3
s4

A     3
B     6
C     9
D    12
E    15
dtype: int64

In [82]:
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [83]:
s4.add(s)

A     4
B     8
C    12
D    16
E    20
dtype: int64

In [94]:
s5 = pd.Series({'A': 6, 'B': 8})
s5 = s5.add(s)

In [95]:
s5 # you have to save the result

A     7.0
B    10.0
C     NaN
D     NaN
E     NaN
dtype: float64

In [None]:
s4.sub(s)

In [None]:
s4.mul(s)

In [None]:
s4.div(s)

## Second: DataFrame

### A- Creating a new DataFrame from the scratch

In [102]:
data = {'SalesPerson': ['Kathey', 'Michael', 'William', 'Kathey', 'William', 'Kathey', 'Michael'],
        'Region': ['East', 'West', 'North', 'South', 'North', 'North', 'East'],
        'OrderAmount': [600, 700, 400, 500, 400, 700, 800],
        'Month': ['Jan', 'Feb', 'Feb', 'Mar', 'May', 'Apr', 'May'],
        'isAccepted': [True, False, False, True, True, True, False]
       }

SalesDF = pd.DataFrame(data)
SalesDF[['Region', 'SalesPerson']]

Unnamed: 0,Region,SalesPerson
0,East,Kathey
1,West,Michael
2,North,William
3,South,Kathey
4,North,William
5,North,Kathey
6,East,Michael


In [103]:
data = {'year': [2010, 2011, 2012, 2010, 2011, 2012, 2010, 2011, 2012],
        'team': ['FCBarcelona', 'FCBarcelona', 'FCBarcelona', 'RMadrid', 'RMadrid', 'RMadrid', 'ValenciaCF',
                 'ValenciaCF', 'ValenciaCF'],
        'wins':   [30, 28, 32, 29, 32, 26, 21, 17, 19],
        'draws':  [6, 7, 4, 5, 4, 7, 8, 10, 8],
        'losses': [2, 3, 2, 4, 2, 5, 9, 11, 11]}

football = pd.DataFrame(data)
football   

Unnamed: 0,year,team,wins,draws,losses
0,2010,FCBarcelona,30,6,2
1,2011,FCBarcelona,28,7,3
2,2012,FCBarcelona,32,4,2
3,2010,RMadrid,29,5,4
4,2011,RMadrid,32,4,2
5,2012,RMadrid,26,7,5
6,2010,ValenciaCF,21,8,9
7,2011,ValenciaCF,17,10,11
8,2012,ValenciaCF,19,8,11


### B- Reading tabular data

In [104]:
edu = pd.read_csv('educ_figdp_1_Data.csv')
edu

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
379,2007,Finland,Total public expenditure on education as % of ...,5.90,
380,2008,Finland,Total public expenditure on education as % of ...,6.10,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [106]:
edu.dtypes

TIME                   int64
GEO                   object
INDIC_ED              object
Value                 object
Flag and Footnotes    object
dtype: object

# Viewing Data

In [107]:
edu.head() #first rows that are listed

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [108]:
edu.head(3)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e


In [117]:
edu.shape

(384, 5)

In [118]:
edu.tail() #last rows that are listed

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
379,2007,Finland,Total public expenditure on education as % of ...,5.9,
380,2008,Finland,Total public expenditure on education as % of ...,6.1,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,
383,2011,Finland,Total public expenditure on education as % of ...,6.76,


In [119]:
edu.columns

Index(['TIME', 'GEO', 'INDIC_ED', 'Value', 'Flag and Footnotes'], dtype='object')

In [199]:
edu.columns

Index(['TIME', 'GEO', 'INDIC_ED', 'Value', 'Flag and Footnotes'], dtype='object')

In [113]:
edu.columns[0]

'TIME'

In [122]:
edu.index

RangeIndex(start=0, stop=384, step=1)

In [115]:
edu.values # values of any DataFrame can be retrieved as a Python array by calling its values attribute.

array([[2000, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2001, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2002, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '5.00', 'e'],
       ...,
       [2009, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.81', nan],
       [2010, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.85', nan],
       [2011, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.76', nan]], dtype=object)

In [124]:
# quick statistical information
edu.describe()

Unnamed: 0,TIME
count,384.0
mean,2005.5
std,3.456556
min,2000.0
25%,2002.75
50%,2005.5
75%,2008.25
max,2011.0


In [125]:
edu.describe(include=[object])

Unnamed: 0,GEO,INDIC_ED,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [126]:
edu.describe(exclude="number")

Unnamed: 0,GEO,INDIC_ED,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [127]:
edu.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
TIME,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
GEO,European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),...,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland
INDIC_ED,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...
Value,:,:,5.00,5.03,4.95,4.92,4.91,4.92,5.04,5.38,...,6.22,6.43,6.42,6.30,6.18,5.90,6.10,6.81,6.85,6.76
Flag and Footnotes,,,e,e,e,e,e,e,e,e,...,,,,,,,,,,


# Selection

In [128]:
edu['Value'] # The result will be a Series data structure, not a DataFrame, because only one column is retrieved.

0         :
1         :
2      5.00
3      5.03
4      4.95
       ... 
379    5.90
380    6.10
381    6.81
382    6.85
383    6.76
Name: Value, Length: 384, dtype: object

In [129]:
edu[['Value','GEO']]

Unnamed: 0,Value,GEO
0,:,European Union (28 countries)
1,:,European Union (28 countries)
2,5.00,European Union (28 countries)
3,5.03,European Union (28 countries)
4,4.95,European Union (28 countries)
...,...,...
379,5.90,Finland
380,6.10,Finland
381,6.81,Finland
382,6.85,Finland


In [130]:
edu[10:14] # select a subset of rows from a DataFrame

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
10,2010,European Union (28 countries),Total public expenditure on education as % of ...,5.41,e
11,2011,European Union (28 countries),Total public expenditure on education as % of ...,5.25,e
12,2000,European Union (27 countries),Total public expenditure on education as % of ...,4.91,s
13,2001,European Union (27 countries),Total public expenditure on education as % of ...,4.99,s


In [134]:
edu.loc[90:94, ['TIME', 'GEO']]  #[rows, columns]

Unnamed: 0,TIME,GEO
90,2006,Belgium
91,2007,Belgium
92,2008,Belgium
93,2009,Belgium
94,2010,Belgium


In [132]:
edu.loc[90:94,:] #[rows, columns=all]

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
90,2006,Belgium,Total public expenditure on education as % of ...,5.98,d
91,2007,Belgium,Total public expenditure on education as % of ...,6.0,d
92,2008,Belgium,Total public expenditure on education as % of ...,6.43,d
93,2009,Belgium,Total public expenditure on education as % of ...,6.57,d
94,2010,Belgium,Total public expenditure on education as % of ...,6.58,d


In [138]:
edu.sample(10,random_state=23) # random sample >> 23 seed for random number generator.
# seed makes the random numbers predictable

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
294,2006,Netherlands,Total public expenditure on education as % of ...,5.50,
343,2007,Romania,Total public expenditure on education as % of ...,4.25,
178,2010,Greece,Total public expenditure on education as % of ...,:,
73,2001,Euro area (13 countries),Total public expenditure on education as % of ...,4.97,s
284,2008,Malta,Total public expenditure on education as % of ...,5.72,i
193,2001,France,Total public expenditure on education as % of ...,5.95,
236,2008,Latvia,Total public expenditure on education as % of ...,5.71,
205,2001,Italy,Total public expenditure on education as % of ...,4.83,
59,2011,Euro area (17 countries),Total public expenditure on education as % of ...,5.15,e
252,2000,Luxembourg,Total public expenditure on education as % of ...,:,


# Filtering Data

In [148]:
edu['TIME'] == 2004

0      False
1      False
2      False
3      False
4       True
       ...  
379    False
380    False
381    False
382    False
383    False
Name: TIME, Length: 384, dtype: bool

In [149]:
# Another way of selection
# by applying Boolean indexing. This indexing is commonly known as a filter. 
edu[edu['TIME'] == 2004]

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
16,2004,European Union (27 countries),Total public expenditure on education as % of ...,4.95,e
28,2004,European Union (25 countries),Total public expenditure on education as % of ...,4.98,e
40,2004,Euro area (18 countries),Total public expenditure on education as % of ...,4.8,e
52,2004,Euro area (17 countries),Total public expenditure on education as % of ...,4.8,e
64,2004,Euro area (15 countries),Total public expenditure on education as % of ...,4.96,e
76,2004,Euro area (13 countries),Total public expenditure on education as % of ...,4.95,e
88,2004,Belgium,Total public expenditure on education as % of ...,5.95,d
100,2004,Bulgaria,Total public expenditure on education as % of ...,4.4,
112,2004,Czech Republic,Total public expenditure on education as % of ...,4.2,


# Filtering Missing and dupliacated Values

In [153]:
edu['Value'].isnull()#.sum()

0      False
1      False
2      False
3      False
4      False
       ...  
379    False
380    False
381    False
382    False
383    False
Name: Value, Length: 384, dtype: bool

In [154]:
edu[edu['Value'].isnull()].head()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes


In [155]:
edu[edu.duplicated('Value')]

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
7,2007,European Union (28 countries),Total public expenditure on education as % of ...,4.92,e
12,2000,European Union (27 countries),Total public expenditure on education as % of ...,4.91,s
14,2002,European Union (27 countries),Total public expenditure on education as % of ...,5.00,e
15,2003,European Union (27 countries),Total public expenditure on education as % of ...,5.04,e
...,...,...,...,...,...
369,2009,Slovakia,Total public expenditure on education as % of ...,4.09,d
370,2010,Slovakia,Total public expenditure on education as % of ...,4.22,d
375,2003,Finland,Total public expenditure on education as % of ...,6.43,
378,2006,Finland,Total public expenditure on education as % of ...,6.18,


In [156]:
edu.drop_duplicates('Value')

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
5,2005,European Union (28 countries),Total public expenditure on education as % of ...,4.92,e
...,...,...,...,...,...
377,2005,Finland,Total public expenditure on education as % of ...,6.30,
380,2008,Finland,Total public expenditure on education as % of ...,6.10,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,


# Manipulating Data

In [157]:
edu.head()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [158]:
edu['Value'].head()

0       :
1       :
2    5.00
3    5.03
4    4.95
Name: Value, dtype: object

In [163]:
edu['TIME'] / 1 # you can apply it by one step

0      2000.0
1      2001.0
2      2002.0
3      2003.0
4      2004.0
        ...  
379    2007.0
380    2008.0
381    2009.0
382    2010.0
383    2011.0
Name: TIME, Length: 384, dtype: float64

In [165]:
# we can apply any function to a DataFrame or Series
edu["TIME"].apply(np.sqrt) # sqrt function from the numpy library

0      44.721360
1      44.732538
2      44.743715
3      44.754888
4      44.766059
         ...    
379    44.799554
380    44.810713
381    44.821870
382    44.833024
383    44.844175
Name: TIME, Length: 384, dtype: float64

In [166]:
edu['TIME'].map(np.sqrt)

0      44.721360
1      44.732538
2      44.743715
3      44.754888
4      44.766059
         ...    
379    44.799554
380    44.810713
381    44.821870
382    44.833024
383    44.844175
Name: TIME, Length: 384, dtype: float64

In [168]:
def f2(x):
    return x**2
edu['TIME'].apply(f2)

0      4000000
1      4004001
2      4008004
3      4012009
4      4016016
        ...   
379    4028049
380    4032064
381    4036081
382    4040100
383    4044121
Name: TIME, Length: 384, dtype: int64

In [188]:
edu['TIME'].apply(lambda d: d**2)

0      4000000
1      4004001
2      4008004
3      4012009
4      4016016
        ...   
379    4028049
380    4032064
381    4036081
382    4040100
383    4044121
Name: TIME, Length: 384, dtype: int64

In [None]:
# add a new column to a DataFrame
edu['ValueNorm'] =

In [201]:
edu[['Value','TIME']].apply(lambda)

SyntaxError: invalid syntax (2914487041.py, line 1)

In [200]:
# add a new column to a DataFrame
edu['TimeNorm'] = edu['TIME'] / edu['TIME'].max()
edu.tail()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes,TimeNorm
378,2006,Finland,Total public expenditure on education as % of ...,6.18,,0.997514
379,2007,Finland,Total public expenditure on education as % of ...,5.9,,0.998011
380,2008,Finland,Total public expenditure on education as % of ...,6.1,,0.998508
381,2009,Finland,Total public expenditure on education as % of ...,6.81,,0.999005
382,2010,Finland,Total public expenditure on education as % of ...,6.85,,0.999503


In [202]:
edu

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes,TimeNorm
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,,0.994530
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,,0.995027
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e,0.995525
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e,0.996022
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e,0.996519
...,...,...,...,...,...,...
378,2006,Finland,Total public expenditure on education as % of ...,6.18,,0.997514
379,2007,Finland,Total public expenditure on education as % of ...,5.90,,0.998011
380,2008,Finland,Total public expenditure on education as % of ...,6.10,,0.998508
381,2009,Finland,Total public expenditure on education as % of ...,6.81,,0.999005


In [203]:
# remove this column from the DataFrame
# rows(axis=0), columns(axis=1) 
# inplace = False (default), inplace=True (change original DataFrame)
edu.drop('TimeNorm', axis=1, inplace=True)
edu.head()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [204]:
edu

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
378,2006,Finland,Total public expenditure on education as % of ...,6.18,
379,2007,Finland,Total public expenditure on education as % of ...,5.90,
380,2008,Finland,Total public expenditure on education as % of ...,6.10,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,


In [205]:
# insert a new row
# ignore_index=True, otherwise the index 0
edu = edu.append({'TIME': 2000, 'Value': 5.00, 'GEO': 'a'}, ignore_index=True)
edu.tail()

  edu = edu.append({'TIME': 2000, 'Value': 5.00, 'GEO': 'a'}, ignore_index=True)


Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
379,2007,Finland,Total public expenditure on education as % of ...,5.9,
380,2008,Finland,Total public expenditure on education as % of ...,6.1,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,
383,2000,a,,5.0,


In [206]:
# remove row(axis=0)
# inplace = False (default), inplace=True (change original DataFrame)
edu.drop(max(edu.index), axis=0, inplace=True)
edu.tail()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
378,2006,Finland,Total public expenditure on education as % of ...,6.18,
379,2007,Finland,Total public expenditure on education as % of ...,5.9,
380,2008,Finland,Total public expenditure on education as % of ...,6.1,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [207]:
# to clear data frame
edu.drop(edu.index, inplace=False)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes


# Sorting

In [211]:
edu.sort_values(by='TIME', ascending=False, inplace=True)
edu.head()

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
323,2011,Poland,Total public expenditure on education as % of ...,4.94,
95,2011,Belgium,Total public expenditure on education as % of ...,6.55,d
47,2011,Euro area (18 countries),Total public expenditure on education as % of ...,5.15,e
275,2011,Hungary,Total public expenditure on education as % of ...,4.71,
71,2011,Euro area (15 countries),Total public expenditure on education as % of ...,5.16,e


In [218]:
# to return to the original order, we can sort by an index using the sort_index and axis=0
edu.sort_index(axis=0, ascending=True, inplace=True)
edu.head()

Unnamed: 0,Flag and Footnotes,GEO,INDIC_ED,TIME,Value
0,,European Union (28 countries),Total public expenditure on education as % of ...,2000,:
1,,European Union (28 countries),Total public expenditure on education as % of ...,2001,:
2,e,European Union (28 countries),Total public expenditure on education as % of ...,2002,5.00
3,e,European Union (28 countries),Total public expenditure on education as % of ...,2003,5.03
4,e,European Union (28 countries),Total public expenditure on education as % of ...,2004,4.95


# Grouping Data

In [None]:
# By “group by” we are referring to a process involving one or more of the following steps:
# 1. Splitting the data into groups based on some criteria
# 2. Applying a function to each group independently
# 3. Combining the results into a data structure

In [233]:
edu.groupby('TIME').max()

  edu.groupby('TIME').max()


Unnamed: 0_level_0,GEO,INDIC_ED,Value
TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,Spain,Total public expenditure on education as % of ...,:
2001,Spain,Total public expenditure on education as % of ...,:
2002,Spain,Total public expenditure on education as % of ...,8.44
2003,Spain,Total public expenditure on education as % of ...,8.33
2004,Spain,Total public expenditure on education as % of ...,8.43
2005,Spain,Total public expenditure on education as % of ...,8.30
2006,Spain,Total public expenditure on education as % of ...,:
2007,Spain,Total public expenditure on education as % of ...,:
2008,Spain,Total public expenditure on education as % of ...,:
2009,Spain,Total public expenditure on education as % of ...,:


In [234]:
# like group by in sql

group = edu[['GEO', 'TIME']].groupby('GEO').mean()
group.head()

Unnamed: 0_level_0,TIME
GEO,Unnamed: 1_level_1
Austria,2005.5
Belgium,2005.5
Bulgaria,2005.5
Cyprus,2005.5
Czech Republic,2005.5


# Merging Data

In [None]:
df = pd.DataFrame(np.random.randn(10, 4))

In [None]:
df[:3]

In [None]:
df[3:7]

In [None]:
df[7:]

In [None]:
pd.concat()

In [None]:
pd.merge(left, right, on="key")

# Resources
- Chapter 2, Introduction to Data Science by Laura Igual and Santi Seguí
    - https://github.com/DataScienceUB/introduction-datascience-python-book 
- pandas Documentation: https://pandas.pydata.org/