# Import Pandas Library

In [1]:
import pandas as pd
import numpy as np

# Data Structures

## First: Series

In [4]:
# Build a Series from a plain Python list; pandas supplies a default 0..n-1 index.
lst = list(range(1, 6))
s = pd.Series(data=lst)  # tip: Shift+Tab in Jupyter shows the constructor's signature
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# Request a specific element type at construction time (here 64-bit floats).
s = pd.Series(data=lst, dtype='float64')
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [8]:
# Supply custom index labels instead of the default 0..n-1 positions.
labels = list('ABCDE')
s = pd.Series(data=lst, index=labels)
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [10]:
# Integer index labels do not have to be contiguous or start at zero.
names = ['Nourah', 'Sarah', 'Ahmed', 'Lama']
index = [10, 12, 13, 14]
s2 = pd.Series(data=names, index=index)
s2

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [48]:
# A dictionary maps straight onto a Series:
# its keys become the index labels and its values become the data.
names = dict(zip([10, 12, 13, 14], ['Nourah', 'Sarah', 'Ahmed', 'Lama']))
s3 = pd.Series(data=names)
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [50]:
# Swap the roles: the dict's values become the index, its keys become the data.
ids = list(names.keys())
people = list(names.values())
pd.Series(data=ids, index=people)

Nourah    10
Sarah     12
Ahmed     13
Lama      14
dtype: int64

In [52]:
# Replace the index labels; the new list must have one label per element.
s3.index = list(range(10, 50, 10))
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
dtype: object

In [54]:
s3.values

array(['Nourah', 'Sarah', 'Ahmed', 'Lama'], dtype=object)

In [56]:
s3.tolist()  # the values as a plain Python list

['Nourah', 'Sarah', 'Ahmed', 'Lama']

In [58]:
# Two fruit-count Series whose labels only partly overlap (Apple and Orange are shared).
fruit1 = dict(Apple=40, Banana=50, Orange=60)
fruit2 = dict(Apple=30, Strawberry=20, Orange=20)

ser1, ser2 = pd.Series(fruit1), pd.Series(fruit2)


In [60]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
dtype: object

In [62]:
# Selecting a single element by its index label (label 20, not position 20).
print(s3.loc[20])

Sarah


In [64]:
# Slicing works like NumPy: [start:stop:step], with stop excluded.
# A plain integer slice refers to positions, not to index labels;
# .iloc spells out that positional meaning.
s3.iloc[:3]  # positions 0 to 2

10    Nourah
20     Sarah
30     Ahmed
dtype: object

In [66]:
s3.iloc[:-1]  # from position 0 up to, but excluding, the last item

10    Nourah
20     Sarah
30     Ahmed
dtype: object

In [68]:
# Add elements: concatenate a second Series onto the end of s3.
# pd.concat is the public API for this. Series._append has a leading underscore,
# which marks it as private: it is not meant to be called from user code.
s4 = pd.Series({50: 'Ahmed', 60: 'Nada'})
s3 = pd.concat([s3, s4])
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [84]:
# Concatenation keeps each input's own labels, so the result repeats 0 and 1.
x, y = pd.Series(['a', 'b']), pd.Series(['c', 'd'])
z = pd.concat(objs=(x, y))
z

0    a
1    b
0    c
1    d
dtype: object

In [86]:
# Renumber the labels 0..n-1. drop=True discards the old labels instead of
# keeping them as data. Rebinding the name (rather than inplace=True) keeps the
# cell's effect explicit and lets the call be chained.
z = z.reset_index(drop=True)

In [88]:
z

0    a
1    b
2    c
3    d
dtype: object

In [90]:
# ignore_index=True renumbers the result 0..n-1 in the same step as the concat.
parts = [x, y]
z = pd.concat(objs=parts, ignore_index=True)
z

0    a
1    b
2    c
3    d
dtype: object

In [80]:
s3.to_dict()  # index labels become keys, data become values

{10: 'Nourah', 20: 'Sarah', 30: 'Ahmed', 40: 'Lama', 50: 'Ahmed', 60: 'Nada'}

In [92]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [94]:
# Delete the element labelled 60. The result is a new Series;
# s3 itself is unchanged unless the result is assigned back.
s3.drop(labels=60)

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
dtype: object

In [96]:
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
50     Ahmed
60      Nada
dtype: object

In [98]:
# Drop repeated values (not repeated labels): the first occurrence of each
# value is kept and later occurrences are removed.
s3.drop_duplicates(keep='first')

10    Nourah
20     Sarah
30     Ahmed
40      Lama
60      Nada
dtype: object

In [100]:
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [102]:
# Work on an independent copy so later changes to s4 leave s untouched.
s4 = s.copy(deep=True)
s4

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [104]:
# Scalar arithmetic is applied to every element.
s4 = s4.mul(3)
s4

A     3
B     6
C     9
D    12
E    15
dtype: int64

In [106]:
s4 + s  # element-wise sum, matched by index label (operator form of Series.add)

A     4
B     8
C    12
D    16
E    20
dtype: int64

In [112]:
# s5 only has labels A and B; fill_value=0 stands in for the labels it lacks,
# so C, D and E come through from s instead of becoming NaN.
partial = pd.Series({'A': 6, 'B': 8})
s5 = partial.add(s, fill_value=0)

In [114]:
s5 # you have to save the result

A     7.0
B    10.0
C     3.0
D     4.0
E     5.0
dtype: float64

In [116]:
s4 - s  # element-wise difference (operator form of Series.sub)

A     2
B     4
C     6
D     8
E    10
dtype: int64

In [118]:
s4 * s  # element-wise product (operator form of Series.mul)

A     3
B    12
C    27
D    48
E    75
dtype: int64

In [120]:
s4 / s  # element-wise true division, result is float (operator form of Series.div)

A    3.0
B    3.0
C    3.0
D    3.0
E    3.0
dtype: float64

## Second: DataFrame

### A- Creating a new DataFrame from scratch

In [122]:
# Column-oriented input: each key is a column name and each list holds that
# column's values. All lists must have the same length (7 rows here).
data = dict(
    SalesPerson=['Kathey', 'Michael', 'William', 'Kathey', 'William', 'Kathey', 'Michael'],
    Region=['East', 'West', 'North', 'South', 'North', 'North', 'East'],
    OrderAmount=[600, 700, 400, 500, 400, 700, 800],
    Month=['Jan', 'Feb', 'Feb', 'Mar', 'May', 'Apr', 'May'],
    isAccepted=[True, False, False, True, True, True, False],
)

SalesDF = pd.DataFrame(data=data)
SalesDF

Unnamed: 0,SalesPerson,Region,OrderAmount,Month,isAccepted
0,Kathey,East,600,Jan,True
1,Michael,West,700,Feb,False
2,William,North,400,Feb,False
3,Kathey,South,500,Mar,True
4,William,North,400,May,True
5,Kathey,North,700,Apr,True
6,Michael,East,800,May,False


In [124]:
# Three seasons (2010-2012) for each of three teams, one row per team-season.
seasons = [2010, 2011, 2012]
clubs = ['FCBarcelona', 'RMadrid', 'ValenciaCF']
data = {
    'year': seasons * len(clubs),
    'team': [club for club in clubs for _ in seasons],
    'wins': [30, 28, 32, 29, 32, 26, 21, 17, 19],
    'draws': [6, 7, 4, 5, 4, 7, 8, 10, 8],
    'losses': [2, 3, 2, 4, 2, 5, 9, 11, 11],
}

football = pd.DataFrame(data=data)
football

Unnamed: 0,year,team,wins,draws,losses
0,2010,FCBarcelona,30,6,2
1,2011,FCBarcelona,28,7,3
2,2012,FCBarcelona,32,4,2
3,2010,RMadrid,29,5,4
4,2011,RMadrid,32,4,2
5,2012,RMadrid,26,7,5
6,2010,ValenciaCF,21,8,9
7,2011,ValenciaCF,17,10,11
8,2012,ValenciaCF,19,8,11


### B- Reading tabular data

In [127]:
# Load the education-expenditure table from a CSV file in the working directory.
# Note: the displayed rows show missing measurements written as the string ':',
# so the 'Value' column is read as text (dtype object) rather than as numbers.
# Convert it with pd.to_numeric(..., errors='coerce') before doing arithmetic.
edu = pd.read_csv('educ_figdp_1_Data.csv')
edu

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
...,...,...,...,...,...
379,2007,Finland,Total public expenditure on education as % of ...,5.90,
380,2008,Finland,Total public expenditure on education as % of ...,6.10,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,


In [129]:
edu.dtypes

TIME                   int64
GEO                   object
INDIC_ED              object
Value                 object
Flag and Footnotes    object
dtype: object

# Viewing Data

In [132]:
edu.head(n=5)  # the first rows (5 is the default)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e
3,2003,European Union (28 countries),Total public expenditure on education as % of ...,5.03,e
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e


In [134]:
edu.head(n=3)  # only the first three rows

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
0,2000,European Union (28 countries),Total public expenditure on education as % of ...,:,
1,2001,European Union (28 countries),Total public expenditure on education as % of ...,:,
2,2002,European Union (28 countries),Total public expenditure on education as % of ...,5.00,e


In [136]:
edu.shape

(384, 5)

In [138]:
edu.tail(n=5)  # the last rows (5 is the default)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
379,2007,Finland,Total public expenditure on education as % of ...,5.9,
380,2008,Finland,Total public expenditure on education as % of ...,6.1,
381,2009,Finland,Total public expenditure on education as % of ...,6.81,
382,2010,Finland,Total public expenditure on education as % of ...,6.85,
383,2011,Finland,Total public expenditure on education as % of ...,6.76,


In [140]:
edu.columns

Index(['TIME', 'GEO', 'INDIC_ED', 'Value', 'Flag and Footnotes'], dtype='object')

In [144]:
edu.columns 

Index(['TIME', 'GEO', 'INDIC_ED', 'Value', 'Flag and Footnotes'], dtype='object')

In [146]:
edu.columns[0]

'TIME'

In [148]:
edu.index

RangeIndex(start=0, stop=384, step=1)

In [150]:
# The contents of any DataFrame can be retrieved as a NumPy array.
# With mixed column types, as here, the array has dtype object.
edu.to_numpy()

array([[2000, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2001, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        ':', nan],
       [2002, 'European Union (28 countries)',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '5.00', 'e'],
       ...,
       [2009, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.81', nan],
       [2010, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.85', nan],
       [2011, 'Finland',
        'Total public expenditure on education as % of GDP, for all levels of education combined',
        '6.76', nan]], dtype=object)

In [152]:
# quick statistical information
edu.describe()

Unnamed: 0,TIME
count,384.0
mean,2005.5
std,3.456556
min,2000.0
25%,2002.75
50%,2005.5
75%,2008.25
max,2011.0


In [154]:
# Summarise only the text (object) columns: count, unique, top, freq.
edu.describe(include='object')

Unnamed: 0,GEO,INDIC_ED,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [156]:
# Same summary, reached the other way round: leave out every numeric column.
edu.describe(exclude=[np.number])

Unnamed: 0,GEO,INDIC_ED,Value,Flag and Footnotes
count,384,384,384,165
unique,32,1,211,6
top,European Union (28 countries),Total public expenditure on education as % of ...,:,e
freq,12,384,23,70


In [158]:
edu.transpose()  # swap rows and columns (long form of edu.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
TIME,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
GEO,European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),European Union (28 countries),...,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland,Finland
INDIC_ED,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...,Total public expenditure on education as % of ...
Value,:,:,5.00,5.03,4.95,4.92,4.91,4.92,5.04,5.38,...,6.22,6.43,6.42,6.30,6.18,5.90,6.10,6.81,6.85,6.76
Flag and Footnotes,,,e,e,e,e,e,e,e,e,...,,,,,,,,,,


# Selection

In [161]:
# Selecting a single column returns a Series, not a DataFrame.
edu.loc[:, 'Value']

0         :
1         :
2      5.00
3      5.03
4      4.95
       ... 
379    5.90
380    6.10
381    6.81
382    6.85
383    6.76
Name: Value, Length: 384, dtype: object

In [163]:
# A list of column names returns a DataFrame with those columns, in that order.
wanted = ['Value', 'GEO']
edu.loc[:, wanted]

Unnamed: 0,Value,GEO
0,:,European Union (28 countries)
1,:,European Union (28 countries)
2,5.00,European Union (28 countries)
3,5.03,European Union (28 countries)
4,4.95,European Union (28 countries)
...,...,...
379,5.90,Finland
380,6.10,Finland
381,6.81,Finland
382,6.85,Finland


In [165]:
# Select a subset of rows by position: rows 10 to 13 (the stop, 14, is excluded).
edu.iloc[10:14]

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
10,2010,European Union (28 countries),Total public expenditure on education as % of ...,5.41,e
11,2011,European Union (28 countries),Total public expenditure on education as % of ...,5.25,e
12,2000,European Union (27 countries),Total public expenditure on education as % of ...,4.91,s
13,2001,European Union (27 countries),Total public expenditure on education as % of ...,4.99,s


In [169]:
# .loc selects by label as [rows, columns].
# Unlike a positional slice, the label slice 90:94 includes both endpoints.
edu.loc[slice(90, 94), ['TIME', 'GEO']]

Unnamed: 0,TIME,GEO
90,2006,Belgium
91,2007,Belgium
92,2008,Belgium
93,2009,Belgium
94,2010,Belgium


In [171]:
edu.loc[90:94]  # rows labelled 90 to 94; omitting the column selector keeps all columns

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
90,2006,Belgium,Total public expenditure on education as % of ...,5.98,d
91,2007,Belgium,Total public expenditure on education as % of ...,6.0,d
92,2008,Belgium,Total public expenditure on education as % of ...,6.43,d
93,2009,Belgium,Total public expenditure on education as % of ...,6.57,d
94,2010,Belgium,Total public expenditure on education as % of ...,6.58,d


In [173]:
# Draw 10 random rows. random_state=23 seeds the random number generator,
# which makes the "random" selection repeatable from run to run.
edu.sample(n=10, random_state=23)

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
294,2006,Netherlands,Total public expenditure on education as % of ...,5.50,
343,2007,Romania,Total public expenditure on education as % of ...,4.25,
178,2010,Greece,Total public expenditure on education as % of ...,:,
73,2001,Euro area (13 countries),Total public expenditure on education as % of ...,4.97,s
284,2008,Malta,Total public expenditure on education as % of ...,5.72,i
193,2001,France,Total public expenditure on education as % of ...,5.95,
236,2008,Latvia,Total public expenditure on education as % of ...,5.71,
205,2001,Italy,Total public expenditure on education as % of ...,4.83,
59,2011,Euro area (17 countries),Total public expenditure on education as % of ...,5.15,e
252,2000,Luxembourg,Total public expenditure on education as % of ...,:,


# Filtering Data

In [182]:
# Boolean mask: keep only the rows whose TIME equals 2004.
is_2004 = edu['TIME'].eq(2004)
edu.loc[is_2004]

Unnamed: 0,TIME,GEO,INDIC_ED,Value,Flag and Footnotes
4,2004,European Union (28 countries),Total public expenditure on education as % of ...,4.95,e
16,2004,European Union (27 countries),Total public expenditure on education as % of ...,4.95,e
28,2004,European Union (25 countries),Total public expenditure on education as % of ...,4.98,e
40,2004,Euro area (18 countries),Total public expenditure on education as % of ...,4.8,e
52,2004,Euro area (17 countries),Total public expenditure on education as % of ...,4.8,e
64,2004,Euro area (15 countries),Total public expenditure on education as % of ...,4.96,e
76,2004,Euro area (13 countries),Total public expenditure on education as % of ...,4.95,e
88,2004,Belgium,Total public expenditure on education as % of ...,5.95,d
100,2004,Bulgaria,Total public expenditure on education as % of ...,4.4,
112,2004,Czech Republic,Total public expenditure on education as % of ...,4.2,


In [None]:
# Another way of selecting rows is Boolean indexing, commonly known as a filter.
# 'Value' holds text (with ':' marking missing data), so comparing it directly
# with 6.5 raises TypeError. Convert to numbers first: errors='coerce' turns
# anything unparsable into NaN, and NaN > 6.5 is False, so those rows drop out.
edu[pd.to_numeric(edu['Value'], errors='coerce') > 6.5]

# Filtering Missing and Duplicated Values

In [None]:
# Missing measurements are stored as the string ':', which isnull() does not
# treat as missing. Coercing to numbers turns them into NaN so they are detected.
pd.to_numeric(edu['Value'], errors='coerce').isnull()  # append .sum() to count them

In [None]:
# Rows whose Value is missing. The ':' placeholders only count as missing once
# the column is coerced to numbers (unparsable text becomes NaN).
edu[pd.to_numeric(edu['Value'], errors='coerce').isnull()].head()

In [None]:
# Rows whose Value already appeared in an earlier row (first occurrences are not flagged).
repeated = edu.duplicated(subset=['Value'])
edu.loc[repeated]

In [None]:
# Keep only the first row for each distinct Value; returns a new DataFrame.
edu.drop_duplicates(subset=['Value'])

# Manipulating Data

In [None]:
edu.head()

In [None]:
edu['Value'].head()

In [None]:
# Arithmetic applies to the whole column in one step, but the column must be
# numeric: 'Value' is text, so convert it first (':' becomes NaN).
pd.to_numeric(edu['Value'], errors='coerce') / 100

In [None]:
# we can apply any function to a DataFrame or Series
edu['Value'].apply(np.sqrt) # sqrt function from the numpy library

In [None]:
# Series.map also applies a function element by element; as with apply,
# the text column has to be converted to numbers before np.sqrt can handle it.
pd.to_numeric(edu['Value'], errors='coerce').map(np.sqrt)

In [None]:
def f2(x):
    """Return x squared."""
    return x**2


# User-defined functions work with apply too. The column is converted to
# numbers first because squaring a string raises TypeError.
pd.to_numeric(edu['Value'], errors='coerce').apply(f2)

In [None]:
# The same squaring with an anonymous function instead of a named one.
# The text column is converted to numbers first so that ** is defined.
pd.to_numeric(edu['Value'], errors='coerce').apply(lambda d: d**2)

In [None]:
# Add a new column to a DataFrame by assigning to a new column name.
# The original statement had no right-hand side (a SyntaxError); it is completed
# here as Value scaled by its maximum, computed on the numeric form of the column.
value_num = pd.to_numeric(edu['Value'], errors='coerce')
edu['ValueNorm'] = value_num / value_num.max()

In [None]:
# On a DataFrame, apply calls the function once per column (each column arrives
# as a Series). The original bare `lambda` was a SyntaxError.
# NOTE(review): the intended function is not recoverable from the notebook;
# a per-column numeric maximum is used here as an illustration.
edu[['Value', 'TIME']].apply(lambda col: pd.to_numeric(col, errors='coerce').max())

In [None]:
# Add a new column to a DataFrame: Value scaled so that its maximum is 1.
# 'Value' is text, so both the division and max() must run on the numeric form;
# on the raw strings max() would return the ':' placeholder and the division would fail.
value_num = pd.to_numeric(edu['Value'], errors='coerce')
edu['ValueNorm'] = value_num / value_num.max()
edu.tail()

In [None]:
edu

In [None]:
# Remove the ValueNorm column again.
# columns=... targets column labels (same as axis=1); index=... targets rows (axis=0).
# inplace=True changes edu itself; the default inplace=False returns a new DataFrame.
edu.drop(columns='ValueNorm', inplace=True)
edu.head()

In [None]:
edu

In [None]:
# Insert a new row. DataFrame.append is not available in current pandas
# (this notebook already had to work around the missing Series.append),
# so the row is wrapped in a one-row DataFrame and joined with pd.concat.
# ignore_index=True renumbers the result 0..n-1; without it the new row would
# keep its own label 0. Columns absent from the new row are filled with NaN.
new_row = pd.DataFrame([{'TIME': 2000, 'Value': 5.00, 'GEO': 'a'}])
edu = pd.concat([edu, new_row], ignore_index=True)
edu.tail()

In [None]:
# Remove the row with the largest index label (index=... is the same as axis=0).
# inplace=True changes edu itself; the default inplace=False returns a new DataFrame.
last_label = edu.index.max()
edu.drop(index=last_label, inplace=True)
edu.tail()

In [None]:
# Dropping every row label yields an empty frame that keeps the column headers.
# The result is a new DataFrame (inplace defaults to False), so edu is unchanged.
edu.drop(index=edu.index)

# Sorting

In [None]:
# Sort by Value, largest first. 'Value' is text, so a plain sort would compare
# strings character by character and put the ':' placeholders on top.
# The key function sorts on the numeric form instead; rows that cannot be
# parsed become NaN and are placed last.
edu.sort_values(
    by='Value',
    ascending=False,
    inplace=True,
    key=lambda col: pd.to_numeric(col, errors='coerce'),
)
edu.head()

In [None]:
# Sorting by the index restores the original row order.
# The defaults already mean "row index, ascending" (axis=0, ascending=True).
edu.sort_index(inplace=True)
edu.head()

# Grouping Data

In [None]:
# By “group by” we are referring to a process involving one or more of the following steps:
# 1. Splitting the data into groups based on some criteria
# 2. Applying a function to each group independently
# 3. Combining the results into a data structure

In [None]:
# Mean of every numeric column per GEO group.
# 'Value' is converted to numbers first so it takes part in the average, and
# numeric_only=True skips the remaining text columns, which cannot be averaged
# and would otherwise make mean() raise.
(
    edu.assign(Value=pd.to_numeric(edu['Value'], errors='coerce'))
    .groupby('GEO')
    .mean(numeric_only=True)
)

In [None]:
# Like GROUP BY in SQL: the average Value per GEO.
# 'Value' is text, so it is converted to numbers before averaging
# (':' becomes NaN, which mean() ignores).
geo_value = edu[['GEO', 'Value']].assign(
    Value=lambda frame: pd.to_numeric(frame['Value'], errors='coerce')
)
group = geo_value.groupby('GEO').mean()
group.head()

# Merging Data

In [None]:
# 10x4 frame of standard-normal random numbers. A seeded generator makes the
# values identical on every run, so the outputs below are reproducible.
rng = np.random.default_rng(23)
df = pd.DataFrame(rng.standard_normal((10, 4)))

In [None]:
df.iloc[:3]  # first piece: rows at positions 0-2

In [None]:
df.iloc[3:7]  # second piece: rows at positions 3-6

In [None]:
df.iloc[7:]  # third piece: rows from position 7 to the end

In [None]:
# pd.concat needs the objects to join; calling it with no arguments raises TypeError.
# Stitch the three row slices shown above back together, which rebuilds df.
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

In [None]:
# SQL-style join: rows from both frames are matched on equal values of "key".
# `left` and `right` were never defined in this notebook, so two small
# example frames are created here.
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
merged = pd.merge(left, right, on="key")
merged

# Resources
- Chapter 2, Introduction to Data Science by Laura Igual and Santi Seguí
    - https://github.com/DataScienceUB/introduction-datascience-python-book 
- pandas Documentation: https://pandas.pydata.org/