# Import Pandas Library

In [1]:
import pandas as pd
import numpy as np

# Data Structures

## First: Series

In [2]:
# Build a Series from a plain Python list; pandas supplies the default 0..n-1 index.
lst = list(range(1, 6))
s = pd.Series(data=lst)
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# Override the inferred dtype: the same values are stored as float64.
s = pd.Series(data=lst, dtype='float64')
s

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: float64

In [8]:
# Supply custom labels 'A'..'E' instead of the default 0..n-1 index.
s = pd.Series(data=lst, index=list('ABCDE'))
s

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [9]:
# Pair every name with an explicit integer label.
index = [10, 12, 13, 14]
names = ['Nourah', 'Sarah', 'Ahmed', 'Lama']
s2 = pd.Series(data=names, index=index)
s2

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [10]:
# Build a Series from a dictionary: the keys become the index, the values the data.
names = {
    10: 'Nourah',
    12: 'Sarah',
    13: 'Ahmed',
    14: 'Lama',
}
s3 = pd.Series(data=names)
s3

10    Nourah
12     Sarah
13     Ahmed
14      Lama
dtype: object

In [11]:
# Replace the index labels in place; the new list must have one label per element.
s3.index = [10, 20, 30, 40]
s3

10    Nourah
20     Sarah
30     Ahmed
40      Lama
dtype: object

In [None]:
# Two Series keyed by fruit name; their labels overlap only partially.
fruit1 = dict(Apple=40, Banana=50, Orange=60)
fruit2 = dict(Apple=30, Strawberry=20, Orange=20)

ser1 = pd.Series(data=fruit1)
ser2 = pd.Series(data=fruit2)


In [None]:
# Display s3 before selecting elements from it.
s3

In [None]:
# Selecting a single element by its index label.
# .loc makes explicit that 20 is a label of the index, not a position.
print(s3.loc[20])

In [None]:
# Slicing follows the NumPy convention [start:stop:step], with stop excluded.
# The slice refers to positions, not index labels; .iloc states that explicitly.
s3.iloc[:3]  # positions 0, 1 and 2

In [None]:
# Positional slice written with .iloc: everything from position 0 up to, but excluding, the last item.
s3.iloc[:-1]

In [None]:
# Add elements.
# Series.append was removed in pandas 2.0; pd.concat is the supported replacement.
# Note: re-running this cell concatenates s4 onto s3 once more.
s4 = pd.Series({50: 'Ahmed', 60: 'Nada'})
s3 = pd.concat([s3, s4])
s3

In [None]:
# Concatenating keeps each operand's own labels, so the index repeats (0, 1, 0, 1).
x = pd.Series(data=['a', 'b'])
y = pd.Series(data=['c', 'd'])
z = pd.concat(objs=[x, y])
z

In [None]:
# ignore_index=True discards the operands' labels and renumbers the result 0..n-1.
z = pd.concat(objs=[x, y], ignore_index=True)
z

In [None]:
# Delete the element labelled 60.
# The result is not assigned back, so s3 itself is left as it was.
s3.drop(labels=60)

In [None]:
# Display s3 again; the drop() result above was not assigned back to it.
s3

In [None]:
# Remove repeated values, keeping the first occurrence of each (the default).
s3.drop_duplicates(keep='first')

In [None]:
# Display s, the Series used in the arithmetic examples that follow.
s

In [None]:
# Independent (deep) copy of s: changing s4 afterwards does not touch s.
s4 = s.copy(deep=True)
s4

In [None]:
# Multiply every element by the scalar 3 and rebind s4 to the result.
s4 = s4.mul(3)
s4

In [None]:
# Element-wise addition; values are matched by index label.
s4 + s

In [None]:
# Addition aligns on labels; a label present in only one operand produces NaN.
s5 = pd.Series(data={'A': 6, 'B': 8})
s5 = s5 + s

In [None]:
s5 # add() returns a new Series, so the result has to be assigned to keep it

In [None]:
# Element-wise subtraction, aligned by index label.
s4 - s

In [None]:
# Element-wise multiplication, aligned by index label.
s4 * s

In [None]:
# Element-wise (true) division, aligned by index label.
s4 / s

## Second: DataFrame

### A- Creating a new DataFrame from scratch

In [None]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    SalesPerson=['Kathey', 'Michael', 'William', 'Kathey', 'William', 'Kathey', 'Michael'],
    Region=['East', 'West', 'North', 'South', 'North', 'North', 'East'],
    OrderAmount=[600, 700, 400, 500, 400, 700, 800],
    Month=['Jan', 'Feb', 'Feb', 'Mar', 'May', 'Apr', 'May'],
    isAccepted=[True, False, False, True, True, True, False],
)

SalesDF = pd.DataFrame(data=data)
SalesDF

In [None]:
# Three seasons (2010-2012) for each of three teams, one row per (team, season).
data = {
    'year': [2010, 2011, 2012] * 3,
    'team': ['FCBarcelona'] * 3 + ['RMadrid'] * 3 + ['ValenciaCF'] * 3,
    'wins': [30, 28, 32, 29, 32, 26, 21, 17, 19],
    'draws': [6, 7, 4, 5, 4, 7, 8, 10, 8],
    'losses': [2, 3, 2, 4, 2, 5, 9, 11, 11],
}

football = pd.DataFrame(data=data)
football

### B- Reading tabular data

In [None]:
# Load the table from a CSV file located next to the notebook.
# NOTE(review): this Eurostat export presumably marks missing values with ':' — confirm
# against the file. Declaring it via na_values lets such cells parse as NaN, so 'Value'
# can be read as a numeric column (the comparisons and isnull() checks below rely on that).
edu = pd.read_csv('educ_figdp_1_Data.csv', na_values=':')
edu

In [None]:
# Data type pandas inferred for each column.
edu.dtypes

# Viewing Data

In [None]:
edu.head() # show the first rows (5 when no count is given)

In [None]:
# Show only the first three rows.
edu.head(n=3)

In [None]:
# Dimensions of the frame as (number of rows, number of columns).
edu.shape

In [None]:
edu.tail() # show the last rows (5 when no count is given)

In [None]:
# Column labels of the frame.
edu.columns

In [None]:
# Assigning a list to `columns` renames every column at once; the list must contain
# exactly one name per existing column. Re-assigning the current names changes nothing:
# edit the list to actually rename columns.
edu.columns = list(edu.columns)

In [None]:
# Label of the first column.
edu.columns[0]

In [None]:
# Row labels (the index) of the frame.
edu.index

In [None]:
edu.values # the values of any DataFrame can be retrieved as a NumPy array through its values attribute

In [None]:
# Quick statistical summary; with no arguments describe() reports on the numeric columns.
edu.describe()

In [None]:
# Summary restricted to the object-dtype columns.
edu.describe(include=[object])

In [None]:
# Summary of every column that is not numeric.
edu.describe(exclude="number")

In [None]:
# Swap rows and columns.
edu.transpose()

# Selection

In [None]:
edu['Value'] # Selecting a single column returns a Series, not a DataFrame.

In [None]:
# Selecting with a list of column names returns a DataFrame holding just those columns.
edu.loc[:, ['Value', 'GEO']]

In [None]:
# Select a subset of rows by position: rows 10, 11, 12 and 13 (the stop position is excluded).
edu.iloc[10:14]

In [None]:
edu.loc[90:94, ['TIME', 'GEO']]  # [rows, columns]; .loc selects by label and includes the end label 94

In [None]:
# Rows labelled 90 through 94 (end label included); omitting the column selector keeps all columns.
edu.loc[90:94]

In [None]:
# Draw 10 random rows. random_state=23 seeds the random number generator,
# which makes the selection predictable: the same rows come back on every run.
edu.sample(n=10, random_state=23)

# Filtering Data

In [None]:
# Boolean Series: True wherever Value is greater than 6.5.
edu['Value'].gt(6.5)

In [None]:
# Another way of selecting rows is Boolean indexing, commonly known as a filter:
# only the rows whose mask entry is True are kept.
edu[edu['Value'].gt(6.5)]

# Filtering Missing and Duplicated Values

In [None]:
# Boolean mask marking the missing entries of Value (append .sum() to count them).
edu['Value'].isna()

In [None]:
# First rows whose Value is missing.
edu[edu['Value'].isna()].head()

In [None]:
# Rows whose Value already appeared in an earlier row.
edu[edu.duplicated(subset='Value')]

In [None]:
# Keep only the first row for each distinct Value; the result is a new frame.
edu.drop_duplicates(subset='Value')

# Manipulating Data

In [None]:
# Look at the first rows before manipulating the data.
edu.head()

In [None]:
# First entries of the Value column on its own.
edu['Value'].head()

In [None]:
# Arithmetic with a scalar is applied to every element in one step.
edu['Value'].div(100)

In [None]:
# apply() runs a function over a DataFrame or Series;
# here the function is sqrt from the NumPy library.
edu['Value'].apply(np.sqrt) # sqrt function from the numpy library

In [None]:
# map() also applies a function to each value of a Series.
edu['Value'].map(np.sqrt)

In [None]:
def f2(x):
    """Return the square of ``x``."""
    squared = x ** 2
    return squared


# A user-defined function can be passed to apply() just like a library one.
edu['Value'].apply(f2)

In [None]:
# Same squaring operation written as an anonymous (lambda) function.
edu['Value'].apply(lambda d: d**2)

In [None]:
# Add a new column to a DataFrame by assigning to a new column name.
# ValueNorm rescales Value by its maximum.
edu['ValueNorm'] = edu['Value'] / edu['Value'].max()

In [None]:
# DataFrame.apply calls the function once per column (axis=0 is the default);
# here it returns the range (max - min) of each selected column.
edu[['Value', 'TIME']].apply(lambda col: col.max() - col.min())

In [None]:
# Add a new column to a DataFrame: assigning to a new column name creates it.
# ValueNorm is Value divided by the largest Value.
edu['ValueNorm'] = edu['Value'].div(edu['Value'].max())
edu.tail()

In [None]:
# Display the frame, which now carries the ValueNorm column.
edu

In [None]:
# Remove the ValueNorm column from the DataFrame.
# drop(columns=...) is the keyword spelling of drop(..., axis=1); rows are axis=0.
# inplace=False (the default) returns a new frame, inplace=True changes the original DataFrame.
edu.drop(columns='ValueNorm', inplace=True)
edu.head()

In [None]:
# Display the frame after the ValueNorm column was dropped in place.
edu

In [None]:
# Insert a new row.
# DataFrame.append was removed in pandas 2.0: wrap the row in a one-row frame and concatenate.
# ignore_index=True renumbers the result 0..n-1; otherwise the new row would keep its own label 0.
edu = pd.concat(
    [edu, pd.DataFrame([{'TIME': 2000, 'Value': 5.00, 'GEO': 'a'}])],
    ignore_index=True,
)
edu.tail()

In [None]:
# Remove the row carrying the largest index label.
# drop(index=...) is the keyword spelling of drop(..., axis=0).
# inplace=False (the default) returns a new frame, inplace=True changes the original DataFrame.
edu.drop(index=max(edu.index), inplace=True)
edu.tail()

In [None]:
# Dropping every row label yields an empty frame with the same columns.
# inplace is left at its default (False), so edu itself keeps its rows.
edu.drop(index=edu.index)

# Sorting

In [None]:
# Sort the rows by Value, largest first; inplace=True reorders edu itself.
edu.sort_values(by='Value', ascending=False, inplace=True)
edu.head()

In [None]:
# To return to the original order, sort by the row index (axis=0) in ascending order.
edu.sort_index(axis=0, ascending=True, inplace=True)
edu.head()

# Grouping Data

In [None]:
# By “group by” we are referring to a process involving one or more of the following steps:
# 1. Splitting the data into groups based on some criteria
# 2. Applying a function to each group independently
# 3. Combining the results into a data structure

In [None]:
# Mean of every numeric column for each distinct GEO value.
# numeric_only=True skips text columns; since pandas 2.0 they are no longer dropped
# silently and make mean() raise a TypeError.
edu.groupby('GEO').mean(numeric_only=True)

In [None]:
# Equivalent of SQL's GROUP BY: average Value for each GEO.
group = edu.groupby('GEO')[['Value']].mean()
group.head()

# Merging Data

In [None]:
# 10x4 frame of standard-normal draws.
# A seeded generator keeps the values identical on every run of the notebook.
df = pd.DataFrame(np.random.default_rng(seed=23).standard_normal((10, 4)))

In [None]:
# First piece: rows at positions 0-2.
df.iloc[:3]

In [None]:
# Second piece: rows at positions 3-6.
df.iloc[3:7]

In [None]:
# Third piece: rows from position 7 to the end.
df.iloc[7:]

In [None]:
# pd.concat needs the list of objects to combine: stacking the three row slices
# of df vertically rebuilds the full frame.
pd.concat([df[:3], df[3:7], df[7:]])

In [None]:
# Two small frames that share a "key" column to join on.
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})

# SQL-style join: rows are paired where the values in "key" are equal.
pd.merge(left, right, on="key")

# Resources
- Chapter 2, Introduction to Data Science by Laura Igual and Santi Seguí
    - https://github.com/DataScienceUB/introduction-datascience-python-book 
- pandas Documentation: https://pandas.pydata.org/