In [1]:
import numpy as np
import pandas as pd

# pandas Series

### 1.1. Create a Series

Create a Series from a List

In [11]:
# strings are stored as object
data = ['a','b','c',3]
# data = [1,2,3,4]
s = pd.Series(data, index = [i for i in range(4)])
# s.dtypes
s[0:2]

0    a
1    b
dtype: object

Create a Series from an Array

In [None]:
data = np.array(['a','b','c','d'])
s = pd.Series(data)
s

0    a
1    b
2    c
3    d
dtype: object

Create a Series with a custom index

In [None]:
data = np.array(['a','b','c','d'])
s = pd.Series(data, index=[100,101,102,103])
s

100    a
101    b
102    c
103    d
dtype: object

Create a Series from a Dict

In [None]:
data = {'a': 0., 'b': 1., 'c': 2. }
s = pd.Series(data)
s

a    0.0
b    1.0
c    2.0
dtype: float64

Create a Series from a Dict & with a custom index

In [None]:
data = {'a': 0., 'b': 1., 'c': 2. }
s = pd.Series(data, index=['b', 'c', 'd', 'a'])
s

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

### 1.2. Accessing Data from Series

* Accessing Data from Series with Position
    * Data in the series can be accessed similar to that in an ndarray.

* Retrieve Data Using Label (Index)
    * A Series is like a fixed-size dict in that you can get and set values by index label.

#####  Accessing Data from Series with Position

In [None]:
s = pd.Series([20, 12, 23, 43, 35], index=['a','b','c','d','e'])
s

a    20
b    12
c    23
d    43
e    35
dtype: int64

In [None]:
# retrieve the first element
s[0]

20

In [None]:
# retrieve the second element
s[1]

12

In [None]:
# retrieve the first three elements
s[:3]

a    20
b    12
c    23
dtype: int64

In [None]:
# retrieve the last three elements
s[-3:]

c    23
d    43
e    35
dtype: int64

##### Retrieve Data Using Label (Index)

In [None]:
s['a']

20

In [None]:
s[['a','c','d']]

a    20
c    23
d    43
dtype: int64

If a label is not contained, an exception is raised

In [None]:
s['f']

KeyError: ignored

In [None]:
s = pd.Series(['a','b','c','d','e'], 
              index=[2, 1, 3, 4, 5])
s

2    a
1    b
3    c
4    d
5    e
dtype: object

In [None]:
s[2]

'a'

In [None]:
s.iloc[2] # position based

'c'

In [None]:
s.loc[2] # label based

'a'

In [None]:
s.get(key=2) # another way of accessing data label based 

'a'

### 1.3.  `.describe()` function

* Generate descriptive statistics, 
* Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding `NaN` values.

In [None]:
data = {'James':20, 'Mary':21, 'Robert':18, 'Patricia': 25, 
        'John':22, 'Jennifer':23, 'Michael':27, 
        'Linda':19, 'William':28, 'Elizabeth':24}
people_ages = pd.Series(data)
people_ages

James        20
Mary         21
Robert       18
Patricia     25
John         22
Jennifer     23
Michael      27
Linda        19
William      28
Elizabeth    24
dtype: int64

In [None]:
people_ages.describe()

count    10.000
mean     22.700
std       3.335
min      18.000
25%      20.250
50%      22.500
75%      24.750
max      28.000
dtype: float64

# 2. pandas DataFrame

* Create a DataFrame
* Read/Load a DataFrame from a file
* Write/Save a DataFrame to a file
* Basic Operations

### Create a DataFrame

We can construct a DataFrame using the constructor `pd.DataFrame()`. 

This can be done either from:
* A list of tuples / lists / dictionaries, each as a row
* A dictionary of lists / Series, each as a column
* A Numpy array


#### 1.1. Create a DataFrame from a List of Tuples (or Lists)

Perhaps the most common way to construct a DataFrame is from a list of tuples
* Each tuple represents a data instance and becomes a row
* Tuples should have the same size (number of elements).

In [None]:
lst = [('Tom', 'Reacher', 25), ('Krish', 'Pete', 30), 
       ('Nick', 'Wilson', 26), ('Juli', 'Williams', 22)]
df = pd.DataFrame(lst, columns=['FName', 'LName', 'Age'])
df

Unnamed: 0,FName,LName,Age
0,Tom,Reacher,25
1,Krish,Pete,30
2,Nick,Wilson,26
3,Juli,Williams,22


#### 1.2. Create a DataFrame from a List of Dictionaries

In [None]:
# a List of dictionaries
details = [{'Name':'Rafael', 'Age':23, 'University': 'TUM'},
           {'Name':'Silvia', 'Age':21, 'University': 'LMU'},
           {'Name':'Maria', 'Age':22, 'University': 'TUM'},
           {'Name':'Jan', 'Age':21, 'University': 'TU Berlin'}]

df = pd.DataFrame(details)
df

Unnamed: 0,Name,Age,University
0,Rafael,23,TUM
1,Silvia,21,LMU
2,Maria,22,TUM
3,Jan,21,TU Berlin


In [None]:
# details = [{'Name':'Rafael', 'Age':23, 'University': 'TUM'},
#            {'Name':'Silvia', 'Age':21, 'University': 'LMU'},
#            {'Name':'Maria', 'Age':22, 'University': 'TUM'},
#            {'Name':'Jan', 'Age':21, 'University': 'TU Berlin'}]
details = {'Name':['Rafael','Silvia','Maria','Jan'], 'Age':[23,24,25,18],
           'University': ['TUM', 'TU Berlin', 'LMU','TUH']}

df = pd.DataFrame(details, columns=['Name', 'University', 'Age'])
df

Unnamed: 0,Name,University,Age
0,Rafael,TUM,23
1,Silvia,TU Berlin,24
2,Maria,LMU,25
3,Jan,TUH,18


#### 2.1. Create a DataFrame from a Dictionary

In [None]:
# dictionary with list object in values
details = {
  'Name': ['Rafael', 'Silvia', 'Maria', 'Jan'],
  'Age': [23, 21, 22, 21],
  'University': ['TUM', 'LMU', 'TUM', 'TU Berlin']
}

# creating a DataFrame object 
df = pd.DataFrame(details)
df

In this case, the index will be a 0-based enumeration of rows. 

#### 2.2. Create a DataFrame from a Dictionary of Series

Dictionary of Series can be passed to create a DataFrame.

The result index is the union of all the series indexes passed

In [None]:
d = {'one': pd.Series([1,2,3], index=['a','b','c']),
     'two': pd.Series([1,2,3,4], index=['a','b','c','d'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


Observe, for the series`one`, there is no label `d` passed,
but in the result, the `d` label is appended and associated with NaN.

#### 3. Create a DataFrame from a Numpy Array

In [None]:
a = np.arange(20).reshape(5,4)
df = pd.DataFrame(a, columns=['w','x','y','z'])
df

Unnamed: 0,w,x,y,z
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### Exercise

If we have multiple 1-dimensional lists, how can we create a DataFrame??

In [None]:
lst1 = ['Germany', 'France', 'Greece', 'Italy', 'Portugal', 'Spain']
lst2 = ['Berlin', 'Paris', 'Athene', 'Rome', 'Lisbon', 'Madrid']
lst3 = [83.7, 65.3, 10.4, 60.5, 10.2, 46.7]

**Solution 1**. Make a list of tuples using `zip()`

In [None]:
data = list(zip(lst1, lst2, lst3))
data

[('Germany', 'Berlin', 83.7),
 ('France', 'Paris', 65.3),
 ('Greece', 'Athene', 10.4),
 ('Italy', 'Rome', 60.5),
 ('Portugal', 'Lisbon', 10.2),
 ('Spain', 'Madrid', 46.7)]

In [None]:
df= pd.DataFrame(data, columns=['Country', 'Capital', 'Population'])
df

Unnamed: 0,Country,Capital,Population
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


**Solution 2**. Arrange the lists in a dictionary: keys are column names

In [None]:
dic = {'Country':lst1, 'Capital':lst2, 'Population': lst3}
df = pd.DataFrame(dic)
df

Unnamed: 0,Country,Capital,Population
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


### 2.2. Read/Load a DataFrame from a file

In [None]:
df = pd.read_csv('mydatafile.csv')
df

Unnamed: 0,Country,Capital,Population
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


### 2.3. Write/Save a DataFrame to a file

In [None]:
df.to_csv('mydatafile.csv', index=False)

### 2.4. Basic Operations

* `.shape`
* `head()` and `tail()`
* `.dtypes`
* `info()`
* `.columns`
* `rename()`
* `astype()`

* `to_numpy()`
* transpose `.T`

In [43]:
df = pd.read_csv('mydatafile.csv', header = 0, skiprows = 1)
df

Unnamed: 0,Germany,Berlin,83.7
0,France,Paris,65.3
1,Greece,Athene,10.4
2,Italy,Rome,60.5
3,Portugal,Lisbon,10.2
4,Spain,Madrid,46.7


#### 1. `.shape`

* `.shape` is an attribute of a pandas `Series` and `DataFrame`
    * dimensions (number of rows, number of columns)
* a pandas Series is 1-dim
    * only the number of rows is returned.

In [None]:
df.shape

(6, 3)

This means: the dataframe has 6 rows, and 3 columns

#### 2. `head()` and `tail()`

The function `head(n)` returns the first `n` rows for the object based on position
* It is useful for quickly looking at your data
* By default, `n` is 5 (select the first 5 rows)

In [None]:
df.head(3)

Unnamed: 0,Country,Capital,Population
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4


Similarly, the function `tail(n)` returns the last `n` rows for the object
* By default, `n` is 5 (select the first 5 rows)

In [None]:
df.tail(3)

Unnamed: 0,Country,Capital,Population
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


#### 3. `.dtypes`

* This returns a Series with the data type of each column.
* The result's index is the original DataFrame's columns.
* Columns with mixed types are stored with the `object` dtype.

In [None]:
df.Country.dtypes

dtype('O')

#### 4. `info()`

* The function `info()` prints a concise summary of a DataFrame.
* This method prints information about a DataFrame including:
    * index dtype
    * column dtypes
    * non-null values
    * memory usage

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Country     6 non-null      object 
 1   Capital     6 non-null      object 
 2   Population  6 non-null      float64
dtypes: float64(1), object(2)
memory usage: 272.0+ bytes


#### 5. `.columns`

The attribute `.columns` returns a list of the dataframe columns:

In [None]:
df.columns

Index(['Country', 'Capital', 'Population'], dtype='object')

We can use this attribute to change the column names:

In [None]:
df.columns = ['col1', 'col2', 'col3']
df

Unnamed: 0,col1,col2,col3
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


In [None]:
# reload the original dataframe 
df = pd.read_csv('mydatafile.csv')

#### 6. `rename()`

We can also use the function `rename()` to rename the columns

In [None]:
df.rename(
   columns={
       'Country':'col1',
       'Capital':'col2',
       'Population':'col3'
   }, inplace=True)
df

Unnamed: 0,col1,col2,col3
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


* The `rename()` function can be used for both row labels and column labels.
* Provide a dictionary with the keys the current names and the values the new names to update the corresponding names.

The mapping should not be restricted to fixed names only, but can be a mapping function as well.

For example, converting the column names to uppercase/lowercase letters:

In [None]:
df.rename(columns=str.upper)

Unnamed: 0,COL1,COL2,COL3
0,Germany,Berlin,83.7
1,France,Paris,65.3
2,Greece,Athene,10.4
3,Italy,Rome,60.5
4,Portugal,Lisbon,10.2
5,Spain,Madrid,46.7


#### 7. `astype()`

In [None]:
df = pd.DataFrame([(2.0, ' 3'), (4.0, '6'), (0.0, '12')], columns=['a','b'])
df

Unnamed: 0,a,b
0,2.0,3
1,4.0,6
2,0.0,12


In [None]:
df.dtypes

a    float64
b     object
dtype: object

In [None]:
df = df.astype(int)
df

Unnamed: 0,a,b
0,2,3
1,4,6
2,0,12


In [None]:
df.dtypes

a    int64
b    int64
dtype: object

In [None]:
df = pd.DataFrame([(2, ' 3'), (4, '6'), (0, '12')], columns=['a','b'])
df.dtypes

a     int64
b    object
dtype: object

In [None]:
df = df.astype({'a': float, 'b':int})
df.dtypes

a    float64
b      int64
dtype: object

#### 8. `to_numpy()`

In [None]:
df.to_numpy()

array([[ 2.,  3.],
       [ 4.,  6.],
       [ 0., 12.]])

In [16]:
a=[1,2,3]
b=[4,5]
c= zip(b,a)
for i in c:
  print(i)

(4, 1)
(5, 2)


In [86]:
a=np.array([1,2,3])
b=np.array([4,5])

c= pd.DataFrame([a,b], columns=['a','b','c'])
c

Unnamed: 0,a,b,c
0,1,2,3.0
1,4,5,


In [87]:
print(c.dtypes)

a      int64
b      int64
c    float64
dtype: object


In [88]:
c = c.astype(float)

In [89]:
c.dtypes

a    float64
b    float64
c    float64
dtype: object

In [90]:
c=c.astype({'a':int,'b': str})

In [91]:
c.dtypes

a      int64
b     object
c    float64
dtype: object

In [92]:
c['a'].astype(float)

0    1.0
1    4.0
Name: a, dtype: float64

In [93]:
c.dtypes

a      int64
b     object
c    float64
dtype: object