# An introduction to Pandas


### Datasets

In [1]:
import pandas as pd 
# Column-oriented input: each key names a column, each list holds that column's values.
mydataset = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

# Turn the mapping into a two-column table; no index is given, so rows are numbered from 0.
myvar = pd.DataFrame(data=mydataset)
print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


### Series are just like arrays

In [2]:
# A plain Python list supplies the values of the Series.
a = [1, 7, 2]
# No index is passed, so the values are labelled 0, 1, 2.
myvar = pd.Series(data=a)
print(myvar)

0    1
1    7
2    2
dtype: int64


### You can set the index labels of a Series as you wish

In [3]:
a = [1, 7, 2]
# Replace the default 0..2 labels with custom string labels, one per value.
myvar = pd.Series(data=a, index=["x", "y", "z"])
print(myvar)

x    1
y    7
z    2
dtype: int64


In [4]:
# Look up a single value in the Series by its index label.
print(myvar["y"])

7


# What is a dictionary in python?

### Dictionaries are used to store data values in key:value pairs.
### A dictionary is a collection which is ordered*, changeable, and does not allow duplicate keys.

### As of Python version 3.7, dictionaries are ordered. In Python 3.6 and earlier, dictionaries are unordered.

In [5]:
### Duplicate keys are not allowed: when a key is repeated in the literal,
### the later value overwrites the earlier one ("year" ends up as 2020).

thisdict = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964,
  "year": 2020
}
print(thisdict)

{'brand': 'Ford', 'model': 'Mustang', 'year': 2020}


In [6]:
# Print the number of items (key/value pairs) in the dictionary:

print(len(thisdict))

3


### There are four collection data types in the Python programming language:

### - List is a collection which is ordered and changeable. Allows duplicate members.
### - Tuple is a collection which is ordered and unchangeable. Allows duplicate members.
### - Set is a collection which is unordered, unchangeable*, and unindexed. No duplicate members.
### - Dictionary is a collection which is ordered** and changeable. No duplicate members.

###  *Set items are unchangeable, but you can remove and/or add items whenever you like.
### **As of Python version 3.7, dictionaries are ordered. In Python 3.6 and earlier, dictionaries are unordered.

### You can also use a key/value object, like a dictionary, when creating a Series.
### The keys of the dictionary become the labels.

In [7]:
# A small dictionary describing one car.
thisdict = dict(brand="Ford", model="Mustang", year=1964)

# Values are retrieved by their key.
print(thisdict["brand"])

Ford


### To select only some of the items in the dictionary, use the index argument and specify only the items you want to include in the Series.

In [8]:
calories = {"day1": 420, "day2": 380, "day3": 390}
# Only the keys listed in `index` are taken from the dictionary; "day3" is left out.
myvar = pd.Series(data=calories, index=["day1", "day2"])
print(myvar)

day1    420
day2    380
dtype: int64


In [9]:
# The dictionary keys become the labels of the Series.
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(data=calories)
print(myvar)

day1    420
day2    380
day3    390
dtype: int64


# DataFrames

### Data sets in Pandas are usually multi-dimensional tables, called DataFrames.
### Series is like a column, a DataFrame is the whole table.

In [10]:
# Two equally long lists, one per column of the table.
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
# Each dictionary key becomes a column; rows get the default 0..2 index.
myvar = pd.DataFrame(data=data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


In [11]:
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

# Load the column data into a DataFrame object; later cells read rows from `df`.
df = pd.DataFrame(data=data)
print(df)

   calories  duration
0       420        50
1       380        40
2       390        45


### As you can see from the result above, the DataFrame is like a table with rows and columns.
### Pandas uses the loc attribute to return one or more specified row(s)

In [12]:
# loc selects by index label; a single label (here 0) returns that row as a Series.
print(df.loc[0])

calories    420
duration     50
Name: 0, dtype: int64


In [13]:
# Passing a list of index labels to loc returns the matching rows as a DataFrame.
print(df.loc[[0, 1]])

   calories  duration
0       420        50
1       380        40


## Named Indexes
### With the index argument, you can name your own indexes.

In [14]:
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
# One label per row replaces the default 0..2 index.
df = pd.DataFrame(data=data, index=["day1", "day2", "day3"])
print(df)

      calories  duration
day1       420        50
day2       380        40
day3       390        45


## Locate Named Indexes
### Use the named index in the loc attribute to return the specified row(s).

In [15]:
# Select the row whose index label is "day2"; the result is printed as a Series.
print(df.loc["day2"])

calories    380
duration     40
Name: day2, dtype: int64


# Load Files Into a DataFrame
### If your data sets are stored in a file, Pandas can load them into a DataFrame.

### Read CSV Files
#### A simple way to store big data sets is to use CSV files (comma-separated values files).

In [18]:
# Load the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')
# to_string() renders the entire DataFrame as text, so every row is printed.
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

#### If you have a large DataFrame with many rows, Pandas will only return the first 5 rows, and the last 5 rows:

In [19]:

# Re-read the CSV and print the DataFrame directly; for a long table pandas
# abbreviates the printed output to the first and last rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
print(df)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
..        ...    ...       ...       ...
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4

[169 rows x 4 columns]


### max_rows
#### The number of rows returned is defined in Pandas option settings.

#### You can check your system's maximum rows with the pd.options.display.max_rows statement.

In [20]:
# Show the current value of the display.max_rows option.
print(pd.options.display.max_rows) 

60


#### This means that if the DataFrame contains more than 60 rows, the print(df) statement will return only the headers and the first and last 5 rows.
#### You can change the maximum rows number with the same statement.

In [22]:
# Raise the display limit so that print(df) no longer abbreviates the table.
# This changes a pandas-wide option and stays in effect for later cells.
pd.options.display.max_rows = 9999

df = pd.read_csv(filepath_or_buffer='data.csv')
print(df)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

### Read JSON
#### Big data sets are often stored or extracted as JSON.

In [24]:
# Parse the JSON file into a DataFrame (path is relative to the working directory).
df = pd.read_json(path_or_buf='data.js')
# to_string() renders the entire DataFrame as text, so every row is printed.
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

### JSON = Python Dictionary
### JSON objects have the same format as Python dictionaries.

### If your JSON code is not in a file, but in a Python Dictionary, you can load it into a DataFrame directly:

In [25]:
# JSON-style nested mapping: outer keys are column names, inner keys are the
# row labels (strings "0".."5"), inner values are the cell contents.
data = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
    "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
    "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300},
}

# A dict of dicts can be handed to the DataFrame constructor directly.
df = pd.DataFrame(data=data)
print(df)

   Duration  Pulse  Maxpulse  Calories
0        60    110       130       409
1        60    117       145       479
2        60    103       135       340
3        45    109       175       282
4        45    117       148       406
5        60    102       127       300


## Viewing the Data
#### One of the most used methods for getting a quick overview of a DataFrame is the head() method.
#### The head() method returns the headers and a specified number of rows, starting from the top.

In [27]:
df = pd.read_csv(filepath_or_buffer='data.csv')
# Quick overview: head(10) keeps only the first 10 rows of the DataFrame.
print(df.head(n=10))

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
5        60    102       127     300.0
6        60    110       136     374.0
7        45    104       134     253.3
8        30    109       133     195.1
9        60     98       124     269.0


### If the number of rows is not specified, the head() method will return the top 5 rows.

In [29]:

# Re-load the CSV; called without an argument, head() shows the top 5 rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
print(df.head())

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0


### There is also a tail() method for viewing the last rows of the DataFrame.

### The tail() method returns the headers and a specified number of rows, starting from the bottom.

In [30]:
# Called without an argument, tail() shows the last 5 rows.
print(df.tail()) 

     Duration  Pulse  Maxpulse  Calories
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4


## Info About the Data
### The DataFrame object has a method called info() that gives you more information about the data set.

In [31]:
# info() writes its summary (index range, columns, non-null counts, dtypes,
# memory usage) to stdout itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
None
