# Data types

* Some functions and methods are only applicable to certain data types

In [1]:
import pandas as pd

df = pd.read_csv("Data/apple_google_stock_prices_092022.csv")

df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close,Stock
0,2022-09-01,158.419998,154.669998,156.639999,157.960007,74229900,157.960007,AAPL
1,2022-09-02,160.360001,154.970001,159.75,155.809998,76905200,155.809998,AAPL
2,2022-09-06,157.089996,153.690002,156.470001,154.529999,73714800,154.529999,AAPL
3,2022-09-07,156.669998,153.610001,154.820007,155.960007,87449600,155.960007,AAPL
4,2022-09-08,156.360001,152.679993,154.639999,154.460007,84923800,154.460007,AAPL


## Exercise 1 - dtype method

In [2]:
df.dtypes

Date          object
High         float64
Low          float64
Open         float64
Close        float64
Volume         int64
Adj Close    float64
Stock         object
dtype: object

## Exercise 2 - dtype method

In [3]:
df["Low"].dtypes

dtype('float64')

In [4]:
type(df["Low"].dtypes)

numpy.dtype[float64]

In [5]:
df[["Date", "Low"]].dtypes

Date     object
Low     float64
dtype: object

## Exercise 3 - astype function

* Used for changing the data types of columns

In [6]:
df["Date"] = df["Date"].astype("datetime64[ns]")

In [7]:
df.dtypes

Date         datetime64[ns]
High                float64
Low                 float64
Open                float64
Close               float64
Volume                int64
Adj Close           float64
Stock                object
dtype: object

## Exercise 4 - astype function

* Change the data type of multiple columns at once

In [8]:
df = df.astype(
    {
        "Volume": "float64",
        "Stock": "string"
    }
)

In [9]:
df[["Volume", "Stock"]].dtypes

Volume           float64
Stock     string[python]
dtype: object

## Exercise 5 - select_dtypes function

In [10]:
df.dtypes

Date         datetime64[ns]
High                float64
Low                 float64
Open                float64
Close               float64
Volume              float64
Adj Close           float64
Stock        string[python]
dtype: object

In [11]:
df.select_dtypes(include="float64").head()

Unnamed: 0,High,Low,Open,Close,Volume,Adj Close
0,158.419998,154.669998,156.639999,157.960007,74229900.0,157.960007
1,160.360001,154.970001,159.75,155.809998,76905200.0,155.809998
2,157.089996,153.690002,156.470001,154.529999,73714800.0,154.529999
3,156.669998,153.610001,154.820007,155.960007,87449600.0,155.960007
4,156.360001,152.679993,154.639999,154.460007,84923800.0,154.460007


## Exercise 6 - select_dtypes function

In [12]:
df.select_dtypes(exclude="float64").head()

Unnamed: 0,Date,Stock
0,2022-09-01,AAPL
1,2022-09-02,AAPL
2,2022-09-06,AAPL
3,2022-09-07,AAPL
4,2022-09-08,AAPL


## Exercise 7 - string data type

* Before pandas 1.0, only “object” datatype was used to store strings which cause some drawbacks because non-string data can also be stored using “object” datatype. 
* Pandas 1.0 introduces a new datatype specific to string data which is StringDtype. 
* As of now, we can still use object or StringDtype to store strings but in the future, we may be required to only use StringDtype.

In [13]:
names = pd.Series(["Jane", "Matt", "Emily", "Joe"])

names

0     Jane
1     Matt
2    Emily
3      Joe
dtype: object

In [14]:
names = pd.Series(["Jane", "Matt", "Emily", "Joe"], dtype="string")

names

0     Jane
1     Matt
2    Emily
3      Joe
dtype: string

In [15]:
names = pd.Series(["Jane", "Matt", "Emily", "Joe"], dtype=pd.StringDtype())

names

0     Jane
1     Matt
2    Emily
3      Joe
dtype: string

## Exercise 8 - Int64Dtype

In [16]:
scores = pd.Series([98, 74, 87, None])

scores

0    98.0
1    74.0
2    87.0
3     NaN
dtype: float64

In [17]:
scores = pd.Series([98, 74, 87, None], dtype=pd.Int64Dtype())

scores

0      98
1      74
2      87
3    <NA>
dtype: Int64

## Exercise 9 - categorical data

* Categorical data represents a finite set of discrete values.
    
* So a categorical variable takes on a value from a limited number of values.
    
* Some examples of categorical data are car brands, dress sizes, and colors.
    
* Pandas has the category data type for categorical variables

In [18]:
brands = pd.Series(["Ford", "Toyota", "BMW"], dtype="category")

brands

0      Ford
1    Toyota
2       BMW
dtype: category
Categories (3, object): ['BMW', 'Ford', 'Toyota']

## Exercise 10 - category data type

In [19]:
pd.Categorical(["Ford", "Toyota", "BMW"])

['Ford', 'Toyota', 'BMW']
Categories (3, object): ['BMW', 'Ford', 'Toyota']

In [20]:
pd.Categorical(["Ford", "Toyota", "BMW"], categories=["Ford", "Toyota", "BMW", "Nissan"])

['Ford', 'Toyota', 'BMW']
Categories (4, object): ['Ford', 'Toyota', 'BMW', 'Nissan']

In [21]:
import numpy as np

cars = pd.DataFrame(
    
    {
        "brands": pd.Categorical(["Ford", "Toyota", "BMW"] * 100, categories=["Ford", "Toyota", "BMW"]),
        "price": np.random.randint(10000, 20000, size=300)
    }

)

cars.head()

Unnamed: 0,brands,price
0,Ford,18513
1,Toyota,18164
2,BMW,17669
3,Ford,15041
4,Toyota,17723


In [22]:
cars.dtypes

brands    category
price        int64
dtype: object

## Exercise 11 - category data type

In [23]:
cars.memory_usage()

Index      128
brands     432
price     2400
dtype: int64

In [24]:
cars["brands"] = cars["brands"].astype("string")

cars.memory_usage()

Index      128
brands    2400
price     2400
dtype: int64

In [25]:
cars["brands"] = cars["brands"].astype("object")

cars.memory_usage()

Index      128
brands    2400
price     2400
dtype: int64

## Exercise 12 - category data type

In [26]:
cars = pd.DataFrame(
    
    {
        "brands": pd.Categorical(["Ford", "Toyota", "BMW"] * 100, categories=["Ford", "Toyota", "BMW"]),
        "price": np.random.randint(10000, 20000, size=300)
    }

)

cars.head()

Unnamed: 0,brands,price
0,Ford,10045
1,Toyota,14686
2,BMW,18662
3,Ford,10518
4,Toyota,17678


In [27]:
# cars.iloc[0, 0] = "Nissan"

# output
# TypeError: Cannot setitem on a Categorical with a new category (Nissan), set the categories first

In [28]:
cars = pd.DataFrame(
    
    {
        "brands": pd.Categorical(["Ford", "Toyota", "BMW"] * 100, categories=["Ford", "Toyota", "BMW", "Nissan"]),
        "price": np.random.randint(10000, 20000, size=300)
    }

)

cars.head()

Unnamed: 0,brands,price
0,Ford,18701
1,Toyota,11167
2,BMW,16634
3,Ford,16424
4,Toyota,15190


In [29]:
cars["brands"].value_counts()

brands
Ford      100
Toyota    100
BMW       100
Nissan      0
Name: count, dtype: int64

In [30]:
cars.iloc[0, 0] = "Nissan"

cars["brands"].value_counts()

brands
Toyota    100
BMW       100
Ford       99
Nissan      1
Name: count, dtype: int64

## Exercise 13 - add_categories

* cat: Accessor object for categorical properties of the Series values

In [31]:
cars["brands"] = cars["brands"].cat.add_categories("Tesla")

cars["brands"].value_counts()

brands
Toyota    100
BMW       100
Ford       99
Nissan      1
Tesla       0
Name: count, dtype: int64

## Exercise 14 - infer_objects function

* Attempt to infer better dtypes for object columns.

In [32]:
ser = pd.Series([1, 2, 5, 3, 11, 20], dtype="object")

ser

0     1
1     2
2     5
3     3
4    11
5    20
dtype: object

In [33]:
ser = ser.infer_objects()

ser

0     1
1     2
2     5
3     3
4    11
5    20
dtype: int64

## Exercise 15 - infer_objects function

In [34]:
df = pd.DataFrame(
    
    {
        "col1": [1, 2, 3, 4, 5],
        "col2": ["Jane", "Max", "Matt", "Emily", "John"],
        "col3": [True, True, False, True, False],
        "col4": [1.2, 1.54, 3.2, 1.9, 4.2]
    },
    
    dtype="object"

)

df

Unnamed: 0,col1,col2,col3,col4
0,1,Jane,True,1.2
1,2,Max,True,1.54
2,3,Matt,False,3.2
3,4,Emily,True,1.9
4,5,John,False,4.2


In [35]:
df.dtypes

col1    object
col2    object
col3    object
col4    object
dtype: object

In [36]:
df = df.infer_objects()

df

Unnamed: 0,col1,col2,col3,col4
0,1,Jane,True,1.2
1,2,Max,True,1.54
2,3,Matt,False,3.2
3,4,Emily,True,1.9
4,5,John,False,4.2


In [37]:
df.dtypes

col1      int64
col2     object
col3       bool
col4    float64
dtype: object