#### Transforming data types

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'string_col':['1','2','3','4'],
    'int_float': [1,2,3.2,4.6],
    'mix_col': ['a', '-2', 3, 4],
    'missing': [1,2,3,np.nan],
    'money': ['$1,500.00', '$3,436.45','$958.99', '$1,198.00']
}

df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,string_col,int_float,mix_col,missing,money
0,1,1.0,a,1.0,"$1,500.00"
1,2,2.0,-2,2.0,"$3,436.45"
2,3,3.2,3,3.0,$958.99
3,4,4.6,4,,"$1,198.00"


In [3]:
df.dtypes

string_col     object
int_float     float64
mix_col        object
missing       float64
money          object
dtype: object

In [4]:
df['string_col'] = df.string_col.astype(int)

In [5]:
df.dtypes

string_col      int32
int_float     float64
mix_col        object
missing       float64
money          object
dtype: object

In [6]:
# roundup to 0 decimal values
df['int_float'] = df.int_float.round(0).astype(int) 

In [7]:
df.dtypes

string_col      int32
int_float       int32
mix_col        object
missing       float64
money          object
dtype: object

In [8]:
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,string_col,int_float,mix_col,missing,money
0,1,1.0,a,1.0,"$1,500.00"
1,2,2.0,-2,2.0,"$3,436.45"
2,3,3.2,3,3.0,$958.99
3,4,4.6,4,,"$1,198.00"


In [9]:
df.int_float.round(0) #<-- Still float

0    1.0
1    2.0
2    3.0
3    5.0
Name: int_float, dtype: float64

In [10]:
df['int_float'] = pd.to_numeric(df['int_float'].round(0), downcast='integer')
df['int_float'].dtypes

dtype('int8')

In [11]:
# if we come across non numeric value errors='coerce' will convert it to NaN value
df['mix_col'] = pd.to_numeric(df['mix_col'], errors='coerce').astype('Int64')
df['mix_col'].dtypes

Int64Dtype()

In [12]:
df.head()

Unnamed: 0,string_col,int_float,mix_col,missing,money
0,1,1,,1.0,"$1,500.00"
1,2,2,-2.0,2.0,"$3,436.45"
2,3,3,3.0,3.0,$958.99
3,4,5,4.0,,"$1,198.00"


In [13]:
df['missing'] = df['missing'].fillna(df['missing'].median())
df['missing']

0    1.0
1    2.0
2    3.0
3    2.0
Name: missing, dtype: float64

In [14]:
df['money'] = df['money'].str.replace('$','').str.replace(',','')
df['money']

  df['money'] = df['money'].str.replace('$','').str.replace(',','')


0    1500.00
1    3436.45
2     958.99
3    1198.00
Name: money, dtype: object

In [15]:
df['money'] = pd.to_numeric(df['money'])
df['money']

0    1500.00
1    3436.45
2     958.99
3    1198.00
Name: money, dtype: float64

In [16]:
df = pd.DataFrame(data=data)
df['money'] = df['money'].str.replace('[\$\,]','', regex=True)
df['money'] = pd.to_numeric(df['money']) #.round(0).astype('int')

In [17]:
df['money']

0    1500.00
1    3436.45
2     958.99
3    1198.00
Name: money, dtype: float64

#### Working with Date time

In [18]:
df = pd.read_csv('https://bit.ly/3oCOgE8')
df.head()

Unnamed: 0,date,p1_sales,p2_sales
0,1/1/2020,39,47
1,1/2/2020,245,314
2,1/3/2020,174,215
3,1/4/2020,58,198
4,1/5/2020,66,77


In [19]:
df.dtypes

date        object
p1_sales    object
p2_sales     int64
dtype: object

In [20]:
df['date'] = pd.to_datetime(df['date'])
df.dtypes                                                                                                                                                   

date        datetime64[ns]
p1_sales            object
p2_sales             int64
dtype: object

In [21]:
dates = pd.DataFrame({'date':['1/7/2020','2/9/2020','3/5/2020'],'col':[1,2,3]})
dates

Unnamed: 0,date,col
0,1/7/2020,1
1,2/9/2020,2
2,3/5/2020,3


##### Pandas by default takes the first value before / to be month mm/dd/yyyy
##### Some countries follow dd/mm/yyyy 

In [22]:
dates['converted'] = pd.to_datetime(df['date'])

In [23]:
dates['converted'].dt.month_name()

0    January
1    January
2    January
Name: converted, dtype: object

In [24]:
df = pd.DataFrame({'date':['1/7/2020','2/9/2020','3/5/2020'],'col':[1,2,3]})
df['converted'] = pd.to_datetime(df['date'], format='%d/%m/%Y')   
df['converted'].dt.month_name()

0         July
1    September
2          May
Name: converted, dtype: object

#### Note:
1. if date is described as 7-3-2020, we use format='%d-%m-%Y' 
2. If date is described as 7-3-20, meaning year has only 2 digits, we use 'y' instead of 'Y'

In [25]:
df['converted'].dt.year

0    2020
1    2020
2    2020
Name: converted, dtype: int64

In [26]:
df['converted'].dt.day

0    1
1    2
2    3
Name: converted, dtype: int64

In [27]:
df['converted'].dt.month

0    7
1    9
2    5
Name: converted, dtype: int64

In [28]:
datetime = pd.DataFrame({'date_time':['1/1/2020 01:30:27','2/2/2020 06:37:27','3/3/2020 09:40:27']})
datetime

Unnamed: 0,date_time
0,1/1/2020 01:30:27
1,2/2/2020 06:37:27
2,3/3/2020 09:40:27


In [29]:
datetime['converted'] = pd.to_datetime(datetime['date_time'], format='%d/%m/%Y %H:%M:%S')

In [30]:
datetime.head()

Unnamed: 0,date_time,converted
0,1/1/2020 01:30:27,2020-01-01 01:30:27
1,2/2/2020 06:37:27,2020-02-02 06:37:27
2,3/3/2020 09:40:27,2020-03-03 09:40:27


In [31]:
df_dates = pd.read_csv('https://bit.ly/3oCOgE8', date_parser='%m/%d/%Y')
df_dates.dtypes

date        object
p1_sales    object
p2_sales     int64
dtype: object

In [32]:
df_dates.head()

Unnamed: 0,date,p1_sales,p2_sales
0,1/1/2020,39,47
1,1/2/2020,245,314
2,1/3/2020,174,215
3,1/4/2020,58,198
4,1/5/2020,66,77
