In [1]:
# Importing pandas and numpy libraries
import pandas as pd
import numpy as np

# Column labels for the customer table
column_names = [
    "Customer ID", "Customer Name", "2018 Revenue", "2019 Revenue", "Growth",
    "Start Year", "Start Month", "Start Day", "New Customer",
]

# One plain list per customer. Revenue and growth are stored as formatted
# strings ('€...' and '...%'), not as numbers.
row1 = [1001.0, 'Pandas Banking', '€235000', '€248000', '5.5%', 2013, 3, 10, 0]
row2 = [1002.0, 'Pandas Grocery', '€196000', '€205000', '4.5%', 2016, 4, 30, 0]
row3 = [1003.0, 'Pandas Telecom', '€167000', '€193000', '15.5%', 2010, 11, 24, 0]
row4 = [1004.0, 'Pandas Transport', '€79000', '€90000', '13.9%', 2018, 1, 15, 1]
row5 = [1005.0, 'Pandas Insurance', '€241000', '€264000', '9.5%', 2009, 6, 1, 0]

# Assemble the rows into a DataFrame
data_frame = pd.DataFrame([row1, row2, row3, row4, row5], columns=column_names)

# Display DataFrame values
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,Start Year,Start Month,Start Day,New Customer
0,1001.0,Pandas Banking,€235000,€248000,5.5%,2013,3,10,0
1,1002.0,Pandas Grocery,€196000,€205000,4.5%,2016,4,30,0
2,1003.0,Pandas Telecom,€167000,€193000,15.5%,2010,11,24,0
3,1004.0,Pandas Transport,€79000,€90000,13.9%,2018,1,15,1
4,1005.0,Pandas Insurance,€241000,€264000,9.5%,2009,6,1,0


In [2]:
# Add the two revenue columns element-wise (both still hold strings here)
data_frame['2018 Revenue'].add(data_frame['2019 Revenue'])

0    €235000€248000
1    €196000€205000
2    €167000€193000
3      €79000€90000
4    €241000€264000
dtype: object

In [3]:
# Display the dtype of every column. The revenue and growth columns are still
# object (strings), which is why adding them above joined the text together.
data_frame.dtypes

Customer ID      float64
Customer Name     object
2018 Revenue      object
2019 Revenue      object
Growth            object
Start Year         int64
Start Month        int64
Start Day          int64
New Customer       int64
dtype: object

In [4]:
# Display DataFrame info: index range, per-column non-null counts and dtypes,
# and memory usage
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Customer ID    5 non-null      float64
 1   Customer Name  5 non-null      object 
 2   2018 Revenue   5 non-null      object 
 3   2019 Revenue   5 non-null      object 
 4   Growth         5 non-null      object 
 5   Start Year     5 non-null      int64  
 6   Start Month    5 non-null      int64  
 7   Start Day      5 non-null      int64  
 8   New Customer   5 non-null      int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 488.0+ bytes


In [5]:
# Cast "Customer ID" from float to an integer dtype, then show the column
data_frame["Customer ID"] = data_frame["Customer ID"].astype(int)
data_frame["Customer ID"]

0    1001
1    1002
2    1003
3    1004
4    1005
Name: Customer ID, dtype: int64

In [6]:
# Try to convert "2018 Revenue" straight into an int. This is expected to fail
# while the values still carry the '€' prefix, because int() cannot parse them.
# The error is caught and printed so the notebook can still run from top to
# bottom; the column is left unchanged when the conversion fails.
try:
    data_frame['2018 Revenue'] = data_frame['2018 Revenue'].astype('int')
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

ValueError: invalid literal for int() with base 10: '€235000'

In [7]:
# Create function to remove a currency symbol such as '€'
def remove_currency(column, symbol='€'):
    """Convert one currency-formatted value to an ``int``.

    Despite its name, ``column`` is a single cell value: the function is
    written to be passed to ``Series.apply``, which calls it once per element.

    Parameters
    ----------
    column : str or number
        The value to convert, e.g. ``'€235000'``. Values that are not strings
        are passed to ``int()`` unchanged, so re-running the conversion on an
        already converted column does not fail.
    symbol : str, default '€'
        The currency symbol to strip from string values before parsing.

    Returns
    -------
    int
        The numeric value without the currency symbol.

    Raises
    ------
    ValueError
        If the text left after removing ``symbol`` is not a valid integer
        literal.
    """
    if not isinstance(column, str):
        # Already numeric: there is no symbol to strip.
        return int(column)
    return int(column.replace(symbol, ''))

In [8]:
# Strip the '€' from every "2018 Revenue" value and store the result as ints
data_frame['2018 Revenue'] = data_frame['2018 Revenue'].map(remove_currency)
data_frame['2018 Revenue']

0    235000
1    196000
2    167000
3     79000
4    241000
Name: 2018 Revenue, dtype: int64

In [9]:
# Strip the '€' from every "2019 Revenue" value and store the result as ints
data_frame['2019 Revenue'] = data_frame['2019 Revenue'].map(remove_currency)
data_frame['2019 Revenue']

0    248000
1    205000
2    193000
3     90000
4    264000
Name: 2019 Revenue, dtype: int64

In [10]:
# Create function to remove '%'
def remove_percentage(column):
    """Convert one percentage-formatted value to a ``float``.

    Despite its name, ``column`` is a single cell value: the function is
    written to be passed to ``Series.apply``, which calls it once per element.

    Parameters
    ----------
    column : str or number
        The value to convert, e.g. ``'5.5%'``. Values that are not strings are
        passed to ``float()`` unchanged, so re-running the conversion on an
        already converted column does not fail.

    Returns
    -------
    float
        The number in front of the percent sign (``'5.5%'`` gives ``5.5``);
        the value is not divided by 100.

    Raises
    ------
    ValueError
        If the text left after removing ``'%'`` is not a valid float literal.
    """
    if not isinstance(column, str):
        # Already numeric: there is no '%' to strip.
        return float(column)
    return float(column.replace('%', ''))

# Strip the '%' from every "Growth" value and store the result as floats
data_frame['Growth'] = data_frame['Growth'].map(remove_percentage)
data_frame['Growth']

0     5.5
1     4.5
2    15.5
3    13.9
4     9.5
Name: Growth, dtype: float64

In [11]:
# pd.to_datetime can assemble dates from a frame whose columns are named
# 'year', 'month' and 'day', so give the three start columns those names.
# rename() returns a new frame; rebinding the name avoids inplace=True.
data_frame = data_frame.rename(
    columns={'Start Year': 'year', 'Start Month': 'month', 'Start Day': 'day'}
)

# Combine the three columns into a single datetime column
data_frame['Starting Date'] = pd.to_datetime(data_frame[['year', 'month', 'day']])
data_frame['Starting Date']

0   2013-03-10
1   2016-04-30
2   2010-11-24
3   2018-01-15
4   2009-06-01
Name: Starting Date, dtype: datetime64[ns]

In [12]:
# Cast the 0/1 "New Customer" flag to a bool column, then show it
data_frame["New Customer"] = data_frame["New Customer"].astype(bool)
data_frame["New Customer"]

0    False
1    False
2    False
3     True
4    False
Name: New Customer, dtype: bool

In [13]:
# Cast "Customer Name" to pandas' categorical dtype, then show it
data_frame["Customer Name"] = data_frame["Customer Name"].astype(pd.CategoricalDtype())
data_frame["Customer Name"]

0      Pandas Banking
1      Pandas Grocery
2      Pandas Telecom
3    Pandas Transport
4    Pandas Insurance
Name: Customer Name, dtype: category
Categories (5, object): [Pandas Banking, Pandas Grocery, Pandas Insurance, Pandas Telecom, Pandas Transport]

In [14]:
# Display DataFrame dtypes again to check the result of the conversions above
data_frame.dtypes

Customer ID               int64
Customer Name          category
2018 Revenue              int64
2019 Revenue              int64
Growth                  float64
year                      int64
month                     int64
day                       int64
New Customer               bool
Starting Date    datetime64[ns]
dtype: object

In [15]:
# Add the two revenue columns element-wise (both are integers at this point)
data_frame['2018 Revenue'].add(data_frame['2019 Revenue'])

0    483000
1    401000
2    360000
3    169000
4    505000
dtype: int64

In [16]:
# Subtract the reference date '2020-09-01' from every 'Starting Date'
data_frame['Starting Date'].sub(pd.to_datetime('2020-09-01'))

0   -2732 days
1   -1585 days
2   -3569 days
3    -960 days
4   -4110 days
Name: Starting Date, dtype: timedelta64[ns]

In [17]:
# Remove 'year', 'month' and 'day'. errors='ignore' keeps the cell re-runnable
# after the columns are gone, and rebinding the name avoids inplace=True.
data_frame = data_frame.drop(columns=['year', 'month', 'day'], errors='ignore')

# Add null values: one missing cell per remaining column, addressed by row
# label and column name rather than by bare positions.
data_frame.loc[0, 'Customer ID'] = None
data_frame.loc[4, 'Customer Name'] = None
data_frame.loc[2, '2018 Revenue'] = None
data_frame.loc[3, '2019 Revenue'] = None
data_frame.loc[3, 'Growth'] = None
data_frame.loc[1, 'New Customer'] = None
data_frame.loc[2, 'Starting Date'] = None

# Display the Dataframe
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,New Customer,Starting Date
0,,Pandas Banking,235000.0,248000.0,5.5,0.0,2013-03-10
1,1002.0,Pandas Grocery,196000.0,205000.0,4.5,,2016-04-30
2,1003.0,Pandas Telecom,,193000.0,15.5,0.0,NaT
3,1004.0,Pandas Transport,79000.0,,,1.0,2018-01-15
4,1005.0,,241000.0,264000.0,9.5,0.0,2009-06-01


In [18]:
# Display DataFrame info: with the nulls in place every column reports
# 4 non-null values, and "Customer ID", both revenue columns and
# "New Customer" are now float64
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Customer ID    4 non-null      float64       
 1   Customer Name  4 non-null      category      
 2   2018 Revenue   4 non-null      float64       
 3   2019 Revenue   4 non-null      float64       
 4   Growth         4 non-null      float64       
 5   New Customer   4 non-null      float64       
 6   Starting Date  4 non-null      datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(5)
memory usage: 573.0 bytes


In [19]:
# Try to convert "Customer ID" into an int. This is expected to fail while the
# column contains a missing value, because a plain integer dtype cannot hold
# NaN. The error is caught and printed so the notebook can still run from top
# to bottom; the column is left unchanged when the conversion fails.
# (pandas' nullable 'Int64' dtype can store integers next to missing values.)
try:
    data_frame["Customer ID"] = data_frame['Customer ID'].astype('int')
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [20]:
# Cast "Customer Name" back to the generic object dtype, then show it
data_frame["Customer Name"] = data_frame["Customer Name"].astype(object)
data_frame["Customer Name"]

0      Pandas Banking
1      Pandas Grocery
2      Pandas Telecom
3    Pandas Transport
4                 NaN
Name: Customer Name, dtype: object

In [21]:
# Cast "New Customer" to bool. Note the result for row 1: its missing value
# comes out as True rather than staying missing.
data_frame["New Customer"] = data_frame["New Customer"].astype(bool)
data_frame["New Customer"]

0    False
1     True
2    False
3     True
4    False
Name: New Customer, dtype: bool

In [22]:
# Subtract the reference date '2020-09-01' from every 'Starting Date'
# (the missing date yields NaT)
data_frame['Starting Date'].sub(pd.to_datetime('2020-09-01'))

0   -2732 days
1   -1585 days
2          NaT
3    -960 days
4   -4110 days
Name: Starting Date, dtype: timedelta64[ns]

In [23]:
# Importing pandas and numpy libraries
import pandas as pd
import numpy as np

# Column labels for the customer table
column_names = [
    "Customer ID", "Customer Name", "2018 Revenue", "2019 Revenue", "Growth",
    "Start Year", "Start Month", "Start Day", "New Customer",
]

# One plain list per customer. Revenue and growth are digit-only strings here
# (no '€' or '%'), but they are still strings rather than numbers.
row1 = [1001.0, 'Pandas Banking', '235000', '248000', '5.5', 2013, 3, 10, 0]
row2 = [1002.0, 'Pandas Grocery', '196000', '205000', '4.5', 2016, 4, 30, 0]
row3 = [1003.0, 'Pandas Telecom', '167000', '193000', '15.5', 2010, 11, 24, 0]
row4 = [1004.0, 'Pandas Transport', '79000', '90000', '13.9', 2018, 1, 15, 1]
row5 = [1005.0, 'Pandas Insurance', '241000', '264000', '9.5', 2009, 6, 1, 0]

# Assemble the rows into a DataFrame
data_frame = pd.DataFrame([row1, row2, row3, row4, row5], columns=column_names)

# Display DataFrame values
data_frame

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth,Start Year,Start Month,Start Day,New Customer
0,1001.0,Pandas Banking,235000,248000,5.5,2013,3,10,0
1,1002.0,Pandas Grocery,196000,205000,4.5,2016,4,30,0
2,1003.0,Pandas Telecom,167000,193000,15.5,2010,11,24,0
3,1004.0,Pandas Transport,79000,90000,13.9,2018,1,15,1
4,1005.0,Pandas Insurance,241000,264000,9.5,2009,6,1,0


In [24]:
# Display the dtype of every column of the rebuilt DataFrame
data_frame.dtypes

Customer ID      float64
Customer Name     object
2018 Revenue      object
2019 Revenue      object
Growth            object
Start Year         int64
Start Month        int64
Start Day          int64
New Customer       int64
dtype: object

In [25]:
# select_dtypes(): keep only the columns whose dtype is object
data_frame.select_dtypes(include=['object'])

Unnamed: 0,Customer Name,2018 Revenue,2019 Revenue,Growth
0,Pandas Banking,235000,248000,5.5
1,Pandas Grocery,196000,205000,4.5
2,Pandas Telecom,167000,193000,15.5
3,Pandas Transport,79000,90000,13.9
4,Pandas Insurance,241000,264000,9.5


In [26]:
# select_dtypes(): keep numeric and object columns, but leave out int64 ones
data_frame.select_dtypes(include=['number', 'object'], exclude=['int64'])

Unnamed: 0,Customer ID,Customer Name,2018 Revenue,2019 Revenue,Growth
0,1001.0,Pandas Banking,235000,248000,5.5
1,1002.0,Pandas Grocery,196000,205000,4.5
2,1003.0,Pandas Telecom,167000,193000,15.5
3,1004.0,Pandas Transport,79000,90000,13.9
4,1005.0,Pandas Insurance,241000,264000,9.5
