**Data Cleaning**

Dirty data could include the following: missing values, duplicate values, and data in wrong format

In [1]:
import pandas as pd
import numpy as np

In [2]:
# importing the data
d = pd.read_csv("data_2.csv")

In [3]:
d.head()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0


In [4]:
d

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [5]:
d.shape

(32, 5)

In [6]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [7]:
d.describe()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
count,32.0,32.0,32.0,30.0
mean,68.4375,103.5,128.5,304.68
std,70.039591,7.832933,12.998759,66.003779
min,30.0,90.0,101.0,195.1
25%,60.0,100.0,120.0,250.7
50%,60.0,102.5,127.5,291.2
75%,60.0,106.5,132.25,343.975
max,450.0,130.0,175.0,479.0


In [8]:
print(d.columns)

Index(['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories'], dtype='object')


In [9]:
# checking for null values
d.isnull().any()

Duration    False
Date         True
Pulse       False
Maxpulse    False
Calories     True
dtype: bool

In [10]:
# checking the number of null values
d.isnull().sum()

Duration    0
Date        1
Pulse       0
Maxpulse    0
Calories    2
dtype: int64

In [11]:
d["Date"].isnull().any()

np.True_

In [12]:
d["Date"].isnull()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22     True
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
Name: Date, dtype: bool

In [13]:
d[d["Date"].isnull()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
22,45,,100,119,282.0


In [14]:
d["Calories"].isnull().any()

np.True_

In [15]:
d["Calories"].isnull()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18     True
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28     True
29    False
30    False
31    False
Name: Calories, dtype: bool

In [16]:
d[d["Calories"].isnull()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
18,45,'2020/12/18',90,112,
28,60,'2020/12/28',103,132,


In [17]:
# dropping null values (.reset_index())
data = d.dropna()

In [18]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [19]:
# d.fillna(10)  Not ideal, be specific with the column you want to fill it with

In [20]:
# d["Calories"].fillna(130)

In [21]:
# To replace with mean, median or mode

In [22]:
# d["Calories"].mean()

In [23]:
# d["Calories"].median()

In [24]:
# d["Calories"].mode() # returns all modes in a situation where there are multiple modes

In [25]:
# d["Calories"].mode() #returns the first mode from the list of all modes

In [26]:
# working on wront format
# To fix that, either delete the rows, or convert all cells in the column into the same format.

In [27]:
data.shape

(29, 5)

In [28]:
# To convert the date column to date
data["Date"].info()

<class 'pandas.core.series.Series'>
Index: 29 entries, 0 to 31
Series name: Date
Non-Null Count  Dtype 
--------------  ----- 
29 non-null     object
dtypes: object(1)
memory usage: 464.0+ bytes


In [29]:
data["Date"].head(10)

0         '2020/12/01' 
1          '2020/12/02'
2         '2020/12/03' 
3          '2020/12/04'
4          '2020/12/05'
5         '2020/12/06' 
6          '2020/12/07'
7      '2020/12/08'    
8      '2020/12/09'    
9     '2020/12/10'     
Name: Date, dtype: object

In [30]:
data["Date"] = data["Date"].str.replace("/", "-")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Date"] = data["Date"].str.replace("/", "-")


In [31]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020-12-01',110,130,409.1
1,60,'2020-12-02',117,145,479.0
2,60,'2020-12-03',103,135,340.0
3,45,'2020-12-04',109,175,282.4
4,45,'2020-12-05',117,148,406.0
5,60,'2020-12-06',102,127,300.0
6,60,'2020-12-07',110,136,374.0
7,450,'2020-12-08',104,134,253.3
8,30,'2020-12-09',109,133,195.1
9,60,'2020-12-10',98,124,269.0


In [32]:
data["Date"] = data["Date"].str.strip(" ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Date"] = data["Date"].str.strip(" ")


In [33]:
data["Date"] = data["Date"].str.replace("-", "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Date"] = data["Date"].str.replace("-", "")


In [34]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'20201201',110,130,409.1
1,60,'20201202',117,145,479.0
2,60,'20201203',103,135,340.0
3,45,'20201204',109,175,282.4
4,45,'20201205',117,148,406.0
5,60,'20201206',102,127,300.0
6,60,'20201207',110,136,374.0
7,450,'20201208',104,134,253.3
8,30,'20201209',109,133,195.1
9,60,'20201210',98,124,269.0


In [35]:
data["Date"].head(10)

0    '20201201'
1    '20201202'
2    '20201203'
3    '20201204'
4    '20201205'
5    '20201206'
6    '20201207'
7    '20201208'
8    '20201209'
9    '20201210'
Name: Date, dtype: object

In [59]:
data.loc[26].drop(columns = "Date", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[26].drop(columns = "Date", inplace = True)


In [60]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,45,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [39]:
data["Date"] = pd.to_datetime(data["Date"], format = "mixed")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Date"] = pd.to_datetime(data["Date"], format = "mixed")


In [40]:
data["Date"].info()

<class 'pandas.core.series.Series'>
Index: 29 entries, 0 to 31
Series name: Date
Non-Null Count  Dtype         
--------------  -----         
29 non-null     datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 1.5 KB


In [41]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [42]:
# row 22 isn't a date thereby giving us NaT
# d.dropna(subset = ["Date"]) # to remove the row

In [43]:
# to insert the actual date on row 22 either by using "loc" or "at"to call out the index
# d.loc[22, "Date"] = pd.to_datetime(2020-12-22)

In [44]:
# to insert the actual date on row 22 either by using "loc" or "at"to call out the index
# d.at[22, "Date"] = pd.to_datetime(2020-12-22)

In [45]:
data.at[7, "Duration"] = 45

In [46]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,45,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [47]:
# Removing duplicates
data.duplicated().any()

np.True_

In [48]:
data.duplicated().sum()

np.int64(1)

In [49]:
data[data.duplicated()]

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
12,60,2020-12-12,100,120,250.7


In [50]:
data = data.drop_duplicates()

In [51]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,45,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [52]:
# axis parameters
# axis = 0 index(rows) by default
# axis = 1 columns

In [53]:
data["dtion"] = data["Pulse"] + data["Maxpulse"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["dtion"] = data["Pulse"] + data["Maxpulse"]


In [54]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories,dtion
0,60,2020-12-01,110,130,409.1,240
1,60,2020-12-02,117,145,479.0,262
2,60,2020-12-03,103,135,340.0,238
3,45,2020-12-04,109,175,282.4,284
4,45,2020-12-05,117,148,406.0,265
5,60,2020-12-06,102,127,300.0,229
6,60,2020-12-07,110,136,374.0,246
7,45,2020-12-08,104,134,253.3,238
8,30,2020-12-09,109,133,195.1,242
9,60,2020-12-10,98,124,269.0,222


In [55]:
data = data.drop("dtion", axis = 1)

In [56]:
data

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,45,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [57]:
# the corr() method calculates the relations between each column in the dataset
# ignores non numeric column
data.corr()

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
Duration,1.0,0.2177,-0.183526,-0.391753,0.350969
Date,0.2177,1.0,-0.380088,-0.549973,-0.368101
Pulse,-0.183526,-0.380088,1.0,0.200177,0.503243
Maxpulse,-0.391753,-0.549973,0.200177,1.0,0.338515
Calories,0.350969,-0.368101,0.503243,0.338515,1.0


In [58]:
# Visualization: You specify the chart you want with the "kind" argument