# Data Cleaning

In [11]:
import pandas as pd

In [12]:
# Read the raw data from AAPL.csv (columns: Date, Open, High, Low, Close,
# Adj Close, Volume) and preview the first five rows.
df = pd.read_csv('AAPL.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100323,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.095089,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.08811,105728000
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.090291,86441600
4,1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092908,73449600


In [13]:
# (number of rows, number of columns)
df.shape

(10409, 7)

In [14]:
# Print a concise summary of the DataFrame: index range, each column's
# non-null count and dtype, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10409 entries, 0 to 10408
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       10409 non-null  object 
 1   Open       10409 non-null  float64
 2   High       10409 non-null  float64
 3   Low        10409 non-null  float64
 4   Close      10409 non-null  float64
 5   Adj Close  10409 non-null  float64
 6   Volume     10409 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 569.4+ KB


In [15]:
# Summary statistics, one row per described column, with every value
# rendered as a string with three decimal places.
df.describe().transpose().apply(lambda col: col.map("{0:.3f}".format))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Open,10409.0,13.96,30.169,0.05,0.282,0.469,14.218,182.63
High,10409.0,14.112,30.515,0.05,0.288,0.478,14.364,182.94
Low,10409.0,13.809,29.835,0.049,0.275,0.46,14.044,179.12
Close,10409.0,13.967,30.192,0.049,0.281,0.469,14.206,182.01
Adj Close,10409.0,13.35,29.911,0.038,0.235,0.387,12.188,181.778
Volume,10409.0,332177847.171,339334418.573,0.0,124760400.0,219968000.0,412610800.0,7421640800.0


In [16]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence is not counted); 0 means there are no duplicates.
print(df.duplicated(keep="first").sum())

0


In [17]:
# Count missing values per column; all zeros means nothing needs imputing.
print(df.isna().sum())

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


In [18]:
# Convert the "Date" column from strings to datetime values, keeping the
# column in its original position.
df = df.assign(Date=pd.to_datetime(df["Date"]))

In [19]:
# Use "Date" as the index so rows can be selected and sliced by date.
# Guarded so that re-running this cell (after "Date" has already been moved
# into the index) is a no-op instead of raising KeyError.
if "Date" in df.columns:
    df = df.set_index("Date")
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100323,469033600
1980-12-15,0.12221,0.12221,0.121652,0.121652,0.095089,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.08811,105728000
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.090291,86441600
1980-12-18,0.118862,0.11942,0.118862,0.118862,0.092908,73449600


In [22]:
# Write the cleaned frame to "AAPL_clean.csv" so other files can reuse it.
# index=True (the default) keeps the Date index as the first CSV column.
df.to_csv('AAPL_clean.csv', index=True)
# Preview the last five rows.
df.tail(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-03-18,160.509995,164.479996,159.759995,163.979996,163.979996,123351200
2022-03-21,163.509995,166.350006,163.009995,165.380005,165.380005,95811400
2022-03-22,165.509995,169.419998,164.910004,168.820007,168.820007,81532000
2022-03-23,167.990005,172.639999,167.649994,170.210007,170.210007,98062700
2022-03-24,171.059998,174.139999,170.210007,174.070007,174.070007,90018700
