In [1]:
import pandas as pd 
import warnings

## Capital IQ

CapIQ (short for Capital IQ) is a market intelligence platform designed by Standard & Poor’s (S&P).  The platform is widely used in many areas of corporate finance, including investment banking, equity research, asset management, and more.

The Capital IQ platform provides research, data, and analysis on private and public companies to help finance professionals perform an analysis.  This analysis may support transactions such as mergers and acquisitions, as well as investment recommendations, such as those made by equity research [Source](https://corporatefinanceinstitute.com/resources/valuation/capiq/)

### 1.US GDP 1947-2022 - Quarterly release

In [2]:
df_gdp_us = pd.read_excel(r".\raw data\GDP_US.xls", sheet_name = "Series Values")
df_gdp_us

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,,,,,,
2,,,,,,
3,> Series Values,,,,,
4,,,,,,
...,...,...,...,...,...,...
306,1947-09-30 00:00:00,2024834,(0.21)%,-,(0.82)%,Actual
307,1947-06-30 00:00:00,2029024,(0.27)%,-,(1.06)%,Actual
308,1947-03-31 00:00:00,2034450,-,-,-,Actual
309,,,,,,


In [3]:
# Get rid of NaN values 
df_gdp_us.dropna(inplace=True) # get rid of Nan Values 

# Rename the column with the right name
df_gdp_us.rename(columns={'Unnamed: 0': 'Period',
                            'Unnamed: 1': 'Value',
                            'Unnamed: 2' : 'Simple Growth Rate %',
                            'Unnamed: 3' : 'Y-o-Y Change %',
                            'Unnamed: 4' : 'Annual % Rate',
                            'Unnamed: 5' : 'Type'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_gdp_us = df_gdp_us.drop([df_gdp_us.index[0]])

# Reset the index 
df_gdp_us.reset_index(drop=True, inplace=True)
df_gdp_us

#replacing the "-" with "0" and the "()" with "-" 
df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace("-","0")
df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace("(","-")
df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace(")","")
df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace("%","")

#replacing the "-" with "0" and the "()" with "-" 
df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace("-","0")
df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace("(","-")
df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace(")","")
df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace("%","")


#replacing the "-" with "0" and the "()" with "-" 
df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace("-","0")
df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace("(","-")
df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace(")","")
df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace("%","")

# changing the type of data 
df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.rstrip('%').astype('float') / 100.0
df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.rstrip('%').astype('float') / 100.0
df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.rstrip('%').astype('float') / 100.0
df_gdp_us['Value'] = df_gdp_us['Value'].astype(str).astype(int)


#delete the last column  
df_gdp_us.drop(['Type'], axis = 1, inplace = True) 

# Transform the date
df_gdp_us["Period"] = pd.to_datetime(df_gdp_us["Period"])
df_gdp_us

  df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace("(","-")
  df_gdp_us['Simple Growth Rate %'] = df_gdp_us['Simple Growth Rate %'].str.replace(")","")
  df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace("(","-")
  df_gdp_us['Annual % Rate'] = df_gdp_us['Annual % Rate'].str.replace(")","")
  df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace("(","-")
  df_gdp_us['Y-o-Y Change %'] = df_gdp_us['Y-o-Y Change %'].str.replace(")","")


Unnamed: 0,Period,Value,Simple Growth Rate %,Y-o-Y Change %,Annual % Rate
0,2022-09-30,20021700,0.0064,0.0177,0.0257
1,2022-06-30,19895271,-0.0014,0.0180,-0.0058
2,2022-03-31,19924100,-0.0041,0.0368,-0.0163
3,2021-12-31,20006181,0.0170,0.0572,0.0696
4,2021-09-30,19672600,0.0066,0.0496,0.0265
...,...,...,...,...,...
298,1948-03-31,2087442,0.0150,0.0260,0.0615
299,1947-12-31,2056508,0.0156,0.0000,0.0641
300,1947-09-30,2024834,-0.0021,0.0000,-0.0082
301,1947-06-30,2029024,-0.0027,0.0000,-0.0106


In [4]:
df_gdp_us.dtypes

Period                  datetime64[ns]
Value                            int32
Simple Growth Rate %           float64
Y-o-Y Change %                 float64
Annual % Rate                  float64
dtype: object

In [5]:
df_gdp_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_gdp_us.csv', index = None, header=True) 

### 2. US CPI 1914-2022 - Monthly basis 

In [6]:
df_cpi_us = pd.read_excel(r".\raw data\CPI Y o Y United States of America Monthly Economic Time Series Profile.xls", sheet_name = "Series Values")
df_cpi_us

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,,,,,,
2,,,,,,
3,> Series Values,,,,,
4,,,,,,
...,...,...,...,...,...,...
1309,1914-03-31 00:00:00,1.02,NM,NM,NM,Actual
1310,1914-02-28 00:00:00,1.02,NM,NM,NM,Actual
1311,1914-01-31 00:00:00,2.04,NM,NM,NM,Actual
1312,,,,,,


In [7]:
# Get rid of NaN values 
df_cpi_us.dropna(inplace=True) # get rid of Nan Values 



# Rename the column with the right name
df_cpi_us.rename(columns={'Unnamed: 0': 'Period',
                            'Unnamed: 1': 'Value',
                            'Unnamed: 2' : 'Simple Growth Rate %',
                            'Unnamed: 3' : 'Y-o-Y Change %',
                            'Unnamed: 4' : 'Annual % Rate',
                            'Unnamed: 5' : 'Type'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_cpi_us = df_cpi_us.drop([df_cpi_us.index[0]])

# Reset the index 
df_cpi_us.reset_index(drop=True, inplace=True)
df_cpi_us

#delete columns  
df_cpi_us.drop(['Type', 'Simple Growth Rate %', "Y-o-Y Change %", "Annual % Rate"], axis = 1, inplace = True) 

# Transform the date
df_cpi_us["Period"] = pd.to_datetime(df_cpi_us["Period"])
df_cpi_us

Unnamed: 0,Period,Value
0,2022-10-31,7.75
1,2022-09-30,8.2
2,2022-08-31,8.26
3,2022-07-31,8.52
4,2022-06-30,9.06
...,...,...
1301,1914-05-31,2.06
1302,1914-04-30,0
1303,1914-03-31,1.02
1304,1914-02-28,1.02


In [8]:
df_cpi_us.dtypes

Period    datetime64[ns]
Value             object
dtype: object

In [9]:
df_cpi_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_cpi_us.csv', index = None, header=True) 

### 3. US Unemployment Rate 1948-2022 - Monthly basis  

In [10]:
df_unr_us = pd.read_excel(r".\raw data\Economic Time Series Profile.xls", sheet_name = "Series Values")
df_unr_us

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,,,,,,
2,,,,,,
3,> Series Values,,,,,
4,,,,,,
...,...,...,...,...,...,...
901,1948-03-31 00:00:00,4,NM,NM,NM,Actual
902,1948-02-29 00:00:00,3.8,NM,NM,NM,Actual
903,1948-01-31 00:00:00,3.4,NM,NM,NM,Actual
904,,,,,,


In [11]:
# Get rid of NaN values 
df_unr_us.dropna(inplace=True) # get rid of Nan Values 



# Rename the column with the right name
df_unr_us.rename(columns={'Unnamed: 0': 'Period',
                            'Unnamed: 1': 'Value',
                            'Unnamed: 2' : 'Simple Growth Rate %',
                            'Unnamed: 3' : 'Y-o-Y Change %',
                            'Unnamed: 4' : 'Annual % Rate',
                            'Unnamed: 5' : 'Type'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_unr_us = df_unr_us.drop([df_unr_us.index[0]])


# Reset the index 
df_unr_us.reset_index(drop=True, inplace=True)
df_unr_us

#delete columns  
df_unr_us.drop(['Type', 'Simple Growth Rate %', "Y-o-Y Change %", "Annual % Rate"], axis = 1, inplace = True)

# Transform the date
df_unr_us["Period"] = pd.to_datetime(df_unr_us["Period"])
df_unr_us

Unnamed: 0,Period,Value
0,2022-10-31,3.7
1,2022-09-30,3.5
2,2022-08-31,3.7
3,2022-07-31,3.5
4,2022-06-30,3.6
...,...,...
893,1948-05-31,3.5
894,1948-04-30,3.9
895,1948-03-31,4
896,1948-02-29,3.8


In [12]:
df_unr_us.dtypes

Period    datetime64[ns]
Value             object
dtype: object

In [13]:
df_unr_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_unr_us.csv', index = None, header=True) 

### 4. S&P 500 1928-2022 - Daily Basis 

In [14]:
df_SP500 = pd.read_excel(r".\raw data\Charting Excel Export - Nov 23rd 2022 1_04_45 pm.xls", sheet_name = "Pane 1")
df_SP500

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,,
1,,
2,,
3,,
4,,
...,...,...
24652,2022-11-16 00:00:00,3958.793388
24653,2022-11-17 00:00:00,3946.557985
24654,2022-11-18 00:00:00,3965.339301
24655,2022-11-21 00:00:00,3949.937176


In [15]:
# Get rid of NaN values 
df_SP500.dropna(inplace=True) # get rid of Nan Values 

# Rename the column with the right name
df_SP500.rename(columns={'Unnamed: 0': 'Dates',
                            'Unnamed: 1': 'S&P 500 (^SPX) - Index Value'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_SP500 = df_SP500.drop([df_SP500.index[0]])

# Reset the index 
df_SP500.reset_index(drop=True, inplace=True)
df_SP500

# Transform the date
#df_SP500["Dates"] = pd.to_datetime(df_SP500["Dates"])

# Getting rid of the day and keep only year and month
df_SP500['Dates'] = df_SP500['Dates'].apply(lambda x: x.strftime('%Y-%m'))

# changing the data type of the date and index value to the correct ones 
df_SP500["S&P 500 (^SPX) - Index Value"] =df_SP500["S&P 500 (^SPX) - Index Value"].astype(float)
df_SP500['Dates'] = pd.to_datetime(df_SP500['Dates'])
df_SP500.dtypes

# taking the mean of the different 
df_SP500 = df_SP500.resample('M', on='Dates').mean()

# inserating again the index
df_SP500 = df_SP500.reset_index()

# to keep only the month in the final version of the df
df_SP500['Dates'] = df_SP500['Dates'].apply(lambda x: x.strftime('%Y-%m'))
df_SP500['Dates'] = pd.to_datetime(df_SP500['Dates'])

df_SP500

Unnamed: 0,Dates,S&P 500 (^SPX) - Index Value
0,1928-01-01,17.526000
1,1928-02-01,17.317391
2,1928-03-01,18.246667
3,1928-04-01,19.398182
4,1928-05-01,20.004091
...,...,...
1134,2022-07-01,3911.730188
1135,2022-08-01,4158.563134
1136,2022-09-01,3850.521802
1137,2022-10-01,3726.050515


In [16]:
df_SP500.dtypes

Dates                           datetime64[ns]
S&P 500 (^SPX) - Index Value           float64
dtype: object

In [17]:
df_SP500.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_SP500.csv', index = None, header=True) 

## Refinitiv

Refinitiv Workspace provides access to company financial data and economic indicators as well as news, analytics and productivity tools. The database incorporates both Datastream/Eikon and ThomsonOne.com which were previously on separate platforms [Source](https://library.bath.ac.uk/refinitvworkspace#:~:text=What%20is%20Refinitv%20Workspace%3F,were%20previously%20on%20separate%20platforms)

### 5. Monthly Home Price Index  2008-2022 - Monthly basis

In [18]:
df_mhpi_us = pd.read_excel(r".\raw data\Economic Indicator_United States Monthly Home Price Index__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_mhpi_us

Unnamed: 0,First Release Data,Unnamed: 1,Unnamed: 2
0,,,
1,Period,Actual,Actual
2,Period,Original Release Date,First Release
3,Aug 2022,25 Oct 2022 15:00,392.0
4,Jul 2022,27 Sep 2022 15:00,395.2
...,...,...,...
378,May 1991,,
379,Apr 1991,,
380,Mar 1991,,
381,Feb 1991,,


In [19]:
# Rename the column with the right name
df_mhpi_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_mhpi_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_mhpi_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_mhpi_us.reset_index(drop=True, inplace=True)
df_mhpi_us

# transform the object number to a normal float
df_mhpi_us['First Release'] = df_mhpi_us['First Release'].astype(float)

# Transform the date
df_mhpi_us["Period"] = pd.to_datetime(df_mhpi_us["Period"])

# Drop the release date column that is unecessary
df_mhpi_us.drop(["Original Release Date"], axis=1, inplace=True)
df_mhpi_us

Unnamed: 0,Period,First Release
0,2022-08-01,392.0
1,2022-07-01,395.2
2,2022-06-01,398.0
3,2022-05-01,398.1
4,2022-04-01,392.9
...,...,...
165,2008-11-01,200.7
166,2008-10-01,204.5
167,2008-09-01,206.8
168,2008-08-01,210.1


In [20]:
df_mhpi_us.dtypes

Period           datetime64[ns]
First Release           float64
dtype: object

In [21]:
df_mhpi_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_mhpi_us.csv', index = None, header=True) 

### 6. Initial Jobless Claim 2004-2022 - Weekly basis 

In [22]:
df_ijc_us = pd.read_excel(r".\raw data\Economic Indicator_United States Initial Jobless Clm__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_ijc_us

Unnamed: 0,First Release Data,Unnamed: 1,Unnamed: 2
0,,,
1,Period,Actual,Actual
2,Period,Original Release Date,First Release
3,19 Nov 2022,23 Nov 2022 14:30,240k
4,12 Nov 2022,17 Nov 2022 14:30,222k
...,...,...,...
2914,4 Feb 1967,,
2915,28 Jan 1967,,
2916,21 Jan 1967,,
2917,14 Jan 1967,,


In [23]:
# Rename the column with the right name
df_ijc_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_ijc_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_ijc_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_ijc_us.reset_index(drop=True, inplace=True)
df_ijc_us

# Cleaning the period column by removing uncessary "k" ","
df_ijc_us['First Release'] = df_ijc_us['First Release'].str.replace('k','000')
df_ijc_us['First Release'] = df_ijc_us['First Release'].str.replace(',','')

# transform the object number to a normal float
df_ijc_us['First Release'] = df_ijc_us['First Release'].astype(float)

# Transform the date
df_ijc_us["Period"] = pd.to_datetime(df_ijc_us["Period"])

# Drop the release date column that is unecessary
df_ijc_us.drop(["Original Release Date"], axis=1, inplace=True)
df_ijc_us

Unnamed: 0,Period,First Release
0,2022-11-19,240000.0
1,2022-11-12,222000.0
2,2022-11-05,225000.0
3,2022-10-29,217000.0
4,2022-10-22,217000.0
...,...,...
962,2004-06-12,336000.0
963,2004-06-05,352000.0
964,2004-05-29,339000.0
965,2004-05-22,344000.0


In [24]:
df_ijc_us.dtypes

Period           datetime64[ns]
First Release           float64
dtype: object

In [25]:
df_ijc_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_ijc_us.csv', index = None, header=True) 

### 7. Core CPI 1996-2022 - Monthly Basis  

In [26]:
df_ccpi_us = pd.read_excel(r".\raw data\Economic Indicator_United States Core CPI Index, SA__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_ccpi_us

Unnamed: 0,First Release Data,Unnamed: 1,Unnamed: 2
0,,,
1,Period,Actual,Actual
2,Period,Original Release Date,First Release
3,Oct 2022,10 Nov 2022 14:37,299.47
4,Sep 2022,13 Oct 2022 14:31,298.66
...,...,...,...
788,May 1957,,
789,Apr 1957,,
790,Mar 1957,,
791,Feb 1957,,


In [27]:
# Rename the column with the right name
df_ccpi_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_ccpi_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_ccpi_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_ccpi_us.reset_index(drop=True, inplace=True)
df_ccpi_us

# Cleaning the period column by removing uncessary "k" ","
df_ccpi_us['First Release'] = df_ccpi_us['First Release'].str.replace('k','000')
df_ccpi_us['First Release'] = df_ccpi_us['First Release'].str.replace(',','')

# transform the object number to a normal float
df_ccpi_us['First Release'] = df_ccpi_us['First Release'].astype(float)

# Transform the date
df_ccpi_us["Period"] = pd.to_datetime(df_ccpi_us["Period"])

# Drop the release date column that is unecessary
df_ccpi_us.drop(["Original Release Date"], axis=1, inplace=True)
df_ccpi_us

Unnamed: 0,Period,First Release
0,2022-10-01,299.47
1,2022-09-01,298.66
2,2022-08-01,296.95
3,2022-07-01,295.28
4,2022-06-01,294.35
...,...,...
307,1997-03-01,168.70
308,1997-02-01,168.30
309,1997-01-01,167.90
310,1996-12-01,167.60


In [28]:
df_ccpi_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_ccpi_us.csv', index = None, header=True) 

### 8. Consumer Credit - 2004-2022 - Monthly basis 

In [29]:
df_concred_us = pd.read_excel(r".\raw data\Economic Indicator_United States Consumer Credit__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_concred_us

Unnamed: 0,First Release Data,Unnamed: 1,Unnamed: 2
0,,,
1,Period,Actual,Actual
2,Period,Original Release Date,First Release
3,Sep 2022,7 Nov 2022 21:00,24.98b
4,Aug 2022,7 Oct 2022 21:00,32.24b
...,...,...,...
954,Jun 1943,,
955,May 1943,,
956,Apr 1943,,
957,Mar 1943,,


In [30]:
# Rename the column with the right name
df_concred_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_concred_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_concred_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_concred_us.reset_index(drop=True, inplace=True)
df_concred_us

# Cleaning the period column by removing uncessary "k" ","
df_concred_us['First Release'] = df_concred_us['First Release'].str.replace('b','0000000')
df_concred_us['First Release'] = df_concred_us['First Release'].str.replace('.','')

# transform the object number to a normal float
df_concred_us['First Release'] = df_concred_us['First Release'].astype(float)

# Transform the date
df_concred_us["Period"] = pd.to_datetime(df_concred_us["Period"])

# Drop the release date column that is unecessary
df_concred_us.drop(["Original Release Date"], axis=1, inplace=True)
df_concred_us

  df_concred_us['First Release'] = df_concred_us['First Release'].str.replace('.','')


Unnamed: 0,Period,First Release
0,2022-09-01,2.498000e+10
1,2022-08-01,3.224000e+10
2,2022-07-01,2.381000e+10
3,2022-06-01,4.015000e+10
4,2022-05-01,2.235000e+10
...,...,...
217,2004-08-01,-2.400000e+09
218,2004-07-01,1.090000e+10
219,2004-06-01,6.600000e+09
220,2004-05-01,8.200000e+09


In [31]:
df_ccpi_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_ccpi_us.csv', index = None, header=True) 

### 9. Non-Farm Payrolls - 2004-2022 - Monthly basis 

In [32]:
df_nfp_us = pd.read_excel(r".\raw data\Economic Indicator_United States Consumer Credit__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_nfp_us

Unnamed: 0,First Release Data,Unnamed: 1,Unnamed: 2
0,,,
1,Period,Actual,Actual
2,Period,Original Release Date,First Release
3,Sep 2022,7 Nov 2022 21:00,24.98b
4,Aug 2022,7 Oct 2022 21:00,32.24b
...,...,...,...
954,Jun 1943,,
955,May 1943,,
956,Apr 1943,,
957,Mar 1943,,


In [33]:
# Rename the column with the right name
df_nfp_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_nfp_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_nfp_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_nfp_us.reset_index(drop=True, inplace=True)
df_nfp_us

# Cleaning the period column by removing uncessary "k" ","
df_nfp_us['First Release'] = df_nfp_us['First Release'].str.replace('k','000')
df_nfp_us['First Release'] = df_nfp_us['First Release'].str.replace(',','')

# transform the object number to a normal float
df_nfp_us['First Release'] = df_nfp_us['First Release'].astype(float)

# Transform the date
df_nfp_us["Period"] = pd.to_datetime(df_nfp_us["Period"])

# Drop the release date column that is unecessary
df_nfp_us.drop(["Original Release Date"], axis=1, inplace=True)
df_nfp_us

ValueError: could not convert string to float: '24.98b'

In [None]:
df_nfp_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_nfp_us.csv', index = None, header=True) 

### 10. Chicago PMI 2004-2022 - Monthly basis  

In [None]:
df_cpmi_us = pd.read_excel(r".\raw data\Economic Indicator_United States Chicago PMI__24 Nov 2022.xlsx", sheet_name = "First Release Data")
df_cpmi_us

In [None]:
# Rename the column with the right name
df_cpmi_us.rename(columns={'First Release Data': 'Period',
                         'Unnamed: 1' : 'Original Release Date',
                        'Unnamed: 2': 'First Release'},
                              inplace=True, errors='raise')

# drop the row with the column name
df_cpmi_us.drop(index=[0,1,2], inplace=True)

# Get rid of NaN values 
df_cpmi_us.dropna(inplace=True) # get rid of Nan Values 

# Reset the index 
df_cpmi_us.reset_index(drop=True, inplace=True)
df_cpmi_us

# transform the object number to a normal float
df_cpmi_us['First Release'] = df_cpmi_us['First Release'].astype(float)

# Transform the date
df_cpmi_us["Period"] = pd.to_datetime(df_cpmi_us["Period"])

# Drop the release date column that is unecessary
df_cpmi_us.drop(["Original Release Date"], axis=1, inplace=True)
df_cpmi_us

In [None]:
df_cpmi_us.to_csv (r'C:\Users\rapha\OneDrive\ironhack\projects\Project_Final_Ironhack\cleaned data\df_cpmi_us.csv', index = None, header=True) 