# Financial Times

In [43]:
import pandas as pd

# List of CSV files to concatenate
csv_files = [
    "ft_oil_articles.csv",
    "ft_oil_articles_part_2.csv",
    "ft_oil_articles_part_3.csv",
]

# List to hold DataFrames
dataframes = []

# Read each CSV file into a DataFrame and append to the list
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df

Unnamed: 0,Headline,Date
0,Vitol plans metals move to hedge against decli...,"Thursday, 7 November, 2024"
1,Europe’s winter gas supplies at risk from mark...,"Thursday, 7 November, 2024"
2,Big Oil’s dirty legacy in Nigeria,"Wednesday, 6 November, 2024"
3,Lukoil considered sale of Bulgarian refinery t...,"Tuesday, 5 November, 2024"
4,Trudeau government under fire from all sides o...,"Tuesday, 5 November, 2024"
...,...,...
3765,UK gas prices surge on Asia LNG rally,"Sunday, 20 December, 2020"
3766,Third of low-carbon funds invest in oil and ga...,"Sunday, 20 December, 2020"
3767,Hurricane Energy warns shareholders could be l...,"Friday, 18 December, 2020"
3768,Trump’s grandstanding in the Arctic,"Thursday, 17 December, 2020"


In [44]:
combined_df = combined_df [(combined_df ['Date'] != "No date") & (combined_df ['Headline'] != "No headline")]   

# Sort the DataFrame by Date to ensure chronological order
combined_df['Date'] = pd.to_datetime(combined_df['Date'], format='%A, %d %B, %Y')  # Adjust the format as necessary


# Combine headlines for duplicate dates
df_grouped = combined_df.groupby('Date', as_index=False).agg({'Headline': '. '.join})




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Date'] = pd.to_datetime(combined_df['Date'], format='%A, %d %B, %Y')  # Adjust the format as necessary


In [45]:
df_grouped

Unnamed: 0,Date,Headline
0,2020-12-17,Trump’s grandstanding in the Arctic. Cairn to ...
1,2020-12-18,Hurricane Energy warns shareholders could be l...
2,2020-12-20,UK gas prices surge on Asia LNG rally. Third o...
3,2020-12-21,Shell to take further writedown after bruising...
4,2020-12-22,UK Oil & Gas wins dispute with climate activis...
...,...,...
1144,2024-11-03,Opec+ members delay planned rise in oil produc...
1145,2024-11-04,AI’s huge power needs give oil majors incentiv...
1146,2024-11-05,Lukoil considered sale of Bulgarian refinery t...
1147,2024-11-06,Big Oil’s dirty legacy in Nigeria


In [46]:
start_date = pd.Timestamp('2021-01-04')
end_date = pd.Timestamp('2024-10-10')

final_df = df_grouped[(df_grouped['Date'] >= start_date) & (df_grouped['Date'] <= end_date)]
final_df.head(10)

Unnamed: 0,Date,Headline
12,2021-01-04,Saudi Arabia warns oil producers over fragile ...
13,2021-01-05,Saudis pledge to cut oil output despite Russia...
14,2021-01-06,Oil auction in Arctic wildlife refuge draws sc...
15,2021-01-07,A Democratic Senate provides a surge for Biden...
16,2021-01-08,UK’s FTSE 100 kicks off 2021 with 6% rally lif...
17,2021-01-10,Gazprom to restart Nord Stream 2 construction
18,2021-01-11,Wind power overtakes coal in Texas electricity...
19,2021-01-12,Climate activists choke Appalachian pipeline e...
20,2021-01-13,Hydrogen project aims to drive UK transition t...
21,2021-01-14,How Mexico’s Pemex went from cash cow to finan...


In [47]:
full_date_range = pd.date_range(start=start_date, end=end_date)

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": full_date_range})

# Merge with the grouped DataFrame
df_full = df_full.merge(final_df, on="Date", how="left")

# Forward fill missing headlines
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")
df_full.head(10)


  df_full["Headline"] = df_full["Headline"].fillna(method="ffill")


Unnamed: 0,Date,Headline
0,2021-01-04,Saudi Arabia warns oil producers over fragile ...
1,2021-01-05,Saudis pledge to cut oil output despite Russia...
2,2021-01-06,Oil auction in Arctic wildlife refuge draws sc...
3,2021-01-07,A Democratic Senate provides a surge for Biden...
4,2021-01-08,UK’s FTSE 100 kicks off 2021 with 6% rally lif...
5,2021-01-09,UK’s FTSE 100 kicks off 2021 with 6% rally lif...
6,2021-01-10,Gazprom to restart Nord Stream 2 construction
7,2021-01-11,Wind power overtakes coal in Texas electricity...
8,2021-01-12,Climate activists choke Appalachian pipeline e...
9,2021-01-13,Hydrogen project aims to drive UK transition t...


In [48]:
df_full.to_csv('final_datasets/ft_final.csv', index=False)

# CNBC

In [49]:
df = pd.read_csv("cnbc.csv")
df

Unnamed: 0,Headline,Date
0,Oil prices firm as geopolitical tensions raise...,11/20/2024 8:59:26 PM ET
1,"Oil closes down on strong U.S. supply, losses ...",11/20/2024 7:36:40 AM ET
2,One of our lagging stocks has found its stride...,11/19/2024 12:00:39 PM ET
3,Crude oil prices steady after Ukraine hits Rus...,11/19/2024 7:51:14 AM ET
4,First-ever Capital One Landing lounge experien...,11/19/2024 7:00:01 AM ET
...,...,...
3877,OPEC+ reportedly nearing compromise on holding...,1/5/2021 12:50:27 AM ET
3878,Shares of Chinese telecom giants surge followi...,1/4/2021 6:40:03 PM ET
3879,Why the Fed’s inflation push could turn from f...,1/4/2021 3:31:16 PM ET
3880,South Korean and China’s Shenzhen stocks surge...,1/3/2021 6:37:58 PM ET


In [50]:
# Convert the Date column to datetime and only keep year, month, and day
df["Date"] = pd.to_datetime(df["Date"]).dt.date

# Define the date range
start_date = pd.to_datetime("2021-01-04").date()
end_date = pd.to_datetime("2024-10-10").date()

# Create a complete date range
all_dates = pd.date_range(start=start_date, end=end_date).date

# Filter and group data by date, concatenating headlines
df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]
df = df.groupby("Date")["Headline"].apply(". ".join).reset_index()

# Merge with the full date range, forward-filling missing headlines
df_full = pd.DataFrame({"Date": all_dates})
df_full = df_full.merge(df, on="Date", how="left")
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")
df_full


  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date
  df["Date"] = pd.to_datetime(df["Date"]).dt.date


Unnamed: 0,Date,Headline
0,2021-01-04,Shares of Chinese telecom giants surge followi...
1,2021-01-05,Oil prices will remain flat until mid-2021: JB...
2,2021-01-06,Oil rises as supply constraints retain focus a...
3,2021-01-07,"Oil hits highest level since February, posts n..."
4,2021-01-08,"Oil hits highest level since February, posts n..."
...,...,...
1371,2024-10-06,Oil pares gains after strongest weekly rise in...
1372,2024-10-07,Oil edges lower after rallying to highest in o...
1373,2024-10-08,Oil prices steady after sliding on potential I...
1374,2024-10-09,Oil prices rise more than 3% on Israel-Iran co...


In [None]:
# Save the processed dataset to a CSV file
df_full.to_csv("final_datasets/cnbc_final.csv", index=False)

# OilPrice.com

In [64]:
df = pd.read_csv("oilprice_new.csv")
df

Unnamed: 0,Headline,Date
0,Oil Prices Drop Dramatically After Israel's Li...,"Oct 28, 2024 at 04:01"
1,Saudi Oil Export Revenues Hit Three-Year Low a...,"Oct 24, 2024 at 09:46"
2,Standard Chartered: Global Oil Demand Hit An A...,"Oct 23, 2024 at 14:22"
3,Oil Prices Fluctuate Amidst China's Economic G...,"Oct 18, 2024 at 03:05"
4,Wild Oil Price Forecasts: Some Predict $350 if...,"Oct 09, 2024 at 12:37"
...,...,...
594,Oil Rally Continues Despite Slow Vaccine Rollout,"Jan 08, 2021 at 11:14"
595,The Next 5 Days Could See A Buying Spree In Oi...,"Jan 07, 2021 at 09:58"
596,OPEC+ Meeting Ends With Major Surprise Cut Fro...,"Jan 05, 2021 at 12:46"
597,Oil Rallies As Russia Agrees With OPEC+ Not To...,"Jan 05, 2021 at 09:16"


In [65]:
# Convert the Date column to datetime and format it to only include month, day, and year
df["Date"] = pd.to_datetime(df["Date"], format="%b %d, %Y at %H:%M").dt.date

# Define the date range
start_date = pd.to_datetime("2021-01-04").date()
end_date = pd.to_datetime("2024-10-10").date()

# Create a complete date range
all_dates = pd.date_range(start=start_date, end=end_date).date

# Filter data to only include rows within the specified date range
df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

# Group data by date, concatenating headlines with ". "
df_grouped = df.groupby("Date")["Headline"].apply(". ".join).reset_index()

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": all_dates})

# Merge with the grouped data, ensuring all dates are present
df_full = df_full.merge(df_grouped, on="Date", how="left")

# Forward fill missing headlines
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")

# Show the final DataFrame
df_full


  df_full["Headline"] = df_full["Headline"].fillna(method="ffill")


Unnamed: 0,Date,Headline
0,2021-01-04,Oil Rises As Market Awaits OPEC+ Production De...
1,2021-01-05,OPEC+ Meeting Ends With Major Surprise Cut Fro...
2,2021-01-06,OPEC+ Meeting Ends With Major Surprise Cut Fro...
3,2021-01-07,The Next 5 Days Could See A Buying Spree In Oi...
4,2021-01-08,Oil Rally Continues Despite Slow Vaccine Rollout
...,...,...
1371,2024-10-06,Oil Explodes 4% Amid Talk of Israel Attacking ...
1372,2024-10-07,Historic Short Squeeze Sends Oil Prices Higher
1373,2024-10-08,Historic Short Squeeze Sends Oil Prices Higher
1374,2024-10-09,Wild Oil Price Forecasts: Some Predict $350 if...


In [66]:
# Save the processed dataset to a CSV file
df_full.to_csv("final_datasets/oilprice_final.csv", index=False)

# MEES

In [70]:
df = pd.read_csv("mees_opec_headlines.csv")
df

Unnamed: 0,Headline,Date
0,Oman: New Gas Output For Q3 As Oil Constrained...,"Nov 15, 2024"
1,India Crude Imports: Russia Dominant As Saudi ...,"Nov 15, 2024"
2,Opec Extends Cuts As Trump Return Complicates ...,"Nov 8, 2024"
3,China Crude Imports: Down Again,"Nov 8, 2024"
4,Brazil Oil Output: Q3 Down But More Growth To ...,"Nov 8, 2024"
...,...,...
760,Opec Aims For More Stable 2021,"Jan 4, 2021"
761,US Mideast Crude Imports Fall To New 35-Year Lows,"Jan 4, 2021"
762,Will A Resurgent Iran Flood Oil Markets In 2021?,"Dec 18, 2020"
763,DME Oman: Assessing A Key Middle East Benchmark,"Dec 18, 2020"


In [71]:
# Convert the Date column to datetime and format it to only include month, day, and year
df["Date"] = pd.to_datetime(df["Date"], format="%b %d, %Y").dt.date

# Define the date range
start_date = pd.to_datetime("2021-01-04").date()
end_date = pd.to_datetime("2024-10-10").date()

# Create a complete date range
all_dates = pd.date_range(start=start_date, end=end_date).date

# Filter data to only include rows within the specified date range
df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

# Group data by date, concatenating headlines with ". "
df_grouped = df.groupby("Date")["Headline"].apply(". ".join).reset_index()

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": all_dates})

# Merge with the grouped data, ensuring all dates are present
df_full = df_full.merge(df_grouped, on="Date", how="left")

# Forward fill missing headlines
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")

# Show the final DataFrame
df_full


  df_full["Headline"] = df_full["Headline"].fillna(method="ffill")


Unnamed: 0,Date,Headline
0,2021-01-04,Opec Aims For More Stable 2021. US Mideast Cru...
1,2021-01-05,Opec Aims For More Stable 2021. US Mideast Cru...
2,2021-01-06,Opec Aims For More Stable 2021. US Mideast Cru...
3,2021-01-07,Opec Aims For More Stable 2021. US Mideast Cru...
4,2021-01-08,Algeria’s Gas Exports Slump To 25-Year Low For...
...,...,...
1371,2024-10-06,Compliance In Spotlight At Opec JMMC Meeting
1372,2024-10-07,Compliance In Spotlight At Opec JMMC Meeting
1373,2024-10-08,Compliance In Spotlight At Opec JMMC Meeting
1374,2024-10-09,Compliance In Spotlight At Opec JMMC Meeting


In [72]:
# Save the processed dataset to a CSV file
df_full.to_csv("final_datasets/mees_final.csv", index=False)

# Arab News

In [78]:
df = pd.read_csv("arabnews_articles.csv")
df

Unnamed: 0,Headline,Date
0,Oil Updates – market sees losses on tight supp...,"Wednesday 13, November 2024 10.15 am"
1,Saudi Aramco slashes December oil prices for A...,"Wednesday 6, November 2024 2.44 pm"
2,Pakistan party protests increase in petroleum ...,"Friday 1, November 2024 11.41 am"
3,Oil Updates – prices dip as geopolitical risks...,"Tuesday 22, October 2024 10.23 am"
4,Oil Updates – crude edges up from two-week low...,"Thursday 17, October 2024 9.09 am"
...,...,...
221,Quick action by OPEC+ stabilized oil markets d...,"Monday 1, March 2021 10.40 am"
222,Brent crude to hit $75 as demand recovery outs...,"Monday 22, February 2021 9.30 pm"
223,"Weekly energy recap: February 19, 2021","Saturday 20, February 2021 5.23 pm"
224,WEEKLY ENERGY RECAP: A mixed week for oil with...,"Saturday 16, January 2021 10.43 pm"


In [None]:
# Convert the Date column to datetime and format it to only include month, day, and year
df["Date"] = pd.to_datetime(df["Date"], format="%A %d, %B %Y %I.%M %p").dt.date

# Define the date range
start_date = pd.to_datetime("2021-01-04").date()
end_date = pd.to_datetime("2024-10-10").date()

# Create a complete date range
all_dates = pd.date_range(start=start_date, end=end_date).date

# Filter data to only include rows within the specified date range
df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

# Group data by date, concatenating headlines with ". "
df_grouped = df.groupby("Date")["Headline"].apply(". ".join).reset_index()

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": all_dates})

# Merge with the grouped data, ensuring all dates are present
df_full = df_full.merge(df_grouped, on="Date", how="left")

# Forward fill missing headlines
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")

# Show the final DataFrame
df_full.head(20)


  df_full["Headline"] = df_full["Headline"].fillna(method="ffill")


Unnamed: 0,Date,Headline
0,2021-01-04,WEEKLY ENERGY RECAP: Oil prices on an upward m...
1,2021-01-05,WEEKLY ENERGY RECAP: Oil prices on an upward m...
2,2021-01-06,WEEKLY ENERGY RECAP: Oil prices on an upward m...
3,2021-01-07,WEEKLY ENERGY RECAP: Oil prices on an upward m...
4,2021-01-08,WEEKLY ENERGY RECAP: Oil prices on an upward m...
5,2021-01-09,WEEKLY ENERGY RECAP: Oil prices on an upward m...
6,2021-01-10,WEEKLY ENERGY RECAP: Oil prices on an upward m...
7,2021-01-11,WEEKLY ENERGY RECAP: Oil prices on an upward m...
8,2021-01-12,WEEKLY ENERGY RECAP: Oil prices on an upward m...
9,2021-01-13,WEEKLY ENERGY RECAP: Oil prices on an upward m...


In [82]:
# Save the processed dataset to a CSV file
df_full.to_csv("final_datasets/arabnews_final.csv", index=False)

# Energy Intelligence

In [90]:
import pandas as pd

# List of CSV files to concatenate
csv_files = [
    "energy_intel_results_2021.csv",
    "energy_intel_results_2022.csv",
    "energy_intel_results.csv",
]

# List to hold DataFrames
dataframes = []

# Read each CSV file into a DataFrame and append to the list
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df

Unnamed: 0,Headline,Date
0,US Oil Demand Falls Well Short of Forecasts: EIA,"Thu, Dec 30, 2021"
1,Shell Lifts Forcados Force Majeure,"Thu, Dec 30, 2021"
2,Brent Takes Aim at $80 Heading Into New Year,"Thu, Dec 30, 2021"
3,Libya Strives to Keep Up Exports Despite Outages,"Thu, Dec 30, 2021"
4,Russia's Seaborne Exports Set to Rise in January,"Thu, Dec 30, 2021"
...,...,...
7518,Chevron to Export Venezuela Cargo,"Tue, Jan 3, 2023"
7519,"Oil, Diesel Futures Start New Year in Decline","Tue, Jan 3, 2023"
7520,"Russian Oil Output Flat in December, Exports Sink","Tue, Jan 3, 2023"
7521,Iraq's Oil Revenues Hit 12-Month Low,"Tue, Jan 3, 2023"


In [91]:
# Filter out rows with "No date" or "No headline"
combined_df = combined_df[(combined_df['Date'] != "No date") & (combined_df['Headline'] != "No headline")]

# Convert the 'Date' column to datetime format
combined_df['Date'] = pd.to_datetime(combined_df['Date'], format='%a, %b %d, %Y')  # Adjusting format to match the dataset

# Sort the DataFrame by Date to ensure chronological order
combined_df = combined_df.sort_values('Date')

# Combine headlines for duplicate dates
df_grouped = combined_df.groupby('Date', as_index=False).agg({'Headline': '. '.join})

# Reset the index for the grouped DataFrame
df_grouped.reset_index(drop=True, inplace=True)
df_grouped

Unnamed: 0,Date,Headline
0,2021-01-04,Schlumberger Optimistic for 2021 Recovery
1,2021-01-28,UAE Sour Gas Bids Deadline Pushed to February
2,2021-02-04,Chevron Makes $1.1B Bid for Noble Midstream
3,2021-02-16,Continental Expands Into Powder River Despite ...
4,2021-02-17,Shell Sells Canada Duvernay Shale Assets. Shel...
...,...,...
852,2024-10-28,Oil Prices Slide as Traders Reassess Risk Prem...
853,2024-10-29,Indian Refiners Eye Cuts to Saudi Term Volumes...
854,2024-10-30,Oil Prices Rebound as US Inventories Slide Low...
855,2024-10-31,US Product Exports Climb to New All-Time High:...


In [92]:
start_date = pd.Timestamp('2021-01-04')
end_date = pd.Timestamp('2024-10-10')

final_df = df_grouped[(df_grouped['Date'] >= start_date) & (df_grouped['Date'] <= end_date)]
final_df

Unnamed: 0,Date,Headline
0,2021-01-04,Schlumberger Optimistic for 2021 Recovery
1,2021-01-28,UAE Sour Gas Bids Deadline Pushed to February
2,2021-02-04,Chevron Makes $1.1B Bid for Noble Midstream
3,2021-02-16,Continental Expands Into Powder River Despite ...
4,2021-02-17,Shell Sells Canada Duvernay Shale Assets. Shel...
...,...,...
835,2024-10-04,US Rig Count Enters Fourth Quarter in Decline....
836,2024-10-07,Aramco Hikes November Prices for Asia Crude Ex...
837,2024-10-08,Demand Concerns Outweigh Mideast Turmoil in EI...
838,2024-10-09,"Oil Prices Dip Amid Demand, Risk Crosscurrents..."


In [95]:
full_date_range = pd.date_range(start=start_date, end=end_date)

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": full_date_range})

# Merge with the grouped DataFrame
df_full = df_full.merge(final_df, on="Date", how="left")

# Forward fill missing headlines
df_full["Headline"] = df_full["Headline"].fillna(method="ffill")
df_full

  df_full["Headline"] = df_full["Headline"].fillna(method="ffill")


Unnamed: 0,Date,Headline
0,2021-01-04,Schlumberger Optimistic for 2021 Recovery
1,2021-01-05,Schlumberger Optimistic for 2021 Recovery
2,2021-01-06,Schlumberger Optimistic for 2021 Recovery
3,2021-01-07,Schlumberger Optimistic for 2021 Recovery
4,2021-01-08,Schlumberger Optimistic for 2021 Recovery
...,...,...
1371,2024-10-06,US Rig Count Enters Fourth Quarter in Decline....
1372,2024-10-07,Aramco Hikes November Prices for Asia Crude Ex...
1373,2024-10-08,Demand Concerns Outweigh Mideast Turmoil in EI...
1374,2024-10-09,"Oil Prices Dip Amid Demand, Risk Crosscurrents..."


In [96]:
df_full.to_csv('final_datasets/ei_final.csv', index=False)

# OPEC

In [108]:
df = pd.read_csv("opec.csv")
df

Unnamed: 0,date,text
0,2021-12-14,Communique of the African Petroleum Producers’...
1,2021-12-10,The Declaration of Cooperation of OPEC and non...
2,2021-12-02,23rd OPEC and non-OPEC Ministerial Meeting\nNo...
3,2021-12-01,Opening address to the 182nd Meeting of the OP...
4,2021-11-30,OPEC marks the 5th anniversary of the landmark...
...,...,...
127,2024-03-19,The Seventh High-Level Meeting of the OPEC-Chi...
128,2024-03-03,Several OPEC+ countries announce extension of ...
129,2024-02-19,OPEC and Africa exchange views on array of key...
130,2024-02-01,52nd Meeting of the Joint Ministerial Monitori...


In [109]:
# Use the "date" column as the Date column
df["Date"] = pd.to_datetime(df["date"])

# Define the date range
start_date = pd.to_datetime("2021-01-04")
end_date = pd.to_datetime("2024-10-10")

# Create a complete date range
all_dates = pd.date_range(start=start_date, end=end_date)

# Filter data to only include rows within the specified date range
df = df[(df["Date"] >= start_date) & (df["Date"] <= end_date)]

# Group data by date, concatenating text with ". "
df_grouped = df.groupby("Date")["text"].apply(". ".join).reset_index()

# Create a DataFrame with the full date range
df_full = pd.DataFrame({"Date": all_dates})

# Merge with the grouped data, ensuring all dates are present
df_full = df_full.merge(df_grouped, on="Date", how="left")

# Forward fill missing text
df_full["text"] = df_full["text"].fillna(method="ffill")

# Show the final DataFrame
print(df_full.head(20))

# Save the final DataFrame
df_full.to_csv(r"final_datasets\opec_full.csv", index=False)

         Date                                               text
0  2021-01-04  The 13th OPEC and non-OPEC Ministerial Meeting...
1  2021-01-05  13th OPEC and non-OPEC Ministerial Meeting con...
2  2021-01-06  13th OPEC and non-OPEC Ministerial Meeting con...
3  2021-01-07  13th OPEC and non-OPEC Ministerial Meeting con...
4  2021-01-08  13th OPEC and non-OPEC Ministerial Meeting con...
5  2021-01-09  13th OPEC and non-OPEC Ministerial Meeting con...
6  2021-01-10  13th OPEC and non-OPEC Ministerial Meeting con...
7  2021-01-11  13th OPEC and non-OPEC Ministerial Meeting con...
8  2021-01-12  13th OPEC and non-OPEC Ministerial Meeting con...
9  2021-01-13  13th OPEC and non-OPEC Ministerial Meeting con...
10 2021-01-14  13th OPEC and non-OPEC Ministerial Meeting con...
11 2021-01-15  13th OPEC and non-OPEC Ministerial Meeting con...
12 2021-01-16  13th OPEC and non-OPEC Ministerial Meeting con...
13 2021-01-17  13th OPEC and non-OPEC Ministerial Meeting con...
14 2021-01-18  13th OPEC 

  df_full["text"] = df_full["text"].fillna(method="ffill")


In [110]:
df_full

Unnamed: 0,Date,text
0,2021-01-04,The 13th OPEC and non-OPEC Ministerial Meeting...
1,2021-01-05,13th OPEC and non-OPEC Ministerial Meeting con...
2,2021-01-06,13th OPEC and non-OPEC Ministerial Meeting con...
3,2021-01-07,13th OPEC and non-OPEC Ministerial Meeting con...
4,2021-01-08,13th OPEC and non-OPEC Ministerial Meeting con...
...,...,...
1371,2024-10-06,56th Meeting of the Joint Ministerial Monitori...
1372,2024-10-07,56th Meeting of the Joint Ministerial Monitori...
1373,2024-10-08,56th Meeting of the Joint Ministerial Monitori...
1374,2024-10-09,56th Meeting of the Joint Ministerial Monitori...


# COMBINE EVERYTHING

In [113]:
import glob

# Define the path to the directory containing the CSV files
file_path = "final_datasets/*.csv"

# Load all the CSV files
csv_files = glob.glob(file_path)

# Create an empty DataFrame to hold combined data
combined_data = pd.DataFrame()

# Iterate through each CSV file
for file in csv_files:
    df = pd.read_csv(file)
    # Ensure the 'Date' column is parsed and consistent across datasets
    df['Date'] = pd.to_datetime(df['Date'])
    source_name = file.split("\\")[-1].replace("_final.csv", "")  # Adjusted for Windows file path
    df.rename(columns={"Headline": source_name}, inplace=True)  # Rename the 'Headline' column to the source name
    if combined_data.empty:
        combined_data = df
    else:
        combined_data = pd.merge(combined_data, df, on="Date", how="outer")  # Merge on 'Date'

# Sort by date and reset the index
combined_data = combined_data.sort_values("Date").reset_index(drop=True)

# Rename the 'text' column to 'opec'
combined_data = combined_data.rename(columns={"text": "opec"})

# Save the combined DataFrame
combined_data.to_csv("combined_final_dataset.csv", index=False)
combined_data

Unnamed: 0,Date,arabnews,cnbc,ei,ft,mees,oilprice,opec
0,2021-01-04,WEEKLY ENERGY RECAP: Oil prices on an upward m...,Shares of Chinese telecom giants surge followi...,Schlumberger Optimistic for 2021 Recovery,Saudi Arabia warns oil producers over fragile ...,Opec Aims For More Stable 2021. US Mideast Cru...,Oil Rises As Market Awaits OPEC+ Production De...,The 13th OPEC and non-OPEC Ministerial Meeting...
1,2021-01-05,WEEKLY ENERGY RECAP: Oil prices on an upward m...,Oil prices will remain flat until mid-2021: JB...,Schlumberger Optimistic for 2021 Recovery,Saudis pledge to cut oil output despite Russia...,Opec Aims For More Stable 2021. US Mideast Cru...,OPEC+ Meeting Ends With Major Surprise Cut Fro...,13th OPEC and non-OPEC Ministerial Meeting con...
2,2021-01-06,WEEKLY ENERGY RECAP: Oil prices on an upward m...,Oil rises as supply constraints retain focus a...,Schlumberger Optimistic for 2021 Recovery,Oil auction in Arctic wildlife refuge draws sc...,Opec Aims For More Stable 2021. US Mideast Cru...,OPEC+ Meeting Ends With Major Surprise Cut Fro...,13th OPEC and non-OPEC Ministerial Meeting con...
3,2021-01-07,WEEKLY ENERGY RECAP: Oil prices on an upward m...,"Oil hits highest level since February, posts n...",Schlumberger Optimistic for 2021 Recovery,A Democratic Senate provides a surge for Biden...,Opec Aims For More Stable 2021. US Mideast Cru...,The Next 5 Days Could See A Buying Spree In Oi...,13th OPEC and non-OPEC Ministerial Meeting con...
4,2021-01-08,WEEKLY ENERGY RECAP: Oil prices on an upward m...,"Oil hits highest level since February, posts n...",Schlumberger Optimistic for 2021 Recovery,UK’s FTSE 100 kicks off 2021 with 6% rally lif...,Algeria’s Gas Exports Slump To 25-Year Low For...,Oil Rally Continues Despite Slow Vaccine Rollout,13th OPEC and non-OPEC Ministerial Meeting con...
...,...,...,...,...,...,...,...,...
1371,2024-10-06,Oman credit rating rises to ‘BBB-’ from ‘BB+’:...,Oil pares gains after strongest weekly rise in...,US Rig Count Enters Fourth Quarter in Decline....,TotalEnergies considers foray into copper trad...,Compliance In Spotlight At Opec JMMC Meeting,Oil Explodes 4% Amid Talk of Israel Attacking ...,56th Meeting of the Joint Ministerial Monitori...
1372,2024-10-07,Oman credit rating rises to ‘BBB-’ from ‘BB+’:...,Oil edges lower after rallying to highest in o...,Aramco Hikes November Prices for Asia Crude Ex...,Oil surges above $80 as conflict and storm spa...,Compliance In Spotlight At Opec JMMC Meeting,Historic Short Squeeze Sends Oil Prices Higher,56th Meeting of the Joint Ministerial Monitori...
1373,2024-10-08,"Pakistan’s stock market surpasses historic 85,...",Oil prices steady after sliding on potential I...,Demand Concerns Outweigh Mideast Turmoil in EI...,Chevron in talks to sell Texas gas assets to T...,Compliance In Spotlight At Opec JMMC Meeting,Historic Short Squeeze Sends Oil Prices Higher,56th Meeting of the Joint Ministerial Monitori...
1374,2024-10-09,"Pakistan’s stock market surpasses historic 85,...",Oil prices rise more than 3% on Israel-Iran co...,"Oil Prices Dip Amid Demand, Risk Crosscurrents...",Who and what is driving oil price volatility. ...,Compliance In Spotlight At Opec JMMC Meeting,Wild Oil Price Forecasts: Some Predict $350 if...,56th Meeting of the Joint Ministerial Monitori...
