In [32]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

In [33]:
# Read the CSV files into DataFrames
company_df = pd.read_csv('../Data/company_info.csv')
stock_df = pd.read_csv('../Data/company_stock_details.csv')

print(f"Company data shape: {company_df.shape}")
print(f"Stock data shape: {stock_df.shape}")

Company data shape: (524, 4)
Stock data shape: (217811, 18)


In [34]:
missing_company = company_df.isnull().any(axis=1).sum()
print(f"Number of rows in company with missing data: {missing_company}")

missing_stock = stock_df.isnull().any(axis=1).sum()
print(f"Number of rows in stock with missing data: {missing_stock}")

# Since the number of rows with missing data is small, we can drop them
stock_df = stock_df.dropna()

# confirming the number of rows after dropping missing data
print(f"Stock data shape after dropping missing rows: {stock_df.shape}")

Number of rows in company with missing data: 0
Number of rows in stock with missing data: 493
Stock data shape after dropping missing rows: (217318, 18)


In [35]:
# Count the number of unique companies with the frequencies of their stock data
print(stock_df['Symbol'].value_counts())
# Count the number of unique companies in the company DataFrame
print(f"Number of unique companies in company DataFrame: {company_df['Symbol'].nunique()}")

Symbol
MMM     440
NRG     440
NLOK    440
NOC     440
NTRS    440
       ... 
EMN     440
DXC     440
DD      440
OGN     285
CEG     113
Name: count, Length: 495, dtype: int64
Number of unique companies in company DataFrame: 497


In [36]:
# Let's merge the two DataFrames on the 'Symbol' column
merged_df = pd.merge(company_df, stock_df, on='Symbol', how='inner')

# check the shape of the merged DataFrame
print(f"Merged DataFrame shape: {merged_df.shape}")

# check for missing values in the merged DataFrame
missing_merged = merged_df.isnull().any(axis=1).sum()
print(f"Number of rows in merged DataFrame with missing data: {missing_merged}")

# Remove duplicate rows
merged_df = merged_df.drop_duplicates()

Merged DataFrame shape: (229198, 21)
Number of rows in merged DataFrame with missing data: 0


In [37]:
def detect_format(date_str):
    # Try a list of common formats
    formats = [
        "%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y",
        "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d",
        "%Y.%m.%d", "%d.%m.%Y", "%m.%d.%Y"
    ]
    for fmt in formats:
        try:
            datetime.strptime(date_str, fmt)
            return fmt
        except ValueError:
            continue
    return "Unknown format"

# Ensure all entries are strings
date_strings = merged_df['Date'].dropna()

# Apply format detection and count
format_counts = date_strings.apply(detect_format).value_counts()

# Display the result
print(format_counts)

Date
%d/%m/%Y    217318
Name: count, dtype: int64


In [38]:
# Save the merged DataFrame to a new CSV file
merged_df.to_csv('../Data/merged.csv', index=False)