In [2]:
import pandas as pd
import plotly.express as px

In [3]:
import os

# The dataset is expected at <parent of the working directory>/data/database.csv.
current_dir = os.getcwd()
root_dir = os.path.split(current_dir)[0]
file_path = os.path.join(root_dir, "data", "database.csv")

# Read the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(file_path)

In [3]:
# Allow up to 1000 characters per cell so long text values are not truncated.
pd.options.display.max_colwidth = 1000

# Preview five random rows. The fixed seed keeps the preview identical across
# kernel restarts, so the saved output always matches a fresh run.
df.sample(n=5, random_state=42)

Unnamed: 0,Company Name,Industry,Market Capitalization,Revenue (in millions),EBITDA (in millions),Net Income (in millions),Debt (in millions),Equity (in millions),Enterprise Value (in millions),P/E Ratio,Revenue Growth Rate (%),EBITDA Margin (%),Net Income Margin (%),ROE (Return on Equity) (%),ROA (Return on Assets) (%),Current Ratio,Debt to Equity Ratio,Location
1,HealthInc,Healthcare,3000,1000,250,80,150,600,3150,15,12,25.0,8.0,13.33,10.0,2.0,0.25,New York
2,RetailCo,Retail,2000,800,150,40,100,400,2100,20,8,18.75,5.0,10.0,6.5,1.8,0.25,Chicago
3,FinanceLLC,Financial Services,4000,1200,400,150,300,1000,4300,18,15,33.33,12.5,20.0,12.0,3.0,0.3,Boston
0,TechCorp,Technology,5000,1500,300,100,200,800,5400,25,10,20.0,6.67,12.5,7.5,2.5,0.25,San Francisco
4,ManuCorp,Manufacturing,2500,900,200,60,200,500,2700,22,9,22.22,6.67,12.0,8.0,2.2,0.4,Dalla


In [4]:
# Column-by-column dtype listing
column_types = df.dtypes
print("Data Types:\n", column_types)

# Descriptive statistics over every column (numeric and non-numeric alike)
overall_stats = df.describe(include='all')
print("\nSummary Statistics:\n", overall_stats)

Data Types:
 Company Name                       object
Industry                           object
Market Capitalization               int64
Revenue (in millions)               int64
EBITDA (in millions)                int64
Net Income (in millions)            int64
Debt (in millions)                  int64
Equity (in millions)                int64
Enterprise Value (in millions)      int64
P/E Ratio                           int64
Revenue Growth Rate (%)             int64
EBITDA Margin (%)                 float64
Net Income Margin (%)             float64
ROE (Return on Equity) (%)        float64
ROA (Return on Assets) (%)        float64
Current Ratio                     float64
Debt to Equity Ratio              float64
Location                           object
dtype: object

Summary Statistics:
        Company Name    Industry  Market Capitalization  Revenue (in millions)  \
count             5           5               5.000000               5.000000   
unique            5           5  

In [4]:
# Count the null entries in each column
null_mask = df.isna()
missing_values = null_mask.sum()
print("\nMissing Values:\n", missing_values)


Missing Values:
 Company Name                      0
Industry                          0
Market Capitalization             0
Revenue (in millions)             0
EBITDA (in millions)              0
Net Income (in millions)          0
Debt (in millions)                0
Equity (in millions)              0
Enterprise Value (in millions)    0
P/E Ratio                         0
Revenue Growth Rate (%)           0
EBITDA Margin (%)                 0
Net Income Margin (%)             0
ROE (Return on Equity) (%)        0
ROA (Return on Assets) (%)        0
Current Ratio                     0
Debt to Equity Ratio              0
Location                          0
dtype: int64


In [5]:
# Frequency table for each categorical column
categorical_columns = ['Company Name', 'Industry', 'Location']
for col in categorical_columns:
    frequency = df[col].value_counts()
    print(f"\nDistribution of {col}:\n", frequency)


Distribution of Company Name:
 Company Name
TechCorp      1
HealthInc     1
RetailCo      1
FinanceLLC    1
ManuCorp      1
Name: count, dtype: int64

Distribution of Industry:
 Industry
Technology            1
Healthcare            1
Retail                1
Financial Services    1
Manufacturing         1
Name: count, dtype: int64

Distribution of Location:
 Location
San Francisco    1
New York         1
Chicago          1
Boston           1
Dalla            1
Name: count, dtype: int64


In [6]:
# Descriptive statistics for each numeric column of interest
numerical_columns = ['Revenue (in millions)']
for col in numerical_columns:
    column_stats = df[col].describe()
    print(f"\nSummary of {col}:\n", column_stats)


Summary of Revenue (in millions):
 count       5.000000
mean     1080.000000
std       277.488739
min       800.000000
25%       900.000000
50%      1000.000000
75%      1200.000000
max      1500.000000
Name: Revenue (in millions), dtype: float64


In [7]:
# Build one interactive bar chart per categorical field. The figures are only
# constructed here; call .show() on a figure to render it.

# Company names along the x-axis
fig_Company_Name = px.bar(
    data_frame=df,
    x='Company Name',
    title='Distribution of Company Name',
)

# Industries along the x-axis
fig_Industry = px.bar(
    data_frame=df,
    x='Industry',
    title='Distribution of Industry',
)

# Locations along the x-axis
fig_Location = px.bar(
    data_frame=df,
    x='Location',
    title='Locations',
)


In [8]:
# Render the company-name bar chart.
fig_Company_Name.show()

In [9]:
# Render the industry bar chart.
fig_Industry.show()

In [10]:

# Interactive histogram of revenue (in millions).
# A histogram bins and counts the values, which is what a "distribution" needs;
# px.bar given only a numeric x does no binning or counting.
fig_Revenue_in_millions = px.histogram(
    df,
    x='Revenue (in millions)',
    title='Distribution of Revenue',
)


In [12]:
# Render the revenue figure.
fig_Revenue_in_millions.show()

In [13]:
# Replace missing values with the 'N/A' placeholder in non-numeric columns only.
# Writing a string into a numeric column would turn it into object dtype and
# break later numeric work, so gaps in numeric columns are left as NaN.
non_numeric_columns = df.select_dtypes(exclude='number').columns
df[non_numeric_columns] = df[non_numeric_columns].fillna('N/A')

In [14]:
# Show the first rows of the frame after the fill step.
df.head()

Unnamed: 0,Company Name,Industry,Market Capitalization,Revenue (in millions),EBITDA (in millions),Net Income (in millions),Debt (in millions),Equity (in millions),Enterprise Value (in millions),P/E Ratio,Revenue Growth Rate (%),EBITDA Margin (%),Net Income Margin (%),ROE (Return on Equity) (%),ROA (Return on Assets) (%),Current Ratio,Debt to Equity Ratio,Location
0,TechCorp,Technology,5000,1500,300,100,200,800,5400,25,10,20.0,6.67,12.5,7.5,2.5,0.25,San Francisco
1,HealthInc,Healthcare,3000,1000,250,80,150,600,3150,15,12,25.0,8.0,13.33,10.0,2.0,0.25,New York
2,RetailCo,Retail,2000,800,150,40,100,400,2100,20,8,18.75,5.0,10.0,6.5,1.8,0.25,Chicago
3,FinanceLLC,Financial Services,4000,1200,400,150,300,1000,4300,18,15,33.33,12.5,20.0,12.0,3.0,0.3,Boston
4,ManuCorp,Manufacturing,2500,900,200,60,200,500,2700,22,9,22.22,6.67,12.0,8.0,2.2,0.4,Dalla


In [22]:
import re
def clean_text(text):
    """Normalise a text value for comparison and grouping.

    Strings are upper-cased, stripped of every character that is neither a
    word character nor whitespace, and trimmed of leading/trailing whitespace.
    Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    without_punctuation = re.sub(r'[^\w\s]', '', text.upper())
    return without_punctuation.strip()

# Normalise every text column element by element with clean_text.
text_columns = ['Company Name', 'Industry', 'Location']
for col in text_columns:
    cleaned_series = df[col].apply(clean_text)
    df[col] = cleaned_series

In [23]:
# Display the whole frame to inspect the cleaned text columns.
df

Unnamed: 0,Company Name,Industry,Market Capitalization,Revenue (in millions),EBITDA (in millions),Net Income (in millions),Debt (in millions),Equity (in millions),Enterprise Value (in millions),P/E Ratio,Revenue Growth Rate (%),EBITDA Margin (%),Net Income Margin (%),ROE (Return on Equity) (%),ROA (Return on Assets) (%),Current Ratio,Debt to Equity Ratio,Location
0,TECHCORP,TECHNOLOGY,5000,1500,300,100,200,800,5400,25,10,20.0,6.67,12.5,7.5,2.5,0.25,SAN FRANCISCO
1,HEALTHINC,HEALTHCARE,3000,1000,250,80,150,600,3150,15,12,25.0,8.0,13.33,10.0,2.0,0.25,NEW YORK
2,RETAILCO,RETAIL,2000,800,150,40,100,400,2100,20,8,18.75,5.0,10.0,6.5,1.8,0.25,CHICAGO
3,FINANCELLC,FINANCIAL SERVICES,4000,1200,400,150,300,1000,4300,18,15,33.33,12.5,20.0,12.0,3.0,0.3,BOSTON
4,MANUCORP,MANUFACTURING,2500,900,200,60,200,500,2700,22,9,22.22,6.67,12.0,8.0,2.2,0.4,DALLA


In [24]:
# Scatter-matrix (pair plot) over 'Company Name' and 'Industry', with points
# coloured by 'Location'.
fig_relationships = px.scatter_matrix(
    data_frame=df,
    dimensions=['Company Name', 'Industry'],
    color='Location',
    title='Relationships between Company Name, Industry, and Location',
)


In [25]:
# Render the scatter-matrix figure.
fig_relationships.show()