# 🧠 Top SaaS Companies Case Study
Data Analysis using Pandas and Matplotlib

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the case-study workbook into a DataFrame.
workbook_path = 'p1 stu2.xlsx'
df = pd.read_excel(workbook_path)
# Preview the first rows; kept as the final expression of the cell.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'p1 stu2.xlsx'

## 1. Distribution of Companies by Industry

In [None]:
# Tally companies per industry and draw the totals as a vertical bar chart.
industry_counts = df['Industry'].value_counts()
industry_ax = industry_counts.plot.bar(title='Company Distribution by Industry')
industry_ax.set_xlabel('Industry')
industry_ax.set_ylabel('Number of Companies')
# Tilt the category labels on the x axis.
plt.xticks(rotation=45)
industry_ax.grid(True)
plt.tight_layout()
plt.show()

## 2. Founded Year Distribution

In [None]:
# Histogram of the 'Founded Year' column, grouped into 20 bins.
year_ax = df['Founded Year'].hist(bins=20)
year_ax.set_title('Founded Year Distribution')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Companies')
year_ax.grid(True)
plt.show()

## 3. Funding, Valuation, and ARR Summary (Converted to Numeric)

In [None]:

# Magnitude suffixes understood by convert_money, mapped to their multipliers.
_MONEY_SUFFIXES = {'T': 1e12, 'B': 1e9, 'M': 1e6, 'K': 1e3}


def convert_money(val):
    """Convert a money string such as "$1.5B" into a plain float.

    Dollar signs, thousands separators and surrounding whitespace are
    ignored. A single trailing magnitude letter (T, B, M or K, in either
    case) scales the number by 1e12, 1e9, 1e6 or 1e3 respectively.

    Args:
        val: The raw cell value; a string, a number, or a missing value.

    Returns:
        The numeric value as a float, or None when ``val`` is missing or
        cannot be interpreted as a number (e.g. "N/A", "Bootstrapped").
    """
    if pd.isna(val):
        return None

    text = str(val).replace('$', '').replace(',', '').strip()
    if not text:
        return None

    # Only a *trailing* letter counts as a magnitude suffix, so free text
    # that merely contains a B/M/K somewhere is not mistaken for an amount.
    multiplier = _MONEY_SUFFIXES.get(text[-1].upper())
    if multiplier is None:
        number, multiplier = text, 1.0
    else:
        number = text[:-1]

    try:
        return float(number) * multiplier
    except ValueError:
        # Not a number after all; report it as missing instead of raising.
        return None

# Derive a numeric companion column for each money-like text column.
for numeric_col, raw_col in (
    ('Funding_num', 'Total Funding'),
    ('Valuation_num', 'Valuation'),
    ('ARR_num', 'ARR'),
):
    df[numeric_col] = df[raw_col].apply(convert_money)

# Summary statistics for the three derived columns.
money_summary = df[['Funding_num', 'Valuation_num', 'ARR_num']].describe()
print(money_summary)


## 4. Employees vs ARR and Valuation

In [None]:

# One scatter plot per metric, each against the 'Employees' column.
for metric_col, chart_title, y_label in (
    ('ARR_num', 'Employees vs ARR', 'ARR (USD)'),
    ('Valuation_num', 'Employees vs Valuation', 'Valuation (USD)'),
):
    plt.scatter(df['Employees'], df[metric_col])
    plt.title(chart_title)
    plt.xlabel('Employees')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()


## 5. HQ Locations of Companies

In [None]:
# Keep the ten most frequent HQ values and chart them as horizontal bars.
hq_counts = df['HQ'].value_counts().head(10)
hq_ax = hq_counts.plot.barh(title='Top HQ Locations')
hq_ax.set_xlabel('Number of Companies')
plt.tight_layout()
hq_ax.grid(True)
plt.show()

## 6. Top Investors Frequency

In [None]:

# Count how often each investor name appears across all companies.
from collections import Counter

# Split on bare commas and trim each name afterwards, so "A,B" and "A ,  B"
# produce the same investor names as "A, B" rather than distinct keys.
investor_series = df['Top Investors'].dropna().astype(str).str.split(',')
all_investors = [
    name.strip()
    for names in investor_series
    for name in names
    if name.strip()  # skip empty pieces left by trailing or doubled commas
]
# An explicit dtype keeps the Series numeric even when no investors are found.
top_investor_counts = pd.Series(
    Counter(all_investors), dtype='int64'
).sort_values(ascending=False)
print(top_investor_counts.head(10))


## 7. Summary and Key Insights

In [None]:

# Print the closing observations of the case study, one bullet per line.
insight_lines = (
    "Key Insights:",
    "- Certain industries (e.g., Enterprise Software) dominate the list.",
    "- Top SaaS companies are often older firms but growing in ARR.",
    "- Valuation and ARR tend to increase with employee count.",
    "- Key investors appear across multiple top-performing companies.",
)
# The trailing newline reproduces the blank line after the last bullet.
print("\n".join(insight_lines) + "\n")
