In [None]:
# Imports for EDA

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

In [None]:
# Read the raw startups CSV into `df`; latin-1 is passed explicitly as the encoding fix

RAW_CSV_PATH = '../data/raw/startups_data.csv'
df = pd.read_csv(RAW_CSV_PATH, encoding='latin-1')

In [None]:
# First look at the frame: (rows, columns) shape, column labels, and the first 5 rows

column_names = df.columns.tolist()
first_rows = df.head()

print(f"Dataset Shape: {df.shape}")
print(f"Columns {column_names}")
print(first_rows)


In [None]:
# Data types, memory usage, amount of non-null values per column

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

In [None]:
# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# Plotting defaults: reset matplotlib to its stock style first, then apply
# seaborn's "husl" colour palette on top of it
plt.style.use('default')
sns.set_palette(palette="husl")

In [None]:
# Early Data Quality Assessment
# Summarises missing values per column, charts the worst offenders,
# counts repeated company names, and tallies column dtypes.

print("Early Data Quality Assessment")

# Null count per column, and that count as a percentage of all rows
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing_Count': missing_data.values,
    'Missing_Percentage': missing_percent.values
}).sort_values('Missing_Percentage', ascending=False)

print("\nTop 10 Columns with Most Missing Values:")
print(missing_df.head(10))

# Visualize missing data pattern

# Draw on an explicit Axes. DataFrame.plot() without `ax=` opens a figure of
# its own, so the earlier plt.figure()/plt.subplot() pair produced an empty
# figure and the requested size never reached the bar chart.
fig, ax = plt.subplots(figsize=(12, 6))
missing_df.head(15).plot(x='Column', y='Missing_Percentage', kind='bar', ax=ax)
ax.set_title('Missing Data by Column')
ax.tick_params(axis='x', rotation=45)
ax.set_ylabel('Missing Percentage')
fig.tight_layout()

# Checking for duplicate companies

# Counts every row whose `name` already appeared earlier in the frame.
# NOTE(review): pandas treats missing names as equal to one another, so rows
# with a null `name` may inflate this count — confirm before acting on it.
duplicates = df.duplicated(subset=['name']).sum()
print(f"Duplicate company names: {duplicates}")

# Data type issues

print("Data Types Summary:")
print(df.dtypes.value_counts())


In [None]:
# Duplicate Company Investigation
# Lists the most repeated company names and prints the rows behind the top one.

# Some duplicate examples

# keep=False flags every occurrence of a repeated name (not just the later
# ones); value_counts() then ranks the names by how often they occur.
is_repeated_name = df.duplicated(subset=['name'], keep=False)
duplicate_names = df.loc[is_repeated_name, 'name'].value_counts().head(10)
print("Most frequent duplicate company names:")
print(duplicate_names)

# Checking if duplicates have different data respectively

# Guard the empty case: with no repeated names, `duplicate_names.index[0]`
# would raise IndexError and abort the cell.
if duplicate_names.empty:
    print("\nNo duplicate company names to inspect.")
else:
    sample_duplicate = duplicate_names.index[0]
    duplicate_rows = df[df['name'] == sample_duplicate]
    print(f"\nExample: All rows for '{sample_duplicate}':")
    # NOTE(review): ' funding_total_usd ' is spelled with surrounding spaces,
    # presumably matching the raw CSV header — verify against df.columns.
    print(duplicate_rows[['name', 'status', 'founded_year', ' funding_total_usd ', 'country_code']].head())