In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the scraped startup sample (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/german_data_startups_sample.csv')

# Preview the first five rows; as the cell's last expression it is rendered by the notebook
df.head(n=5)

In [None]:
# Per-column share of missing (NaN) entries, in percent of the total row count:
# count the nulls in each column, divide by the number of rows, scale to 0-100
missing_data = df.isnull().sum().div(len(df)).mul(100)

# Last expression of the cell: renders the percentage for every column
missing_data

In [None]:
# Startups per city: drop rows without a city, count occurrences,
# and order the counts from the most to the least frequent city
city_counts = (
    df['Based in']
    .dropna()
    .value_counts()
    .sort_values(ascending=False)
)

# Only the eight most frequent cities are charted
top_cities = city_counts.iloc[:8]

# Open a fresh 10x5 inch figure; the pandas plot below draws onto its current axes
plt.figure(figsize=(10, 5))

# Bar chart of the eight cities; keep the returned Axes to label it directly
bar_axes = top_cities.plot.bar(color='skyblue', edgecolor='black')

# Title and axis labels in one call
bar_axes.set(
    title="Number of Startups per City",
    xlabel='City',
    ylabel='Total startups',
)

# Slant the city names and right-align them so each label ends under its bar
plt.xticks(rotation=45, ha='right')

# Fit labels and title inside the figure area
plt.tight_layout()

# Horizontal grid lines only, to make bar heights easier to compare
bar_axes.grid(axis='y')

# Render the figure
plt.show()

In [None]:
# Normalise missing LinkedIn entries (empty strings or NaN) to the sentinel 'no info'.
# NOTE: this overwrites df['LinkedIn'] in place, so the column holds no NaN after this cell.
df['LinkedIn'] = df['LinkedIn'].replace('', 'no info').fillna('no info')

# Classify every startup by LinkedIn presence and count each class.
# The result is a standalone Series of counts; no column is added to df.
linkedin_counts = (
    df['LinkedIn']
    .ne('no info')
    .map({True: 'Has LinkedIn', False: 'No LinkedIn'})
    .value_counts()
)

# Set the size of the donut chart
plt.figure(figsize=(6, 6))

# value_counts() orders the classes by frequency, so a fixed colour list would
# swap colours whenever 'No LinkedIn' is the larger group. Pick each colour by
# label instead, so 'Has LinkedIn' is always green and 'No LinkedIn' always amber.
color_by_label = {'Has LinkedIn': '#4CAF50', 'No LinkedIn': '#FFC107'}
colors = [color_by_label[label] for label in linkedin_counts.index]

# Create a donut chart (pie chart with a hole in the middle)
wedges, texts, autotexts = plt.pie(
    linkedin_counts,
    labels=linkedin_counts.index,   # Labels: Has LinkedIn / No LinkedIn
    autopct='%1.1f%%',              # Print each share with one decimal place
    colors=colors,                  # Same order as linkedin_counts.index
    startangle=90,                  # Start the first wedge at the top
    wedgeprops={'width': 0.4}       # Ring width < 1 leaves the hole in the middle
)

# Add a title to the chart
plt.title("LinkedIn Presence in Startups")

# Adjust layout to fit everything nicely
plt.tight_layout()

# Display the chart
plt.show()

In [None]:
# Parse 'Founded' as numbers; values that cannot be parsed become NaN (errors='coerce')
df['Founded'] = pd.to_numeric(df['Founded'], errors='coerce')

# Denominator: every row of df, including rows whose founding year is NaN
total_startups = len(df)

# Numerator: rows with a founding year strictly greater than 2015
# (NaN never satisfies the comparison, so those rows are not counted)
founded_after_2015 = len(df.loc[df['Founded'].gt(2015)])

# Share of all startups founded after 2015, in percent
percentage_founded_after_2015 = founded_after_2015 / total_startups * 100
percentage_founded_after_2015