### Import modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Set number of columns to show 
pd.set_option('display.max_rows', 100)
# plt.style.use("ggplot")
# pd.set_option('max_columns', None)

### Read CSV file

In [2]:
df = pd.read_csv('space_missions.csv', encoding='latin-1')

In [3]:
df.head(13)

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,,Success
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,,Success
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,16:44:00,Vanguard,Vanguard TV3,Retired,,Failure
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,03:48:00,Juno I,Explorer 1,Retired,,Success
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,07:33:00,Vanguard,Vanguard TV3BU,Retired,,Failure
5,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-03-05,18:27:00,Juno I,Explorer 2,Retired,,Failure
6,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-03-17,12:15:00,Vanguard,Vanguard 1,Retired,,Success
7,AMBA,"LC-5, Cape Canaveral AFS, Florida, USA",1958-03-26,17:38:00,Juno I,Explorer 3,Retired,,Success
8,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1958-04-27,09:01:00,Sputnik 8A91,Sputnik-3 #1,Retired,,Failure
9,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-04-28,02:53:00,Vanguard,Vanguard TV5,Retired,,Failure


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4630 entries, 0 to 4629
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company        4630 non-null   object
 1   Location       4630 non-null   object
 2   Date           4630 non-null   object
 3   Time           4503 non-null   object
 4   Rocket         4630 non-null   object
 5   Mission        4630 non-null   object
 6   RocketStatus   4630 non-null   object
 7   Price          1265 non-null   object
 8   MissionStatus  4630 non-null   object
dtypes: object(9)
memory usage: 325.7+ KB


### Data preparation Functions


In [5]:
# Convert data type function
def set_data(df):
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    df['Time'] = pd.to_datetime(df['Time'].astype(str))
    df['Price'] = pd.to_numeric(df['Price'].str.replace(',', ''))
#   df['RocketStatus'] = df['RocketStatus'].astype('bool') <=== Optional
    df['Year'] = pd.to_datetime(df['Date']).dt.year
    df['Country'] = df['Location'].apply(lambda x: x.split(',')[-1].strip())
    return df

# Duplicate data function
def remove_deuplicate(df):
    df = df.loc[~df.duplicated(subset= ['Date', 'Rocket', 'Mission'])].reset_index(drop=True)
    return df

# Remove None values 
def remove_none(df):
    df['Price'] = df['Price'].replace(',', '').fillna('0')
    df['Time'] = df['Time'].replace('0', '00:00:00')
    return df

### Initiate Data preparation


In [6]:
df = remove_none(df)

In [7]:
df.isna().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4630 entries, 0 to 4629
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company        4630 non-null   object
 1   Location       4630 non-null   object
 2   Date           4630 non-null   object
 3   Time           4503 non-null   object
 4   Rocket         4630 non-null   object
 5   Mission        4630 non-null   object
 6   RocketStatus   4630 non-null   object
 7   Price          4630 non-null   object
 8   MissionStatus  4630 non-null   object
dtypes: object(9)
memory usage: 325.7+ KB


In [8]:
df = remove_deuplicate(df)

In [9]:
df.duplicated().sum()

0

In [10]:
df.shape

(4629, 9)

In [11]:
df = set_data(df)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4629 entries, 0 to 4628
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Company        4629 non-null   object        
 1   Location       4629 non-null   object        
 2   Date           4629 non-null   object        
 3   Time           4502 non-null   datetime64[ns]
 4   Rocket         4629 non-null   object        
 5   Mission        4629 non-null   object        
 6   RocketStatus   4629 non-null   object        
 7   Price          4629 non-null   float64       
 8   MissionStatus  4629 non-null   object        
 9   Year           4629 non-null   int64         
 10  Country        4629 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 397.9+ KB


# General Statistics

In [13]:
df.head()

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus,Year,Country
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,2023-03-08 19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,0.0,Success,1957,Kazakhstan
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,2023-03-08 02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,0.0,Success,1957,Kazakhstan
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,2023-03-08 16:44:00,Vanguard,Vanguard TV3,Retired,0.0,Failure,1957,USA
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,2023-03-08 03:48:00,Juno I,Explorer 1,Retired,0.0,Success,1958,USA
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,2023-03-08 07:33:00,Vanguard,Vanguard TV3BU,Retired,0.0,Failure,1958,USA


In [14]:

top = df.head(10)
top

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus,Year,Country
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,2023-03-08 19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,0.0,Success,1957,Kazakhstan
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,2023-03-08 02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,0.0,Success,1957,Kazakhstan
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,2023-03-08 16:44:00,Vanguard,Vanguard TV3,Retired,0.0,Failure,1957,USA
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,2023-03-08 03:48:00,Juno I,Explorer 1,Retired,0.0,Success,1958,USA
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,2023-03-08 07:33:00,Vanguard,Vanguard TV3BU,Retired,0.0,Failure,1958,USA
5,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-03-05,2023-03-08 18:27:00,Juno I,Explorer 2,Retired,0.0,Failure,1958,USA
6,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-03-17,2023-03-08 12:15:00,Vanguard,Vanguard 1,Retired,0.0,Success,1958,USA
7,AMBA,"LC-5, Cape Canaveral AFS, Florida, USA",1958-03-26,2023-03-08 17:38:00,Juno I,Explorer 3,Retired,0.0,Success,1958,USA
8,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1958-04-27,2023-03-08 09:01:00,Sputnik 8A91,Sputnik-3 #1,Retired,0.0,Failure,1958,Kazakhstan
9,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-04-28,2023-03-08 02:53:00,Vanguard,Vanguard TV5,Retired,0.0,Failure,1958,USA


In [15]:
### Get latitudes and longitudes
from geopy.geocoders import Nominatim

def get_lat_long(locations):
    # Create geolocator object
    geolocator = Nominatim(user_agent="my_app")

    # Create empty lists to store latitudes and longitudes
    latitudes = []
    longitudes = []

    # Iterate through locations
    for location in locations:
        # Use geolocator to get latitude and longitude
        try:
            location = geolocator.geocode(location, timeout=10)
            latitudes.append(location.latitude)
            longitudes.append(location.longitude)
        except:
            latitudes.append(None)
            longitudes.append(None)
        
    top['Latitude'] = latitudes
    top['Longitude'] = longitudes

    return

get_lat_long(top['Country'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top['Latitude'] = latitudes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top['Longitude'] = longitudes


In [16]:
top

Unnamed: 0,Company,Location,Date,Time,Rocket,Mission,RocketStatus,Price,MissionStatus,Year,Country,Latitude,Longitude
0,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-10-04,2023-03-08 19:28:00,Sputnik 8K71PS,Sputnik-1,Retired,0.0,Success,1957,Kazakhstan,48.101295,66.778082
1,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1957-11-03,2023-03-08 02:30:00,Sputnik 8K71PS,Sputnik-2,Retired,0.0,Success,1957,Kazakhstan,48.101295,66.778082
2,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1957-12-06,2023-03-08 16:44:00,Vanguard,Vanguard TV3,Retired,0.0,Failure,1957,USA,39.78373,-100.445882
3,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-02-01,2023-03-08 03:48:00,Juno I,Explorer 1,Retired,0.0,Success,1958,USA,39.78373,-100.445882
4,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-02-05,2023-03-08 07:33:00,Vanguard,Vanguard TV3BU,Retired,0.0,Failure,1958,USA,39.78373,-100.445882
5,AMBA,"LC-26A, Cape Canaveral AFS, Florida, USA",1958-03-05,2023-03-08 18:27:00,Juno I,Explorer 2,Retired,0.0,Failure,1958,USA,39.78373,-100.445882
6,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-03-17,2023-03-08 12:15:00,Vanguard,Vanguard 1,Retired,0.0,Success,1958,USA,39.78373,-100.445882
7,AMBA,"LC-5, Cape Canaveral AFS, Florida, USA",1958-03-26,2023-03-08 17:38:00,Juno I,Explorer 3,Retired,0.0,Success,1958,USA,39.78373,-100.445882
8,RVSN USSR,"Site 1/5, Baikonur Cosmodrome, Kazakhstan",1958-04-27,2023-03-08 09:01:00,Sputnik 8A91,Sputnik-3 #1,Retired,0.0,Failure,1958,Kazakhstan,48.101295,66.778082
9,US Navy,"LC-18A, Cape Canaveral AFS, Florida, USA",1958-04-28,2023-03-08 02:53:00,Vanguard,Vanguard TV5,Retired,0.0,Failure,1958,USA,39.78373,-100.445882


In [17]:
## map

In [19]:
import folium
from geopy.geocoders import Nominatim

# create a map centered on the first location in the DataFrame
geolocator = Nominatim(user_agent='my_map')
location = geolocator.geocode(df['Country'][0])
map = folium.Map(location=[location.latitude, location.longitude], zoom_start=2)

# add markers for each location in the DataFrame
for loc in df['Country']:
    try:
        location = geolocator.geocode(loc)
        folium.Marker(location=[location.latitude, location.longitude], popup=loc).add_to(map)
    except:
        continue

# display the map
map

In [None]:
mperyr = df.groupby(['Year'])['Mission'].count().plot(figsize= (10,5))

In [None]:
df.groupby(['MissionStatus'])['Mission'].count().plot(kind='bar',figsize= (5,5))

In [None]:
grouped = df.groupby(['Year', 'Country'])['Mission'].count().reset_index()

# Pivot the data so that each country has a separate column
pivot = grouped.pivot(index='Year', columns='Country', values='Mission').fillna(0)

# Plot the data
plt.figure(figsize=(20,10))
for c in pivot.columns:
    plt.plot(pivot.index, pivot[c], label=c)
plt.legend()
plt.xlabel('Year')
plt.ylabel('Number of Missions')
plt.title('Space Mission Trends from 1957 to 2022')
plt.show()

In [None]:
no_of_mission=df['Year'].value_counts().sort_index().head(100).plot(kind='bar', 
                                                                    figsize=(20,10), label='Number of Space Missions',
                                                                    title='Number of Sapce missions each year from 1957 to 2022')
plt.legend()
plt.show()

In [None]:
no_of_mission=df['Country'].value_counts().head(10).sort_values().plot(kind='barh',
                                                                       figsize=(20,10), 
                                                                       label='Number of Space Missions',
                                                                       title='Top Countries with Most Missions', xlabel='Years', ylabel='Countries' )      
plt.legend()
plt.show()

In [None]:
# Top 5 Countries with high space missions
df['Country'].value_counts().head(5).plot(
    kind='bar', 
    figsize=(10,5), 
    title ='Top 5 Countries in Space Race')

plt.show()

# USA SPACE STATS

In [None]:
usa = df['Location'].str[-3:].isin(['USA'])
df['Country'].str.strip(',').isin(['USA'])

In [None]:
# USA Top Space agencies
usa = df.loc[df['Country'].str.strip(',').isin(['USA'])].reset_index(drop=True)
usa.groupby(usa['Company'])['Mission'].count().sort_values(ascending=False).head(10).plot(
    kind='bar', 
    figsize=(7,5),
    xlabel='Companies',
    ylabel='Number of Missions',
    title='Top 5 active Space agencies in USA')

In [None]:
# General Dynamics statistics
gdynamics = usa.query('Company == "General Dynamics"')
gdynamics.groupby(['Year'])['Mission'].count().plot(
    kind='bar', 
    figsize=(10,5), title='General Dynamics Missions Timeline')

plt.xticks(rotation=90)


# Show the plot
plt.show()


In [None]:
# Nasa Statistics
nasa = usa.query('Company == "NASA"')
nasa.groupby(['Year'])['Mission'].count().plot(
    kind='bar', 
    figsize=(10,5), 
    title='NASA Missions Timeline')

plt.xticks(rotation=90)


# Show the plot
plt.show()



# Russian and Kazakhstan

In [None]:
# Russian and Kazakhstan
russiak = df.loc[df['Country'].isin(['Russia', 'Kazakhstan'])].reset_index(drop=True)
russiak.groupby(russia['Company'])['Mission'].count().sort_values(ascending=False).head(5).plot(
    kind='bar', 
    figsize=(7,5),
    xlabel='Companies',
    ylabel='Number of Missions',
    title='Top 5 active Space agencies in Russia and Kazakhstan')

In [None]:
# Mission timeline # Russia and Kazakhstan
russiaKm = df.loc[df['Country'].isin(['Russia', 'Kazakhstan'])].reset_index(drop=True)
russiaKm.groupby(russia['Year'])['Mission'].count().head(100).plot(
    kind='bar', 
    figsize=(10,5),
    xlabel='Companies',
    ylabel='Number of Missions',
    title='Russia and Kazakhstan Space Mission Timeline')
plt.show()

In [None]:
# USA
USA = df.query('Company == "SpaceX"|Company =="Blue Origin"|Company =="US Air Force"|Company =="US Navy"').reset_index(drop=True)
USA.shape

In [None]:
# Number of Space Missions in USA and Russia

# Filter the data for companies from the USA
usa_df = df[df['Location'].str.contains('USA')]
plt.figure(figsize=(20, 5))
# Filter the data for companies from Russia
russia_df = df[df['Location'].str.contains('Russia')]

# Count the number of missions for each company in the USA data
usa_mission_counts = usa_df['Year'].value_counts().sort_index()


# Count the number of missions for each company in the Russia data
russia_mission_counts = russia_df['Year'].value_counts().sort_index()


# Plot a bar graph of the mission counts for the USA companies
usa_mission_counts.plot(kind='bar', color='blue', alpha=0.5, label='USA', xlabel='', ylabel='Number of Missions')

# Plot a bar graph of the mission counts for the Russia companies on the same plot
russia_mission_counts.plot(kind='bar', color='red', alpha=0.5, label='Russia', xlabel='', ylabel='Number of Missions')

# Add a legend to the plot
plt.legend()
plt.title('USA Vs Russia Space Missions comparision')

# Show the plot
plt.show()



In [None]:
# Group the data by country
grouped = df.groupby(df['Location'].str.contains('USA').map({True: 'USA', False: 'Russia'}))

# Count the number of missions for each country
mission_counts = grouped['Company'].count()

# Plot a bar graph of the mission counts for each country
mission_counts.plot(kind='bar', color=['blue', 'red'], xlabel='Country', ylabel='Number of Missions')

# Show the plot
plt.show()

# Most Active Sites

In [None]:

# Count the number of missions for each country
site_mission_counts = df['Location'].value_counts()

# Select the top 10 countries
top_10_site = site_mission_counts.head(10)

# Sort the top 10 countries by number of missions
top_10_site = top_10_site.sort_values(ascending=True)

# Set the figure size
plt.figure(figsize=(10, 6))

# Plot a bar graph of the mission counts for the top 10 countries
top_10_site.plot(kind='barh', xlabel='Missions', ylabel='Locations', title='Top 10 Sites with Most Missions')

# Show the plot
plt.show()

In [None]:
usa_df = df[df["Location"].str.contains("USA")]

# Use the value_counts method to determine the most common locations
most_common_locations = usa_df["Location"].value_counts().nlargest(10).sort_values()

# Plot the results
fig, ax = plt.subplots()
most_common_locations.plot(kind="barh", ax=ax, figsize=(10,5))
ax.set_ylabel("Number of Missions")
plt.title("Top Space Mission launch site in the USA")
plt.show()

In [None]:
# Filter the data to only include locations in Russia
russia_df = df.loc[df['Location'].isin(['Russia', 'Kazakhstan'])]

# Use the value_counts method to determine the most common locations
most_common_locations = russia_df["Location"].value_counts().sort_values(ascending=False).nlargest(10)
most_common_locations.head()
# Plot the results
# fig, ax = plt.subplots()
# most_common_locations.plot(kind="bar", ax=ax, figsize=(10,5))
# ax.set_ylabel("Number of Missions")
# plt.title("Top Used Space mission launch site in Russia")
# plt.show()

In [None]:
# Mission Status Comparision
x = df['Year']
y = df['MissionStatus']
plt.scatter(x, y, s=(7), vmin=None, vmax=None, label='Missions', linewidths=0.3)
plt.xlabel('Year')
plt.ylabel('Status')
plt.title('Outcome Comparisions')
plt.legend()
plt.show()

In [None]:
toproc = df['Rocket'].value_counts().head(10).sort_values().plot(kind= 'barh', 
                                                                 fontsize=5, figsize= (5,10))
toproc.set_ylabel('Models')
# toproc.set_ylabel('Number of Missions')

In [None]:
df['Company'].value_counts().head(10).sort_values().plot(kind='barh', title='Number of Space Mission by Agencies')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(20,12))
scatter = ax.scatter(df['Year'], df['Company'], alpha=0.3)

plt.show()