In [1]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
df = pd.read_csv("space.csv")

# Display the first few rows of the dataframe to understand its structure
print(df.head())

# Check the data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics
print(df.describe())

# Check the shape of the DataFrame
print(df.shape)

   Unnamed: 0.1  Unnamed: 0 Company Name  \
0             0           0       SpaceX   
1             1           1         CASC   
2             2           2       SpaceX   
3             3           3    Roscosmos   
4             4           4          ULA   

                                            Location  \
0         LC-39A, Kennedy Space Center, Florida, USA   
1  Site 9401 (SLS-2), Jiuquan Satellite Launch Ce...   
2                      Pad A, Boca Chica, Texas, USA   
3       Site 200/39, Baikonur Cosmodrome, Kazakhstan   
4           SLC-41, Cape Canaveral AFS, Florida, USA   

                        Datum                                        Detail  \
0  Fri Aug 07, 2020 05:12 UTC  Falcon 9 Block 5 | Starlink V1 L9 & BlackSky   
1  Thu Aug 06, 2020 04:01 UTC           Long March 2D | Gaofen-9 04 & Q-SAT   
2  Tue Aug 04, 2020 23:57 UTC            Starship Prototype | 150 Meter Hop   
3  Thu Jul 30, 2020 21:25 UTC  Proton-M/Briz-M | Ekspress-80 & Ekspress-103   
4  

# Data Preprocessing 

In [2]:
# Drop the two leftover index columns and the ' Rocket' column
# (the leading space is part of that column's name in the CSV).
df.drop(['Unnamed: 0.1', 'Unnamed: 0', ' Rocket'], axis=1, inplace=True)

# The country is the last comma-separated token of 'Location'
df['Country'] = df['Location'].str.split(',').str[-1].str.strip()

# 'Location' is no longer needed once the country has been extracted
df.drop('Location', axis=1, inplace=True)

# Trailing location tokens that are not countries (test sites, seas, ...)
invalid_countries = ['Shahrud Missile Test Site', 'Yellow Sea',
                     'Pacific Missile Range Facility', 'Pacific Ocean', 'Barents Sea']

# Replace invalid country names with NaN
df['Country'] = df['Country'].replace(invalid_countries, np.nan)

# Drop rows without a usable country directly from `df`.
# The filtered frame used to be stored in a separate, never-used variable, so
# the frame displayed below (and used by later cells) still held those rows.
df.dropna(subset=['Country'], inplace=True)

# Display the updated DataFrame
print(df.head())

  Company Name                       Datum  \
0       SpaceX  Fri Aug 07, 2020 05:12 UTC   
1         CASC  Thu Aug 06, 2020 04:01 UTC   
2       SpaceX  Tue Aug 04, 2020 23:57 UTC   
3    Roscosmos  Thu Jul 30, 2020 21:25 UTC   
4          ULA  Thu Jul 30, 2020 11:50 UTC   

                                         Detail Status Rocket Status Mission  \
0  Falcon 9 Block 5 | Starlink V1 L9 & BlackSky  StatusActive        Success   
1           Long March 2D | Gaofen-9 04 & Q-SAT  StatusActive        Success   
2            Starship Prototype | 150 Meter Hop  StatusActive        Success   
3  Proton-M/Briz-M | Ekspress-80 & Ekspress-103  StatusActive        Success   
4                    Atlas V 541 | Perseverance  StatusActive        Success   

      Country  
0         USA  
1       China  
2         USA  
3  Kazakhstan  
4         USA  


In [8]:
# Pull the "<Mon> <DD>, <YYYY>" portion out of every raw 'Datum' string;
# expand=False yields a Series instead of a one-column DataFrame.
date_text = df['Datum'].str.extract(r'([a-zA-Z]+ \d{2}, \d{4})', expand=False)

# Parse the date text into timezone-aware (UTC) timestamps
df['Datum'] = pd.to_datetime(date_text, utc=True)

# Only the launch year is kept as a feature
df['Year'] = df['Datum'].dt.year
df.drop(columns='Datum', inplace=True)

# Show the result
print(df.head())

  Company Name                                        Detail Status Rocket  \
0       SpaceX  Falcon 9 Block 5 | Starlink V1 L9 & BlackSky  StatusActive   
1         CASC           Long March 2D | Gaofen-9 04 & Q-SAT  StatusActive   
2       SpaceX            Starship Prototype | 150 Meter Hop  StatusActive   
3    Roscosmos  Proton-M/Briz-M | Ekspress-80 & Ekspress-103  StatusActive   
4          ULA                    Atlas V 541 | Perseverance  StatusActive   

  Status Mission     Country  Year  
0        Success         USA  2020  
1        Success       China  2020  
2        Success         USA  2020  
3        Success  Kazakhstan  2020  
4        Success         USA  2020  


In [None]:
# The rocket name is whatever precedes the first '|' in 'Detail'
rocket_names = df['Detail'].str.split('|').str.get(0)
df['Rocket'] = rocket_names.str.strip()

# 'Detail' has served its purpose
df.drop(columns=['Detail'], inplace=True)

# Show the result
print(df.head())

In [None]:
# Number of missing entries in each column
print(df.isna().sum())

In [None]:
# Remove, in place, every row whose 'Country' is missing
df.dropna(axis=0, subset=['Country'], inplace=True)

# Re-count missing entries per column to confirm the rows are gone
print(df.isna().sum())

In [None]:
import re

# Everything that is not an ASCII letter, a digit or whitespace is removed.
_NON_ALNUM_PATTERN = r'[^a-zA-Z0-9\s]'

# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default, which would make this
# clean-up a silent no-op.

# Clean Company Name column: strip special characters, trim, title-case
df['Company Name'] = (
    df['Company Name']
    .str.replace(_NON_ALNUM_PATTERN, '', regex=True)
    .str.strip()
    .str.title()
)

# Clean Rocket column: keep the text before any '|', then apply the same
# normalisation as for the company names
df['Rocket'] = (
    df['Rocket']
    .str.split('|').str[0]
    .str.replace(_NON_ALNUM_PATTERN, '', regex=True)
    .str.strip()
    .str.title()
)

In [None]:
# Dump the distinct values of every column to spot odd entries or format drift
for column, column_values in df.items():
    unique_values = column_values.unique()
    header_lines = [
        f"Column: {column}",
        f"Number of unique values: {len(unique_values)}",
        "Unique values:",
    ]
    print("\n".join(header_lines))
    print(unique_values)
    print("\n")

# Exploratory Data Analysis

In [None]:
def _report_counts(counts, heading, *, title, xlabel, ylabel='Frequency',
                   figsize=(8, 6), grid=False, **plot_kwargs):
    """Print a table of counts and draw it as a chart.

    Parameters
    ----------
    counts : pandas.Series
        Counts to report (index = category, values = frequency).
    heading : str
        Text printed before the table, as "<heading> Distribution:".
    title, xlabel, ylabel : str
        Chart title and axis labels.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    grid : bool
        Whether to switch the grid on.
    **plot_kwargs
        Forwarded unchanged to ``counts.plot`` (e.g. ``kind``, ``color``).
    """
    print(f"{heading} Distribution:")
    print(counts)

    plt.figure(figsize=figsize)
    counts.plot(**plot_kwargs)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    if grid:
        plt.grid(True)
    plt.show()


# EDA for Company Name
company_name_counts = df['Company Name'].value_counts()
_report_counts(company_name_counts, 'Company Name',
               title='Distribution of Company Name', xlabel='Company Name',
               figsize=(12, 8), kind='bar', color='red')

# EDA for Status Rocket
status_rocket_counts = df['Status Rocket'].value_counts()
_report_counts(status_rocket_counts, 'Status Rocket',
               title='Distribution of Rocket Status', xlabel='Rocket Status',
               kind='bar', color='skyblue')

# EDA for Status Mission
status_mission_counts = df['Status Mission'].value_counts()
_report_counts(status_mission_counts, 'Status Mission',
               title='Distribution of Mission Status', xlabel='Mission Status',
               kind='bar', color='salmon')

# EDA for Country
country_counts = df['Country'].value_counts()
_report_counts(country_counts, 'Country',
               title='Distribution of Countries', xlabel='Country',
               figsize=(12, 8), kind='bar', color='lightgreen')

# EDA for Year: counts are ordered chronologically and drawn as a line
year_counts = df['Year'].value_counts().sort_index()
_report_counts(year_counts, 'Year',
               title='Number of Launches per Year', xlabel='Year',
               ylabel='Number of Launches', figsize=(12, 8), grid=True,
               kind='line', marker='o', color='orange')

# EDA for Rocket
rocket_counts = df['Rocket'].value_counts()
_report_counts(rocket_counts, 'Rocket',
               title='Distribution of Rocket Types', xlabel='Rocket Type',
               figsize=(12, 8), kind='bar', color='skyblue')

# Data Cleaning 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Dummy-encode the nominal columns ('Year' is left as a number)
categorical_columns = ['Company Name', 'Status Rocket', 'Country']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Binary target: 1 for 'Success', 0 for every other outcome
is_success = df_encoded['Status Mission'] == 'Success'
df_encoded['Status Mission'] = is_success.astype('int64')

# Integer-code the rocket names into a single column
label_encoder = LabelEncoder()
df_encoded['Rocket_encoded'] = label_encoder.fit_transform(df['Rocket'])

# Show the encoded frame
print(df_encoded.head())

In [None]:
# Modelling frame: df_encoded without the raw 'Rocket' text column
df_model = df_encoded.drop('Rocket', axis=1)

# Show the modelling frame
print(df_model.head())

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# bare; wrapping it in print() would add a stray "None" line to the output.
df_model.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the mission outcome; every other column is a feature
target_column = 'Status Mission'
y = df_model[target_column]
X = df_model.drop(columns=[target_column])

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# The scaler learns its statistics from the training rows only and is then
# applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the size of every partition
for part_name, part in (("X_train", X_train), ("X_test", X_test),
                        ("y_train", y_train), ("y_test", y_test)):
    print(f"{part_name} shape:", part.shape)

# Data Modelling 

## Logistic Regression Model 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fit logistic regression on the standardised training features
logistic_reg = LogisticRegression(max_iter=10000).fit(X_train_scaled, y_train)

# Predictions for both partitions
y_train_pred = logistic_reg.predict(X_train_scaled)
y_test_pred = logistic_reg.predict(X_test_scaled)

# Accuracy on train and test
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Precision, recall and F1 on the test partition
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
for metric_label, metric_value in (("Precision", precision),
                                   ("Recall", recall),
                                   ("F1 Score", f1)):
    print(f"{metric_label}:", metric_value)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Fit a seeded random forest on the unscaled training features
random_forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predictions for both partitions
y_train_pred_rf = random_forest.predict(X_train)
y_test_pred_rf = random_forest.predict(X_test)

# Accuracy on train and test
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print("Training Accuracy (Random Forest):", train_accuracy_rf)
print("Test Accuracy (Random Forest):", test_accuracy_rf)

# Precision, recall and F1 on the test partition
precision_rf = precision_score(y_test, y_test_pred_rf)
recall_rf = recall_score(y_test, y_test_pred_rf)
f1_rf = f1_score(y_test, y_test_pred_rf)
for metric_label, metric_value in (("Precision", precision_rf),
                                   ("Recall", recall_rf),
                                   ("F1 Score", f1_rf)):
    print(f"{metric_label} (Random Forest):", metric_value)

## Gradient Boosting Machine 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the GBM classifier.
# A fixed random_state makes the fit (and every metric below) reproducible
# across runs, matching the seeded random forest and train/test split.
gbm = GradientBoostingClassifier(random_state=42)

# Train the GBM model on the unscaled training features
gbm.fit(X_train, y_train)

# Make predictions on training and test sets
y_train_pred_gbm = gbm.predict(X_train)
y_test_pred_gbm = gbm.predict(X_test)

# Calculate accuracy
train_accuracy_gbm = accuracy_score(y_train, y_train_pred_gbm)
test_accuracy_gbm = accuracy_score(y_test, y_test_pred_gbm)

# Calculate precision, recall, and F1 score on the test set
precision_gbm = precision_score(y_test, y_test_pred_gbm)
recall_gbm = recall_score(y_test, y_test_pred_gbm)
f1_gbm = f1_score(y_test, y_test_pred_gbm)

# Print the results
print("Training Accuracy (GBM):", train_accuracy_gbm)
print("Test Accuracy (GBM):", test_accuracy_gbm)
print("Precision (GBM):", precision_gbm)
print("Recall (GBM):", recall_gbm)
print("F1 Score (GBM):", f1_gbm)

In [None]:
import pandas as pd

# Row labels of the summary table, in the order the models were trained
models = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']

# Build the table straight from the per-model scalars.
# The scalar names (train_accuracy, precision, ...) are deliberately NOT
# rebound to lists here: doing so made the cell non-idempotent, because a
# second run would wrap the previous list inside a new one.
summary_df = pd.DataFrame({
    'Model': models,
    'Training Accuracy': [train_accuracy, train_accuracy_rf, train_accuracy_gbm],
    'Test Accuracy': [test_accuracy, test_accuracy_rf, test_accuracy_gbm],
    'Precision': [precision, precision_rf, precision_gbm],
    'Recall': [recall, recall_rf, recall_gbm],
    'F1 Score': [f1, f1_rf, f1_gbm]
})

# Print the summary table
print(summary_df)

# Feature extraction

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt

# Fit logistic regression on the scaled training data
logistic_reg = LogisticRegression(max_iter=10000)
logistic_reg.fit(X_train_scaled, y_train)

# One coefficient per feature (binary problem -> single row of coef_)
coefficients = logistic_reg.coef_[0]

# Feature names come from the unscaled frame; the scaled array has no labels
feature_names = X_train.columns

# Order features from largest to smallest absolute coefficient
sorted_indices = np.argsort(np.abs(coefficients))[::-1]
sorted_feature_names = feature_names[sorted_indices]

# Bar chart of absolute coefficients, labelled with the feature names.
# Axis labels describe what is actually drawn: feature names on x and the
# absolute value of each coefficient on y.
plt.figure(figsize=(10, 6))
plt.bar(range(X_train_scaled.shape[1]), np.abs(coefficients[sorted_indices]))
plt.xlabel('Feature')
plt.ylabel('Absolute Coefficient (Log Odds)')
plt.title('Feature Importance (Logistic Regression)')
plt.xticks(range(X_train_scaled.shape[1]), sorted_feature_names, rotation=90)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# --- Logistic regression -----------------------------------------------------
# The model is fitted here before coef_ is read, so the cell does not depend
# on a fitted `logistic_reg` left over from another cell.
logistic_reg = LogisticRegression(max_iter=10000)
logistic_reg.fit(X_train_scaled, y_train)

# One coefficient per feature (binary problem -> single row of coef_)
coefficients = logistic_reg.coef_[0]

# Create a DataFrame to store feature names and their coefficients
feature_coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})

# Sort features by coefficient magnitude: key=np.abs ranks on |coefficient| so
# strongly negative features are not pushed to the bottom of the table; the
# signed value is still what is displayed.
sorted_features = feature_coef_df.sort_values(by='Coefficient', key=np.abs, ascending=False)

# Display sorted features and their coefficients
print("Sorted Features based on Coefficients (Logistic Regression):\n", sorted_features)

# --- Random forest -----------------------------------------------------------
# Seeded so the importance ranking is reproducible between runs
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Retrieve feature importance scores
importances = random_forest.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort features based on importance scores
sorted_features_rf = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display sorted features and their importance scores
print("\nSorted Features based on Importance (Random Forest):\n", sorted_features_rf)

# --- Gradient boosting -------------------------------------------------------
# Initialize and train the Gradient Boosting model
gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm.fit(X_train, y_train)

# Retrieve feature importance scores
importances_gbm = gbm.feature_importances_

# Create a DataFrame to store feature names and their importance scores
feature_importance_df_gbm = pd.DataFrame({'Feature': X.columns, 'Importance': importances_gbm})

# Sort features based on importance scores
sorted_features_gbm = feature_importance_df_gbm.sort_values(by='Importance', ascending=False)

# Display sorted features and their importance scores
print("\nSorted Features based on Importance (Gradient Boosting Machine):\n", sorted_features_gbm)

# Feature Selection

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature subset used for this model
selected_features = ["Year", "Rocket_encoded", "Company Name_Rvsn Ussr", "Country_USA"]
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Fit logistic regression on the reduced feature set
logistic_reg_selected = LogisticRegression(max_iter=10000).fit(X_train_selected, y_train)

# Score the test partition
y_test_pred_selected = logistic_reg_selected.predict(X_test_selected)
accuracy_selected = accuracy_score(y_test, y_test_pred_selected)
print("Accuracy with selected features:", accuracy_selected)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Feature subset used for this model
selected_features = ['Year', 'Rocket_encoded', 'Company Name_Us Navy', 'Country_Iran']
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Fit a seeded 100-tree random forest on the reduced feature set
random_forest_selected = RandomForestClassifier(
    n_estimators=100, random_state=42
).fit(X_train_selected, y_train)

# Score the test partition
y_test_pred_selected = random_forest_selected.predict(X_test_selected)
test_accuracy_selected = accuracy_score(y_test, y_test_pred_selected)
print("Accuracy with selected features:", test_accuracy_selected)

## Gradient Boosting Machine 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Feature subset used for this model
selected_features = ['Year', 'Rocket_encoded', 'Company Name_Isa', 'Company Name_Us Air Force']
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Fit a seeded 100-stage gradient boosting model on the reduced feature set
gbm_selected = GradientBoostingClassifier(
    n_estimators=100, random_state=42
).fit(X_train_selected, y_train)

# Score the test partition
y_test_pred_selected = gbm_selected.predict(X_test_selected)
test_accuracy_selected = accuracy_score(y_test, y_test_pred_selected)
print("Test Accuracy with selected features:", test_accuracy_selected)

# Cross Validation 

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Initialize models. The tree ensembles are seeded so the cross-validation
# scores are reproducible from run to run.
logistic_reg = LogisticRegression(max_iter=10000)  # Adjust max_iter as needed
random_forest = RandomForestClassifier(random_state=42)
gradient_boosting = GradientBoostingClassifier(random_state=42)
# NOTE(review): defined but not part of the comparison below — confirm whether
# the neural network was meant to be cross-validated too.
neural_network = MLPClassifier()

# Models to cross-validate.
# Logistic regression is wrapped in a pipeline with StandardScaler so that it
# is evaluated on standardised features (as in the other logistic-regression
# fits) and the scaler is re-fitted inside every fold, keeping validation rows
# out of the scaling statistics. The tree ensembles use the raw features.
models = {
    'Logistic Regression': make_pipeline(StandardScaler(), logistic_reg),
    'Random Forest': random_forest,
    'Gradient Boosting': gradient_boosting,
}

# 5-fold cross-validated accuracy (mean +/- standard deviation) per model
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f'{name} Cross-Validation Accuracy: {cv_scores.mean()} +/- {cv_scores.std()}')

# Best Parameters with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Search space: regularisation strength C and the solver's iteration cap
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    'max_iter': [100, 200, 300, 400, 500, 1000, 5000],
}

# Exhaustive 5-fold search, selecting by accuracy, on the scaled training data
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test set
best_logistic_reg = grid_search.best_estimator_
y_test_pred = best_logistic_reg.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy (after hyperparameter tuning):", test_accuracy)

# Visualization

## Mission success and failures by Year and Rocket 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two side-by-side panels: outcomes by year (left) and by rocket (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
year_ax, rocket_ax = axes

# Left: stacked histogram of launch years, split by mission outcome
sns.histplot(data=df_model, x='Year', hue='Status Mission', multiple='stack', ax=year_ax)
year_ax.set_title('Mission Success by Year')

# Right: per-rocket counts split by mission outcome; tick labels are turned
# vertical for readability
sns.countplot(data=df_model, x='Rocket_encoded', hue='Status Mission', ax=rocket_ax)
rocket_ax.set_title('Mission Success by Rocket')
rocket_ax.tick_params(axis='x', rotation=90)

# Render
plt.tight_layout()
plt.show()

## Relationship between Russia and Kazakhstan and Number of launches by year 

In [None]:
import matplotlib.pyplot as plt

# Rows launched from Russia and from Kazakhstan
russia_df = df_model.loc[df_model['Country_Russia'] == 1]
kazakhstan_df = df_model.loc[df_model['Country_Kazakhstan'] == 1]

# Launches per year for each country, in chronological order
russia_launches_by_year = russia_df['Year'].value_counts().sort_index()
kazakhstan_launches_by_year = kazakhstan_df['Year'].value_counts().sort_index()

# One line per country on a shared set of axes
plt.figure(figsize=(12, 6))
for country_label, yearly_launches in (('Russia', russia_launches_by_year),
                                       ('Kazakhstan', kazakhstan_launches_by_year)):
    plt.plot(yearly_launches.index, yearly_launches.values, label=country_label)
plt.title('Number of Launches by Year for Russia and Kazakhstan')
plt.xlabel('Year')
plt.ylabel('Number of Launches')
plt.legend()
plt.grid(True)
plt.show()

## Total number of launches per year

In [None]:
# Launch counts per year, ordered chronologically
launches_per_year = df_model['Year'].value_counts().sort_index()

# Line chart of the yearly totals
plt.figure(figsize=(12, 6))
sns.lineplot(x=launches_per_year.index, y=launches_per_year.values)
plt.title('Number of Launches per Year')
plt.xlabel('Year')
plt.ylabel('Number of Launches')
plt.show()

## USA 

In [None]:
# Launches flagged as USA in the encoded frame
usa_df = df_encoded.loc[df_encoded['Country_USA'] == 1]

# Launch count per rocket among the USA rows
usa_rocket_counts = usa_df['Rocket'].value_counts()

# The rocket with the highest launch count, and that count
most_used_rocket = usa_rocket_counts.idxmax()
print("USA's most used rocket:", most_used_rocket)
print("Number of Launches for", most_used_rocket, ":", usa_rocket_counts.loc[most_used_rocket])

In [None]:
# Minimum number of launches a rocket must exceed to be ranked. The filter
# used to be `> 30` while the comments and the printed message said 50; a
# single constant now drives both the filter and the message.
MIN_LAUNCHES = 50

# Filter the DataFrame for launches from the USA
usa_df = df_encoded[df_encoded['Country_USA'] == 1]

# Number of launches for each rocket launched by the USA
usa_rocket_counts = usa_df['Rocket'].value_counts()

# Keep only rockets with more than MIN_LAUNCHES launches
usa_rocket_counts = usa_rocket_counts[usa_rocket_counts > MIN_LAUNCHES]

# Restrict the USA rows to those rockets
usa_df_filtered = usa_df[usa_df['Rocket'].isin(usa_rocket_counts.index)]

# Success rate per rocket: mean of the 0/1 'Status Mission' column
usa_rocket_success_rate = usa_df_filtered.groupby('Rocket')['Status Mission'].mean()

# The three rockets with the highest success rate
top_usa_rockets = usa_rocket_success_rate.nlargest(3)
print(f"Top 3 rockets launched by the USA with more than {MIN_LAUNCHES} launches:")
print(top_usa_rockets)

## Russia 

In [None]:
# Launches flagged as Russia in the encoded frame
russia_df = df_encoded.loc[df_encoded['Country_Russia'] == 1]

# Launch count per rocket among the Russia rows
rocket_counts = russia_df['Rocket'].value_counts()

# The rocket with the highest launch count, and that count
most_used_rocket = rocket_counts.idxmax()
print("Russia's most used rocket:", most_used_rocket)
print("Number of Launches for", most_used_rocket, ":", rocket_counts.loc[most_used_rocket])

In [None]:
# Launches flagged as Russia in the encoded frame
russia_df = df_encoded.loc[df_encoded['Country_Russia'] == 1]

# Success rate per rocket: mean of the 0/1 'Status Mission' column
rocket_success_rate = russia_df.groupby('Rocket')['Status Mission'].mean()

# Keep only the rockets with more than 50 launches
rocket_counts = russia_df['Rocket'].value_counts()
frequent_rockets = rocket_counts.index[rocket_counts > 50]
rocket_success_rate = rocket_success_rate[rocket_success_rate.index.isin(frequent_rockets)]

# The three highest success rates among the remaining rockets
top_3_rockets = rocket_success_rate.nlargest(3)

print("Top 3 rockets in Russia with more than 50 launches:")
print(top_3_rockets)

## Relationship between NASA, USSR and SpaceX

In [None]:
import matplotlib.pyplot as plt

# One-hot company indicator columns and the 0/1 mission outcome
nasa_flag = df_encoded['Company Name_Nasa']
rvsn_flag = df_encoded['Company Name_Rvsn Ussr']
spacex_flag = df_encoded['Company Name_Spacex']
mission_outcome = df_encoded['Status Mission']

# Success rate per company: mean outcome over that company's launches
nasa_success_rate = mission_outcome[nasa_flag == 1].mean()
rvsn_success_rate = mission_outcome[rvsn_flag == 1].mean()
spacex_success_rate = mission_outcome[spacex_flag == 1].mean()

# Launch count per company: sum of the indicator column
nasa_launches = nasa_flag.sum()
rvsn_launches = rvsn_flag.sum()
spacex_launches = spacex_flag.sum()

# Text report
print("NASA success rate:", nasa_success_rate)
print("Number of launches by NASA:", nasa_launches)
print("\nRVSN USSR success rate:", rvsn_success_rate)
print("Number of launches by RVSN USSR:", rvsn_launches)
print("\nSpaceX success rate:", spacex_success_rate)
print("Number of launches by SpaceX:", spacex_launches)

# Bar chart of the three success rates on a fixed 0-1 scale
company_labels = ['NASA', 'RVSN USSR', 'SpaceX']
company_rates = [nasa_success_rate, rvsn_success_rate, spacex_success_rate]
plt.bar(company_labels, company_rates)
plt.title('\nComparison of Success Rates: NASA vs RVSN USSR vs SpaceX')
plt.xlabel('Company')
plt.ylabel('Success Rate')
plt.ylim(0, 1)
plt.show()