In [None]:
# Import required libraries
import pandas as pd
import re

# Load your CSV dataset
csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Astronauts.csv'
df = pd.read_csv(csv_path)

# Handle missing values.
# The filled column is assigned back instead of calling
# df['mission_title'].fillna(..., inplace=True): that chained in-place form
# acts on an intermediate object, emits a FutureWarning, and no longer
# updates the frame from pandas 3.0 onwards.
df['mission_title'] = df['mission_title'].fillna('Unknown')

# Drop unnecessary 'id' column if present (duplicates 'number').
# errors='ignore' turns the drop into a no-op when the column is absent,
# and returning a new frame avoids mutating in place.
df = df.drop(columns=['id'], errors='ignore')

# Normalise whitespace in a text field
def clean_text(text):
    """Return ``str(text)`` with every whitespace run collapsed to a single
    space and leading/trailing whitespace removed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Turn one astronaut row into a single structured text chunk
def generate_text_chunk(row):
    """Render one record as a run of labelled sentences.

    ``row`` must support lookup by the keys 'number', 'name', 'nationality',
    'mission_title', 'hours_mission', 'eva_hrs_mission', 'total_eva_hrs' and
    'total_hrs_sum'. The assembled text is passed through ``clean_text``
    before it is returned.
    """
    sentences = [
        f"Astronaut ID: {row['number']}.",
        f"Name: {row['name']}.",
        f"Nationality: {row['nationality']}.",
        f"Mission Title: {row['mission_title']}.",
        f"Mission Duration: {row['hours_mission']} hours.",
        f"EVA Duration: {row['eva_hrs_mission']} hours.",
        f"Total EVA Hours: {row['total_eva_hrs']} hours.",
        f"Total Mission Hours: {row['total_hrs_sum']} hours.",
    ]
    # Sentences are separated by exactly one space before normalisation.
    return clean_text(" ".join(sentences))

# Build the 'text_chunk' column by rendering every row of the frame
df['text_chunk'] = df.apply(func=generate_text_chunk, axis=1)

# Write the enriched frame (index omitted) to a new CSV file on Google Drive
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Astronauts_Cleaned.csv'
df.to_csv(path_or_buf=output_csv_path, index=False)

# Sanity check: show the first three generated chunks
print(df.loc[:, ['number', 'text_chunk']].head(3))

print(f"\nCleaned data saved successfully at:\n{output_csv_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   number                                         text_chunk
0       1  Astronaut ID: 1. Name: Gagarin, Yuri. National...
1       2  Astronaut ID: 2. Name: Titov, Gherman. Nationa...
2       3  Astronaut ID: 3. Name: Glenn, John H., Jr.. Na...

Cleaned data saved successfully at:
/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Astronauts_Cleaned.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['mission_title'].fillna('Unknown', inplace=True)


In [None]:
# Import libraries
import pandas as pd
import re

# Load dataset from Google Drive
csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Exoplanets.csv'
df_exo = pd.read_csv(csv_path)

# Inspect for missing values
print(df_exo.isnull().sum())

# Handle missing values by filling every gap with the 'Unknown' placeholder.
# The filled frame is assigned back rather than filled in place: writing a
# string into numeric columns in place is the silent-upcast pattern pandas
# has deprecated (this cell's recorded output shows a warning raised from
# the in-place call), and it makes the cell harder to re-run safely.
df_exo = df_exo.fillna('Unknown')

# Normalise whitespace in a text field
def clean_text(text):
    """Return ``str(text)`` with every whitespace run collapsed to a single
    space and leading/trailing whitespace removed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Create structured text chunks
# NOTE(review): this list is never populated in this cell; the name is kept
# only so anything that already refers to it keeps working.
text_chunks_exo = []


def _is_unknown(value):
    """Return True when ``value`` is NaN/None, blank, or the 'Unknown' placeholder."""
    if isinstance(value, str):
        return value.strip() in ('', 'Unknown')
    return bool(pd.isna(value))


def _describe_relative(multiplier, reference, unit):
    """Format a '<multiplier> <reference> <unit>' measurement.

    Returns 'Unknown' when either the multiplier or the reference is missing.
    """
    if _is_unknown(multiplier) or _is_unknown(reference):
        return 'Unknown'
    return f"{multiplier} {reference} {unit}"


def generate_exoplanet_chunk(row):
    """Render one exoplanet row as a whitespace-normalised text chunk.

    The loaded file exposes 'detection_method', 'mass_multiplier'/'mass_wrt'
    and 'radius_multiplier'/'radius_wrt' (see the null-count listing printed
    by this cell); it has no 'detection', 'mass' or 'radius' columns, so
    looking those names up always produced 'Unknown'. 'detection' is still
    consulted as a fallback for files that use the older column name.

    Args:
        row: A mapping-like row (e.g. a pandas Series) supporting ``[]`` and
            ``.get``; 'name', 'distance' and 'discovery_year' are required.

    Returns:
        The chunk text after ``clean_text`` normalisation.
    """
    # assumes mass_wrt / radius_wrt hold the name of the reference body the
    # multiplier is relative to ("with respect to") — TODO confirm against data
    mass = _describe_relative(
        row.get('mass_multiplier', 'Unknown'), row.get('mass_wrt', 'Unknown'), 'masses'
    )
    radius = _describe_relative(
        row.get('radius_multiplier', 'Unknown'), row.get('radius_wrt', 'Unknown'), 'radii'
    )
    detection = row.get('detection_method', row.get('detection', 'Unknown'))

    chunk = f"""
    Exoplanet Name: {row['name']}.
    Distance from Earth: {row['distance']} light years.
    Stellar System: {row.get('stellar_system', 'Unknown')}.
    Discovery Method: {detection}.
    Year of Discovery: {row['discovery_year']}.
    Orbital Period: {row.get('orbital_period', 'Unknown')} days.
    Planet Mass: {mass}.
    Planet Radius: {radius}.
    """
    return clean_text(chunk)


# Build the whole column in one assignment instead of writing cell-by-cell
# with .at while iterating over the frame.
df_exo['text_chunk'] = df_exo.apply(generate_exoplanet_chunk, axis=1)

# Save cleaned data as CSV
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Exoplanets_Cleaned.csv'
df_exo.to_csv(output_csv_path, index=False)

# Verify chunks
print(df_exo[['name', 'text_chunk']].head(3))


name                   0
distance              17
stellar_magnitude    161
planet_type            0
discovery_year         0
mass_multiplier       23
mass_wrt              23
radius_multiplier     17
radius_wrt            17
orbital_radius       289
orbital_period         0
eccentricity           0
detection_method       0
dtype: int64


  df_exo.fillna('Unknown', inplace=True)


                   name                                         text_chunk
0  11 Comae Berenices b  Exoplanet Name: 11 Comae Berenices b. Distance...
1    11 Ursae Minoris b  Exoplanet Name: 11 Ursae Minoris b. Distance f...
2       14 Andromedae b  Exoplanet Name: 14 Andromedae b. Distance from...


In [None]:
# Import libraries
import pandas as pd
import re

# Load Planets.csv dataset from Google Drive
csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/planets.csv'
df_planets = pd.read_csv(csv_path)

# Inspect missing values briefly
print(df_planets.isnull().sum())

# Fill missing values with placeholders ('Unknown').
# The filled frame is assigned back rather than filled in place: writing a
# string into numeric columns in place is the silent-upcast pattern pandas
# has deprecated (this cell's recorded output shows a warning raised from
# the in-place call), and it makes the cell harder to re-run safely.
df_planets = df_planets.fillna('Unknown')

# Normalise whitespace in a text field
def clean_text(text):
    """Return ``str(text)`` with every whitespace run collapsed to a single
    space and leading/trailing whitespace removed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Generate meaningful text chunks
def generate_planet_chunk(row):
    """Render one planets.csv row as a whitespace-normalised text chunk.

    Args:
        row: A mapping-like row (e.g. a pandas Series) providing the
            'pl_*' and 'st_*' keys referenced in the template below.

    Returns:
        The chunk text after ``clean_text`` normalisation.
    """
    chunk = f"""
    Host Name: {row['pl_hostname']}.
    Planet Letter: {row['pl_letter']}.
    Discovery Method: {row['pl_discmethod']}.
    Number of Planets in System: {row['pl_pnum']}.
    Orbital Period: {row['pl_orbper']} days.
    Orbital Semi-major Axis: {row['pl_orbsmax']} AU.
    Orbital Eccentricity: {row['pl_orbeccen']}.
    Orbital Inclination: {row['pl_orbincl']} degrees.
    Planet Mass: {row['pl_bmassj']} Jupiter masses.
    Planet Radius: {row['pl_radj']} Jupiter radii.
    Stellar Distance: {row['st_dist']} parsecs.
    Stellar Magnitude: {row['st_optmag']}.
    Stellar Temperature: {row['st_teff']} K.
    Stellar Mass: {row['st_mass']} Solar masses.
    Stellar Radius: {row['st_rad']} Solar radii.
    """
    return clean_text(chunk)


# Build the whole column in one assignment instead of writing cell-by-cell
# with .at while iterating over the frame (same pattern as the other
# chunk-building cells that use a function plus DataFrame.apply).
df_planets['text_chunk'] = df_planets.apply(generate_planet_chunk, axis=1)

# Save cleaned dataset as CSV
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/Planets_Cleaned.csv'
df_planets.to_csv(output_csv_path, index=False)

# Display first few chunks for verification
print(df_planets[['pl_hostname', 'text_chunk']].head(3))


rowid              0
pl_hostname        0
pl_letter          0
pl_discmethod      0
pl_pnum            0
                ... 
st_raderr1       417
st_raderr2       494
st_radlim        358
st_radblend      187
rowupdate          0
Length: 67, dtype: int64


  df_planets.fillna('Unknown', inplace=True)


  pl_hostname                                         text_chunk
0      11 Com  Host Name: 11 Com. Planet Letter: b. Discovery...
1      11 UMi  Host Name: 11 UMi. Planet Letter: b. Discovery...
2      14 And  Host Name: 14 And. Planet Letter: b. Discovery...


In [None]:
# Import libraries
import pandas as pd
import re

# Load SpaceXMission.csv from Google Drive
csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/SpaceXMission.csv'
df_spacex = pd.read_csv(csv_path)

# Quickly inspect missing values
print(df_spacex.isnull().sum())

# Fill missing values with placeholders ('Unknown').
# The filled frame is assigned back rather than filled in place: writing a
# string into numeric columns in place is the silent-upcast pattern pandas
# has deprecated (this cell's recorded output shows a warning raised from
# the in-place call), and it makes the cell harder to re-run safely.
df_spacex = df_spacex.fillna('Unknown')

# Normalise whitespace in a text field
def clean_text(text):
    """Return ``str(text)`` with every whitespace run collapsed to a single
    space and leading/trailing whitespace removed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Generate structured text chunks for each SpaceX mission
def generate_spacex_chunk(row):
    """Render one SpaceX mission row as a whitespace-normalised text chunk.

    Args:
        row: A mapping-like row (e.g. a pandas Series) providing the column
            names referenced in the template below.

    Returns:
        The chunk text after ``clean_text`` normalisation.
    """
    chunk = f"""
    Flight Number: {row['Flight Number']}.
    Launch Date: {row['Launch Date']} at {row['Launch Time']}.
    Launch Site: {row['Launch Site']}.
    Vehicle Type: {row['Vehicle Type']}.
    Payload Name: {row['Payload Name']} ({row['Payload Type']}), weighing {row['Payload Mass (kg)']} kg.
    Orbit: {row['Payload Orbit']}.
    Customer: {row['Customer Name']} ({row['Customer Type']}, {row['Customer Country']}).
    Mission Outcome: {row['Mission Outcome']}.
    Failure Reason: {row['Failure Reason']}.
    Landing Type: {row['Landing Type']}, Landing Outcome: {row['Landing Outcome']}.
    """
    return clean_text(chunk)


# Build the whole column in one assignment instead of writing cell-by-cell
# with .at while iterating over the frame (same pattern as the other
# chunk-building cells that use a function plus DataFrame.apply).
df_spacex['text_chunk'] = df_spacex.apply(generate_spacex_chunk, axis=1)

# Save cleaned dataset as CSV
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/SpaceXMission_Cleaned.csv'
df_spacex.to_csv(output_csv_path, index=False)

# Verify by showing the first few text chunks
print(df_spacex[['Flight Number', 'text_chunk']].head(3))


Flight Number         0
Launch Date           0
Launch Time           0
Launch Site           0
Vehicle Type          0
Payload Name          0
Payload Type          3
Payload Mass (kg)     8
Payload Orbit         5
Customer Name         2
Customer Type         2
Customer Country      2
Mission Outcome       0
Failure Reason       33
Landing Type         20
Landing Outcome      20
dtype: int64
  Flight Number                                         text_chunk
0          F1-1  Flight Number: F1-1. Launch Date: 24 March 200...
1          F1-2  Flight Number: F1-2. Launch Date: 21 March 200...
2          F1-3  Flight Number: F1-3. Launch Date: 3 August 200...


  df_spacex.fillna('Unknown', inplace=True)


In [None]:
import pandas as pd
import ast
import re

# Read the Mars rover details CSV from Google Drive into a DataFrame
csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/NasaMarsRover/details.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Function to safely parse string dictionaries into Python dictionaries
def parse_dict(s):
    """Parse a stringified dict literal into a real ``dict``.

    Args:
        s: Text such as "{'id': 20, 'name': 'FHAZ'}". A value that is
            already a dict is returned unchanged, so re-applying the parser
            to an already-parsed column does not wipe it.

    Returns:
        The parsed dictionary, or an empty dict when ``s`` cannot be
        evaluated as a Python literal (including NaN/None) or when the
        literal is not a dict.
    """
    if isinstance(s, dict):
        return s
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Catch only the failures literal_eval raises for malformed input;
        # a bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return {}
    # Callers use .get() on the result, so anything that is not a dict
    # (e.g. a list or number literal) is treated as unparseable.
    return parsed if isinstance(parsed, dict) else {}

# Convert the stringified 'camera' and 'rover' columns into dictionaries
for dict_column in ('camera', 'rover'):
    df[dict_column] = df[dict_column].apply(parse_dict)

# Normalise whitespace in a text field
def clean_text(text):
    """Return ``str(text)`` with every whitespace run collapsed to a single
    space and leading/trailing whitespace removed."""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

# Turn one rover-photo row into a single structured text chunk
def create_text_chunk(row):
    """Render one record as a run of labelled sentences.

    ``row`` must provide 'id', 'sol', 'earth_date', plus 'camera' and 'rover'
    values that support ``.get``. Keys missing from those two mappings are
    rendered as 'Unknown'. The assembled text is passed through
    ``clean_text`` before it is returned.
    """
    cam_info, rover_info = row['camera'], row['rover']
    sentences = [
        f"Image ID: {row['id']}.",
        f"Martian Sol (Day): {row['sol']}.",
        f"Earth Date: {row['earth_date']}.",
        f"Camera: {cam_info.get('full_name', 'Unknown')} (Camera ID: {cam_info.get('id', 'Unknown')}).",
        f"Rover: {rover_info.get('name', 'Unknown')}, Status: {rover_info.get('status', 'Unknown')}.",
        f"Rover Landing Date: {rover_info.get('landing_date', 'Unknown')}.",
        f"Rover Launch Date: {rover_info.get('launch_date', 'Unknown')}.",
        f"Total Photos Taken by Rover: {rover_info.get('total_photos', 'Unknown')}.",
    ]
    # Joining with one space yields the same text the multi-line template
    # produced once whitespace has been collapsed.
    return clean_text(" ".join(sentences))

# Build the 'text_chunk' column by rendering every row of the frame
df['text_chunk'] = df.apply(func=create_text_chunk, axis=1)

# Write the augmented frame (index omitted) to a new CSV file on Google Drive
output_csv_path = '/content/drive/MyDrive/Colab Notebooks/RAG-Chatbot/MarsRover_Cleaned.csv'
df.to_csv(path_or_buf=output_csv_path, index=False)

# Sanity check: show the first three generated chunks
print(df.loc[:, ['id', 'text_chunk']].head(3))


       id                                         text_chunk
0  102693  Image ID: 102693. Martian Sol (Day): 1000. Ear...
1  102694  Image ID: 102694. Martian Sol (Day): 1000. Ear...
2  102850  Image ID: 102850. Martian Sol (Day): 1000. Ear...


In [None]:
!pip install sentence-transformers pandas


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 