In [8]:
import pandas as pd

# 1. The cleaned landings file is only needed for its surviving ids,
#    so read just that one column.
cleaned = pd.read_csv("cleaned_meteorite_landings.csv", usecols=["meteorite_id"])

# 2. Raw coordinates come from the original export. The same mapping drives
#    both which columns are read and what they are renamed to.
column_renames = {
    "id": "meteorite_id",
    "reclat": "latitude",
    "reclong": "longitude",
}
raw_loc = pd.read_csv("Meteorite_Landings.csv", usecols=list(column_renames))
raw_loc = raw_loc.rename(columns=column_renames)

# 3. Drop every raw row whose meteorite_id is absent from the cleaned file.
survived_cleaning = raw_loc["meteorite_id"].isin(cleaned["meteorite_id"])
raw_loc = raw_loc.loc[survived_cleaning]

# 4. Helper to bucket region/country
def approx_region_country(lat, lon):
    """Bucket a coordinate pair into a coarse (region, country) label.

    The lookup is a fixed sequence of inclusive latitude/longitude bounding
    boxes plus two latitude-only polar checks. Rules are tried in order and
    the first match wins, so where boxes overlap the earlier one takes
    precedence. Each box carries one fixed country label; this is a rough
    approximation, not a real country lookup.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        A ``(region, country)`` tuple of strings, or ``("Other", "Unknown")``
        when no rule matches.
    """
    # (south, north, west, east, (region, country)) -- all bounds inclusive.
    boxes_before_poles = (
        (24.5, 49.0, -125.0, -66.9, ("North America", "USA")),
        (-33.0, 5.0, -74.0, -34.0, ("South America", "Brazil")),
        (35.0, 71.0, -10.0, 40.0, ("Europe", "Germany")),
        (-44.0, -10.0, 112.0, 154.0, ("Oceania", "Australia")),
        (20.0, 55.0, 60.0, 100.0, ("Asia", "China")),
        (-35.0, 37.0, -17.0, 51.0, ("Africa", "Egypt")),
    )
    boxes_after_poles = (
        (10.0, 20.0, -85.0, -60.0, ("Caribbean", "Caribbean Sea")),
        (-10.0, 10.0, 95.0, 141.0, ("Southeast Asia", "Indonesia")),
    )

    for south, north, west, east, label in boxes_before_poles:
        if south <= lat <= north and west <= lon <= east:
            return label

    # Polar caps depend on latitude only and are checked after the boxes
    # above, so e.g. northern Europe up to 71 degrees is still "Europe".
    if lat > 70:
        return "Arctic", "Arctic Region"
    if lat < -60:
        return "Antarctica", "Antarctica"

    for south, north, west, east, label in boxes_after_poles:
        if south <= lat <= north and west <= lon <= east:
            return label

    return "Other", "Unknown"

# 5. Build the new locations list
# The output columns are declared once so the CSV always gets this header,
# in this order, even if raw_loc turns out to be empty (a DataFrame built
# from an empty list otherwise has no columns and writes no header row).
LOCATION_COLUMNS = ["location_id", "latitude", "longitude", "region", "country"]

locs = []
for row in raw_loc.itertuples(index=False):
    lat, lon = row.latitude, row.longitude
    if pd.isnull(lat) or pd.isnull(lon):
        # Missing coordinates cannot be bucketed; label them explicitly
        # and keep the row so every surviving meteorite has a location.
        region, country = "Other", "Unknown"
    else:
        region, country = approx_region_country(lat, lon)
    locs.append({
        # The meteorite id doubles as the location id (one location per row).
        "location_id": row.meteorite_id,
        "latitude":    lat,
        "longitude":   lon,
        "region":      region,
        "country":     country
    })

# 6. Write it out
pd.DataFrame(locs, columns=LOCATION_COLUMNS).to_csv("locations.csv", index=False)
print(f"Regenerated locations.csv with {len(locs)} rows")


Regenerated locations.csv with 38114 rows


In [6]:
import pandas as pd

# Column order expected by the PG table:
#    scientist_id | email | institution_id | scientist_name
pg_column_order = ["scientist_id", "email", "institution_id", "scientist_name"]

# Read the scientists file and keep exactly those columns, in that order.
# A missing column raises KeyError instead of being silently skipped.
df = pd.read_csv("scientists.csv").loc[:, pg_column_order]

# Overwrite the source file in place with the reordered columns.
df.to_csv("scientists.csv", index=False)

print("Rewrote scientists.csv with columns:", df.columns.tolist())


Rewrote scientists.csv with columns: ['scientist_id', 'email', 'institution_id', 'scientist_name']


In [9]:
import pandas as pd

# Columns the PG table expects, in this exact order:
#    meteorite_id | class_id | location_id | meteorite_name | mass | fall_type | fall_year
pg_column_order = [
    "meteorite_id",
    "class_id",
    "location_id",
    "meteorite_name",
    "mass",
    "fall_type",
    "fall_year",
]

# Read the cleaned landings CSV and project it onto those columns.
# NOTE(review): if any id/year column contains missing values pandas will
# read it as float and write values like "1880.0" -- confirm the import
# target accepts that, or that the cleaned file has no gaps there.
df = pd.read_csv("cleaned_meteorite_landings.csv")
df_fixed = df.loc[:, pg_column_order]

# Write the import-ready file; the cleaned source CSV is left untouched.
df_fixed.to_csv("meteorites.csv", index=False)

print("Wrote meteorites.csv with columns:", df_fixed.columns.tolist())


Wrote meteorites.csv with columns: ['meteorite_id', 'class_id', 'location_id', 'meteorite_name', 'mass', 'fall_type', 'fall_year']


In [10]:
import pandas as pd

# Column order of the PG table this file is imported into.
pg_column_order = [
    "paper_id",
    "scientist_id",
    "title",
    "journal_name",
    "publication_date",
]

# Load the exported CSV and keep exactly those columns, in that order.
df = pd.read_csv("research_papers.csv").loc[:, pg_column_order]

# Overwrite the CSV in place so the import sees the columns in table order.
df.to_csv("research_papers.csv", index=False)

print("research_papers.csv rewritten with columns:", df.columns.tolist())


research_papers.csv rewritten with columns: ['paper_id', 'scientist_id', 'title', 'journal_name', 'publication_date']
