# Analisi dei Meteoriti

## import delle librerie

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## caricamento del dataset

In [2]:
# Caricare il dataset
df = pd.read_csv("Meteorite_Landings.csv")
print(f"Dataset caricato: {df.shape[0]} righe, {df.shape[1]} colonne.")
print("Colonne presenti:", list(df.columns))

Dataset caricato: 45716 righe, 10 colonne.
Colonne presenti: ['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year', 'reclat', 'reclong', 'GeoLocation']


In [3]:
# Panoramica dei dati
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


Unnamed: 0,id,mass (g),year,reclat,reclong
count,45716.0,45585.0,45425.0,38401.0,38401.0
mean,26889.735104,13278.08,1991.828817,-39.12258,61.074319
std,16860.68303,574988.9,25.052766,46.378511,80.647298
min,1.0,0.0,860.0,-87.36667,-165.43333
25%,12688.75,7.2,1987.0,-76.71424,0.0
50%,24261.5,32.6,1998.0,-71.5,35.66667
75%,40656.75,202.6,2003.0,0.0,157.16667
max,57458.0,60000000.0,2101.0,81.16667,354.47333


## Cleaning

### Gestione valori mancanti

In [4]:
missing_pct = df.isnull().sum() / len(df) * 100
cols_to_drop = missing_pct[missing_pct > 50].index.tolist()

if cols_to_drop:
    print(f"\nRimozione colonne con >50% missing: {cols_to_drop}")
    df.drop(columns=cols_to_drop, inplace=True)

# Mostra colonne con almeno un missing
missing_per_col = df.isnull().sum()
missing_per_col = missing_per_col[missing_per_col > 0]
print("\nValori mancanti prima dell'imputazione:")
print(missing_per_col)


Valori mancanti prima dell'imputazione:
mass (g)        131
year            291
reclat         7315
reclong        7315
GeoLocation    7315
dtype: int64


### Imputazione valori mancanti

In [5]:
# Numeri → media
for col in df.select_dtypes(include=[np.number]).columns:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)

# Oggetti → moda (o 'Unknown' per le categorie)
for col in df.select_dtypes(include=['object']).columns:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0] if not df[col].mode().empty else 'Unknown'
        df[col].fillna(mode_val, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_val, inplace=True)


### Rimozione duplicati

In [6]:
### Rimozione duplicati
# Utilizzo di 'name' e 'mass (g)' come subset logico, ma 'id' dovrebbe essere l'unico identificatore
df.drop_duplicates(subset=['name', 'mass (g)'], keep='first', inplace=True)
print(f"\nRighe dopo rimozione duplicati (su nome e massa): {df.shape[0]}")


Righe dopo rimozione duplicati (su nome e massa): 45716


### gestione degli outlier

In [7]:
### gestione degli outlier (basata su IQR)
col_num = df.select_dtypes(include=[np.number]).columns
print("\nGestione Outlier (IQR) per colonne numeriche:")
for col in col_num:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
    # Clipa solo per le colonne non geografiche per mantenere l'integrità delle coordinate
    if col not in ['reclat', 'reclong']:
        df[col] = df[col].clip(lower, upper)
        print(f" - {col} clippato tra {lower:.2f} e {upper:.2f}")

# Pulizia specifica: Massa non può essere negativa
if 'mass (g)' in df.columns:
    df['mass (g)'] = df['mass (g)'].clip(lower=0, upper=df['mass (g)'].max())


Gestione Outlier (IQR) per colonne numeriche:
 - id clippato tra -29263.25 e 82608.75
 - mass (g) clippato tra -291.00 e 504.20
 - year clippato tra 1963.00 e 2027.00


### standardizzazione stringhe e gestione categorie

In [8]:
### standardizzazione stringhe e gestione categorie
print("\nStandardizzazione stringhe e gestione categorie...")
for col in df.select_dtypes(include=['object']).columns:
    # Handle NaN, strip, title case, fill NaN again
    df[col] = df[col].astype(str).str.strip().replace(['', 'N/A', 'null', 'nan'], np.nan)
    df[col] = df[col].str.title()
    df[col].fillna('Unknown', inplace=True)
    
    # Convert to category if cardinality is low (e.g., < 50% unique values)
    if df[col].nunique() / len(df[col]) < 0.5:
        df[col] = df[col].astype('category')
        print(f" - Colonna '{col}' convertita a 'category'.")


Standardizzazione stringhe e gestione categorie...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


 - Colonna 'nametype' convertita a 'category'.
 - Colonna 'recclass' convertita a 'category'.
 - Colonna 'fall' convertita a 'category'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

 - Colonna 'GeoLocation' convertita a 'category'.


In [9]:
# Gestione della colonna 'year'
if 'year' in df.columns:
    # 'year' spesso ha un formato di timestamp (es. '2003-01-01T00:00:00.000')
    df['year'] = pd.to_datetime(df['year'], errors='coerce')
    
    # Handle future dates (set to NaT)
    current_year = pd.Timestamp.now().year
    df['year'] = df['year'].apply(lambda x: x if pd.notnull(x) and x.year <= current_year else pd.NaT)

    # Extract the year, fill missing values with the median year
    median_year = df['year'].dt.year.median()
    df['discovery_year'] = df['year'].dt.year.fillna(median_year).astype('uint16')
    df.drop('year', axis=1, inplace=True)
    print(f" - Colonna 'year' pulita e convertita in 'discovery_year' (uint16).")

 - Colonna 'year' pulita e convertita in 'discovery_year' (uint16).


In [10]:
# Coerenza tipi di dato (ottimizzazione generale)
print("\nOttimizzazione tipi di dato...")
# Interi
for col in df.select_dtypes(include=['int64', 'int32']).columns:
    col_min, col_max = df[col].min(), df[col].max()
    if col_min >= 0:
        if col_max <= 255:
            df[col] = df[col].astype('uint8')
        elif col_max <= 65535:
            df[col] = df[col].astype('uint16')
        else:
            df[col] = df[col].astype('uint32')
            
# Float
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype('float32')
    
print("Ottimizzazione completata.")

# Conversione esplicita di 'fall' e 'recclass' a category (se non già fatto)
if 'fall' in df.columns:
    df['fall'] = df['fall'].astype('category')
if 'recclass' in df.columns:
    df['recclass'] = df['recclass'].astype('category')


Ottimizzazione tipi di dato...
Ottimizzazione completata.


## Salvataggio dataset pulito

In [11]:
## Salvataggio dataset pulito
out_file = 'meteorite_clean.csv'
df.to_csv(out_file, index=False)
print(f"\nDataset pulito salvato in '{out_file}'")


Dataset pulito salvato in 'meteorite_clean.csv'


In [12]:
# report statistico
print("\n--- TIPI DATI FINALI ---")
print(df.dtypes)
print(f"Righe finali: {df.shape[0]}")


--- TIPI DATI FINALI ---
name                object
id                  uint16
nametype          category
recclass          category
mass (g)           float32
fall              category
reclat             float32
reclong            float32
GeoLocation       category
discovery_year      uint16
dtype: object
Righe finali: 45716


In [13]:
print("\n--- STATISTICHE DESCRITTIVE (Chiave) ---")
stats_cols = ['mass (g)', 'reclat', 'reclong', 'discovery_year']
stats_cols = [c for c in stats_cols if c in df.columns]
print(df[stats_cols].describe().T)


--- STATISTICHE DESCRITTIVE (Chiave) ---
                  count         mean         std          min          25%  \
mass (g)        45716.0   136.964417  185.121796     0.000000     7.200000   
reclat          45716.0   -39.122581   42.506275   -87.366669   -76.266670   
reclong         45716.0    61.074318   73.913895  -165.433334    13.256125   
discovery_year  45716.0  1970.000000    0.000000  1970.000000  1970.000000   

                        50%      75%          max  
mass (g)          32.985001   206.00   504.200012  
reclat           -39.122581     0.00    81.166672  
reclong           58.411564   155.75   354.473328  
discovery_year  1970.000000  1970.00  1970.000000  
