In [1]:
import pandas as pd
from library.utilities import check_missing

# Relative location of the REMS Mars dataset CSV
file_path = '../Datasets/REMS_Mars_Dataset.csv'

# Read the CSV into a DataFrame
mars_df = pd.read_csv(file_path)

# Sanity check: preview the first five rows of the loaded data
print(mars_df.head(n=5))

         earth_date_time           mars_date_time sol_number  \
0  Earth, 2022-01-26 UTC  Mars, Month 6 - LS 163°   Sol 3368   
1  Earth, 2022-01-25 UTC  Mars, Month 6 - LS 163°   Sol 3367   
2  Earth, 2022-01-24 UTC  Mars, Month 6 - LS 162°   Sol 3366   
3  Earth, 2022-01-23 UTC  Mars, Month 6 - LS 162°   Sol 3365   
4  Earth, 2022-01-22 UTC  Mars, Month 6 - LS 161°   Sol 3364   

  max_ground_temp(°C) min_ground_temp(°C) max_air_temp(°C) min_air_temp(°C)  \
0                  -3                 -71               10              -84   
1                  -3                 -72               10              -87   
2                  -4                 -70                8              -81   
3                  -6                 -70                9              -91   
4                  -7                 -71                8              -92   

  mean_pressure(Pa)      wind_speed(m/h)          humidity(%) sunrise sunset  \
0               707  Value not available  Value not availabl

In [2]:
# Summary statistics for every column. All columns are loaded as object dtype,
# so this reports count / unique / top / freq rather than numeric aggregates.
mars_df.describe()

Unnamed: 0,earth_date_time,mars_date_time,sol_number,max_ground_temp(°C),min_ground_temp(°C),max_air_temp(°C),min_air_temp(°C),mean_pressure(Pa),wind_speed(m/h),humidity(%),sunrise,sunset,UV_Radiation,weather
count,3197,3197,3197,3197,3197,3197,3197,3197,3197,3197,3197,3197,3170,3197
unique,3197,360,3197,52,40,51,78,224,1,1,90,96,4,2
top,"Earth, 2022-01-26 UTC","Mars, Month 3 - LS 86°",Sol 3368,-6,-79,14,-78,873,Value not available,Value not available,05:18,17:20,moderate,Sunny
freq,1,12,1,130,221,154,165,47,3197,3197,170,199,1450,3194


In [3]:
# Print each column's non-null count and dtype to spot columns that were not
# parsed as numbers or dates.
mars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3197 entries, 0 to 3196
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   earth_date_time      3197 non-null   object
 1   mars_date_time       3197 non-null   object
 2   sol_number           3197 non-null   object
 3   max_ground_temp(°C)  3197 non-null   object
 4   min_ground_temp(°C)  3197 non-null   object
 5   max_air_temp(°C)     3197 non-null   object
 6   min_air_temp(°C)     3197 non-null   object
 7   mean_pressure(Pa)    3197 non-null   object
 8   wind_speed(m/h)      3197 non-null   object
 9   humidity(%)          3197 non-null   object
 10  sunrise              3197 non-null   object
 11  sunset               3197 non-null   object
 12  UV_Radiation         3170 non-null   object
 13  weather              3197 non-null   object
dtypes: object(14)
memory usage: 349.8+ KB


In [4]:
# Per-column NaN tally: absolute count and percentage of rows, largest share first
missing = (
    pd.DataFrame({
        'Count': mars_df.isnull().sum(),
        '%': 100 * mars_df.isnull().mean(),
    })
    .sort_values(by='%', ascending=False)
)
print(missing)

                     Count         %
UV_Radiation            27  0.844542
earth_date_time          0  0.000000
mars_date_time           0  0.000000
sol_number               0  0.000000
max_ground_temp(°C)      0  0.000000
min_ground_temp(°C)      0  0.000000
max_air_temp(°C)         0  0.000000
min_air_temp(°C)         0  0.000000
mean_pressure(Pa)        0  0.000000
wind_speed(m/h)          0  0.000000
humidity(%)              0  0.000000
sunrise                  0  0.000000
sunset                   0  0.000000
weather                  0  0.000000


In [5]:
# Two kinds of gaps are tallied per column: real NaNs and cells holding the
# literal placeholder string "Value not available".
nan_mask = mars_df.isnull()
placeholder_mask = mars_df.eq("Value not available")

# One row per column, ordered from the most to the least affected
missing_or_unavailable = pd.DataFrame({
    'NaN Count': nan_mask.sum(),
    '"Value not available" Count': placeholder_mask.sum(),
    'Total % Missing or Unavailable': 100 * (nan_mask.mean() + placeholder_mask.mean()),
}).sort_values(by='Total % Missing or Unavailable', ascending=False)

# Show the summary table
print(missing_or_unavailable)


                     NaN Count  "Value not available" Count  \
wind_speed(m/h)              0                         3197   
humidity(%)                  0                         3197   
max_air_temp(°C)             0                           29   
min_air_temp(°C)             0                           29   
max_ground_temp(°C)          0                           28   
min_ground_temp(°C)          0                           28   
mean_pressure(Pa)            0                           27   
UV_Radiation                27                            0   
weather                      0                            3   
earth_date_time              0                            0   
mars_date_time               0                            0   
sol_number                   0                            0   
sunrise                      0                            0   
sunset                       0                            0   

                     Total % Missing or Unavailable  


In [6]:
# Count the distinct values in every column of the DataFrame.
# nunique() is called on the whole frame, so the result is a Series with one
# entry per column; the label says so and starts the table on its own line.
unique_values_count = mars_df.nunique()
print(f"Number of unique values per column:\n{unique_values_count}")


Number of unique values in 'column_name': earth_date_time        3197
mars_date_time          360
sol_number             3197
max_ground_temp(°C)      52
min_ground_temp(°C)      40
max_air_temp(°C)         51
min_air_temp(°C)         78
mean_pressure(Pa)       224
wind_speed(m/h)           1
humidity(%)               1
sunrise                  90
sunset                   96
UV_Radiation              4
weather                   2
dtype: int64


In [7]:
# List the distinct values of the columns that have only a handful of them.
# Maps each printed label to the column it describes.
readings_by_label = {
    "Unique UV Radiation Readings: ": 'UV_Radiation',
    "Unique Weather Readings: ": 'weather',
    "Unique Wind Speed Readings: ": 'wind_speed(m/h)',
    "Unique Humidity Readings: ": 'humidity(%)',
}
for label, column in readings_by_label.items():
    print(label, mars_df[column].unique())

Unique UV Radiation Readings:  ['moderate' 'low' 'high' 'very_high' nan]
Unique Weather Readings:  ['Sunny' 'Value not available']
Unique Wind Speed Readings:  ['Value not available']
Unique Humidity Readings:  ['Value not available']


In [8]:
# wind_speed(m/h) and humidity(%) contain only "Value not available", and weather
# is 'Sunny' apart from that same placeholder, so none of the three is usable.
columns_to_drop = ['wind_speed(m/h)', 'humidity(%)', 'weather']

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError.
mars_df = mars_df.drop(columns=columns_to_drop, errors='ignore')

In [9]:
# Preview the frame after the column drop. A bare `mars_df.head()` here would be
# discarded because it is not the last expression in the cell, so print it.
print(mars_df.head())

# Re-tally NaN and "Value not available" counts for the remaining columns
print(check_missing(mars_df))

                     NaN Count  "Value not available" Count  \
max_air_temp(°C)             0                           29   
min_air_temp(°C)             0                           29   
max_ground_temp(°C)          0                           28   
min_ground_temp(°C)          0                           28   
mean_pressure(Pa)            0                           27   
UV_Radiation                27                            0   
earth_date_time              0                            0   
mars_date_time               0                            0   
sol_number                   0                            0   
sunrise                      0                            0   
sunset                       0                            0   

                     Total % Missing or Unavailable  
max_air_temp(°C)                           0.907100  
min_air_temp(°C)                           0.907100  
max_ground_temp(°C)                        0.875821  
min_ground_temp(°C)        

In [None]:
# Reformat sol_number and set as index