In [None]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# The date column holds month/day only, so pandas is given the exact format
date_format = '%m/%d'

# Read the training CSV into a dataframe:
#   - the 'DATE (MM/DD)' column becomes the index and is parsed with date_format
#   - dtype override for the column keyed as 1
#     (presumably the second column by position -- confirm)
df = pd.read_csv(
    '/kaggle/input/cloud-coverage-detection/train.csv',
    index_col='DATE (MM/DD)',
    parse_dates=True,
    date_format=date_format,
    dtype={1: 'str'},
)

# preview the first rows
df.head()

In [None]:
# Line chart of the raw cloud cover % against the dataframe index (dates);
# the series still has missing values at this point
df['Total Cloud Cover [%]'].plot.line()

In [None]:
# Tally the NaN entries of every column and print the per-column counts
missing_counts = df.isna().sum(axis=0)
print(missing_counts)

In [None]:
# Columns selected as input features (no imputation happens in this cell)
cols_to_use = [
    'Global CMP22 (vent/cor) [W/m^2]',
    'Direct sNIP [W/m^2]',
    'Tower Dry Bulb Temp [deg C]',
    'Tower Wet Bulb Temp [deg C]',
    'Tower Dew Point Temp [deg C]',
    'Tower RH [%]',
    'Peak Wind Speed @ 6ft [m/s]',
    'Avg Wind Direction @ 6ft [deg from N]',
    'Station Pressure [mBar]',
    'Precipitation (Accumulated) [mm]',
    'Snow Depth [cm]',
    'Albedo (CMP11)',
]

# Target series (total cloud cover %) and the matching feature frame
y = df['Total Cloud Cover [%]']
X = df[cols_to_use]

In [None]:
# Positional (unshuffled) train/test split: the leading rows form the training
# set and the rows that follow form the test set.
# The boundaries are fixed row positions, named once here instead of being
# repeated in every slice.
# NOTE(review): 398593 is ~80% of 498241; presumably 498241 == len(df) -- confirm,
# since any rows beyond TEST_END would be left out of both splits.
TRAIN_END = 398593
TEST_END = 498241

X_train, X_test = X.iloc[:TRAIN_END], X.iloc[TRAIN_END:TEST_END]
y_train, y_test = y.iloc[:TRAIN_END], y.iloc[TRAIN_END:TEST_END]

In [None]:
# Median of the target in each split (Series.median skips NaN by default)
median_y_train, median_y_test = (split.median() for split in (y_train, y_test))

# Fill the missing targets of each split with that split's own median.
# NOTE(review): the test targets are filled from the test split's median rather
# than the training median -- confirm this is intended.
y_train_imputed = y_train.fillna(value=median_y_train)
y_test_imputed = y_test.fillna(value=median_y_test)

In [None]:
# imputer used for the input features
from sklearn.impute import SimpleImputer

# Median imputation of the inputs: the imputer is fitted on the training
# features only, and the same fitted imputer is then applied to the test features
imputer = SimpleImputer(strategy='median')

# The imputer output carries no column names, so the frames are rebuilt with
# the labels of the corresponding input frame.
# NOTE(review): the original row index is not passed on, so both frames get a
# default integer index instead of the date index.
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [None]:
# Plot the training cloud cover % against the date index after imputation.
# The median-imputed series is plotted here; the raw y_train still contains
# the missing values and would just repeat the pre-imputation picture.
# Series.plot draws against the series' own index, so no x argument is passed.
y_train_imputed.plot(kind='line')

In [None]:
# detail extraction (https://www.statsmodels.org/stable/generated/statsmodels.tsa.seasonal.seasonal_decompose.html)
from statsmodels.tsa.seasonal import seasonal_decompose