In [1]:
# imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Read the training CSV. The date and time-of-day columns are forced to
# string dtype so pandas does not try to infer a type for them.
dataset = pd.read_csv(
    '/kaggle/input/cloud-coverage-detection/train.csv',
    dtype=dict.fromkeys(('DATE (MM/DD)', 'MST'), 'str'),
)
# Preview the first few rows as the cell output.
dataset.head()

Unnamed: 0,DATE (MM/DD),MST,Global CMP22 (vent/cor) [W/m^2],Direct sNIP [W/m^2],Azimuth Angle [degrees],Tower Dry Bulb Temp [deg C],Tower Wet Bulb Temp [deg C],Tower Dew Point Temp [deg C],Tower RH [%],Total Cloud Cover [%],Peak Wind Speed @ 6ft [m/s],Avg Wind Direction @ 6ft [deg from N],Station Pressure [mBar],Precipitation (Accumulated) [mm],Snow Depth [cm],Moisture,Albedo (CMP11)
0,01-Jan,00:00,-0.962276,0.0,356.8564,7.216,0.988,-7.312,32.33,0.0,9.95,271.3,806.779,0.0,0.219,0.0,0.0
1,01-Jan,00:01,-0.937921,0.0,357.65505,7.251,1.04,-7.26,32.4,0.0,8.2,272.9,806.84,0.0,0.206,0.0,0.0
2,01-Jan,00:02,-0.944395,0.0,358.45438,7.256,1.093,-7.207,32.54,0.0,6.7,288.8,806.876,0.0,0.148,0.0,0.0
3,01-Jan,00:03,-0.95135,-0.029673,359.25416,7.254,1.06,-7.44,31.89,0.0,7.7,294.0,806.823,0.0,0.235,0.0,0.0
4,01-Jan,00:04,-0.934976,-0.054401,0.05415,7.331,1.081,-7.419,31.78,0.0,7.2,285.5,806.762,0.0,0.182,0.0,0.0


In [3]:
# Count the NaN entries per column (isna is the same check as isnull).
missing_counts = dataset.isna().sum()
# Plain-text dump of the per-column counts.
print(missing_counts)

DATE (MM/DD)                             44640
MST                                      44640
Global CMP22 (vent/cor) [W/m^2]          44640
Direct sNIP [W/m^2]                      44640
Azimuth Angle [degrees]                  44640
Tower Dry Bulb Temp [deg C]              44640
Tower Wet Bulb Temp [deg C]              44640
Tower Dew Point Temp [deg C]             44640
Tower RH [%]                             44640
Total Cloud Cover [%]                    46109
Peak Wind Speed @ 6ft [m/s]              44640
Avg Wind Direction @ 6ft [deg from N]    44640
Station Pressure [mBar]                  44640
Precipitation (Accumulated) [mm]         44640
Snow Depth [cm]                          44640
Moisture                                 44640
Albedo (CMP11)                           44640
dtype: int64


In [4]:
# defining X, y and the train/validation split
cols_to_use = ['Global CMP22 (vent/cor) [W/m^2]', 'Direct sNIP [W/m^2]', 'Tower Dry Bulb Temp [deg C]', 
               'Tower Wet Bulb Temp [deg C]', 'Tower Dew Point Temp [deg C]', 'Tower RH [%]',
               'Peak Wind Speed @ 6ft [m/s]', 'Avg Wind Direction @ 6ft [deg from N]', 'Station Pressure [mBar]',
               'Precipitation (Accumulated) [mm]', 'Snow Depth [cm]', 'Albedo (CMP11)']

# A row without a recorded target cannot supervise a regressor. Filling the
# target with its median (the previous approach) fabricates labels, and the
# per-column null counts suggest many of those rows are empty in every feature
# column as well, so the model would be fitted and scored on made-up samples.
# It also computed the median over rows that later land in the validation
# split. Dropping the unlabelled rows avoids both problems.
labelled_rows = dataset.dropna(subset=['Total Cloud Cover [%]'])

X = labelled_rows[cols_to_use]
y = labelled_rows['Total Cloud Cover [%]']

# Kept so that any later cell referring to these names still works; with the
# unlabelled rows removed, y contains no NaN and the fill is a no-op.
median_y = y.median()
y_imputed = y.fillna(median_y)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y_imputed, train_size=0.8, test_size=0.2, random_state=50)

In [5]:
# data processing
from sklearn.impute import SimpleImputer

# Median-impute missing feature values. The imputer is fitted on the training
# split only and then applied unchanged to the validation split, so the
# validation rows do not influence the fill values.
imputer = SimpleImputer(strategy='median')

# The imputer hands back plain arrays, which lose both the column names and
# the row index. Restore both: keeping the original index means the feature
# frames stay aligned with y_train / y_valid, which still carry the shuffled
# index produced by train_test_split. A fresh 0..n-1 index would silently
# mispair rows in any later index-aligned operation (concat, join, arithmetic).
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)

In [None]:
# Fit a gradient-boosted tree regressor (XGBoost) on the imputed training features
from xgboost import XGBRegressor

# Constructed with the library's default hyperparameters; nothing is tuned here.
my_model = XGBRegressor()
my_model.fit(X=imputed_X_train, y=y_train)