In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'daily_data.csv'
data = pd.read_csv(file_path)

In [2]:
data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM


In [3]:
data.isnull().sum()

day_id                         0
city_id                        0
temperature_celsius            0
condition_text              2414
wind_kph                       0
wind_degree                    0
pressure_mb                    0
precip_mm                      0
humidity                       0
cloud                          0
feels_like_celsius             0
visibility_km                  0
uv_index                       0
gust_kph                       0
air_quality_us-epa-index       0
sunrise                        0
sunset                         0
dtype: int64

In [4]:
data['sunrise'] = pd.to_datetime(data['sunrise'], format='%I:%M %p')
data['sunset'] = pd.to_datetime(data['sunset'], format='%I:%M %p')
data['sunrise_month'] = data['sunrise'].dt.month
data['sunrise_day'] = data['sunrise'].dt.day
data['sunset_month'] = data['sunset'].dt.month
data['sunset_day'] = data['sunset'].dt.day

# Calculate day length in hours
data['day_length'] = (data['sunset'] - data['sunrise']).dt.seconds / 3600.0

In [5]:
data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,...,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset,sunrise_month,sunrise_day,sunset_month,sunset_day,day_length
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,...,6.0,11.9,2,1900-01-01 06:04:00,1900-01-01 19:19:00,1,1,1,1,13.25
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,...,1.0,23.4,1,1900-01-01 06:05:00,1900-01-01 19:18:00,1,1,1,1,13.216667
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,...,1.0,12.6,1,1900-01-01 06:05:00,1900-01-01 19:18:00,1,1,1,1,13.216667
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,...,1.0,11.2,1,1900-01-01 06:06:00,1900-01-01 19:16:00,1,1,1,1,13.166667
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,...,1.0,9.0,1,1900-01-01 06:07:00,1900-01-01 19:15:00,1,1,1,1,13.133333


In [6]:
label_encoder_city = LabelEncoder()
data['city_id_encoded'] = label_encoder_city.fit_transform(data['city_id'])

# Recalculate features list with new features
features = [
    'temperature_celsius', 'wind_kph', 'wind_degree', 'pressure_mb', 
    'precip_mm', 'humidity', 'cloud', 'feels_like_celsius', 'visibility_km', 
    'uv_index', 'gust_kph', 'air_quality_us-epa-index', 
    'sunrise_month', 'sunrise_day', 'sunset_month', 'sunset_day', 'day_length',
    'city_id_encoded'
]

In [7]:
data_with_condition = data.dropna(subset=['condition_text'])
data_missing_condition = data[data['condition_text'].isna()]

# Encode the condition_text labels
label_encoder_condition = LabelEncoder()
data_with_condition['condition_text_encoded'] = label_encoder_condition.fit_transform(data_with_condition['condition_text'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_condition['condition_text_encoded'] = label_encoder_condition.fit_transform(data_with_condition['condition_text'])


In [8]:
data_with_condition.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,...,air_quality_us-epa-index,sunrise,sunset,sunrise_month,sunrise_day,sunset_month,sunset_day,day_length,city_id_encoded,condition_text_encoded
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,...,1,1900-01-01 06:05:00,1900-01-01 19:18:00,1,1,1,1,13.216667,0,3
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,...,1,1900-01-01 06:06:00,1900-01-01 19:16:00,1,1,1,1,13.166667,0,0
6,D0007,C001,21.0,Partly Cloudy,4.0,310,1015.0,0.0,100,50,...,2,1900-01-01 06:08:00,1900-01-01 19:11:00,1,1,1,1,13.05,0,6
18,D0019,C001,19.0,Clear and Sunny,3.6,64,1017.0,0.0,88,0,...,3,1900-01-01 06:20:00,1900-01-01 18:51:00,1,1,1,1,12.516667,0,0
27,D0028,C002,19.0,Partly Cloudy,3.6,83,1010.0,0.0,73,25,...,1,1900-01-01 06:17:00,1900-01-01 19:20:00,1,1,1,1,13.05,1,6


In [9]:
# Prepare training data
X = data_with_condition[features]
y = data_with_condition['condition_text_encoded']

In [10]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Create and train a new XGBoost model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(random_state=42))
])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 4, 5],
    'classifier__min_child_weight': [1, 2],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X, y)

# Get the best estimator
best_model = grid_search.best_estimator_

# Predict and evaluate the model
y_pred = best_model.predict(X)
print(classification_report(y, y_pred, target_names=label_encoder_condition.classes_))

Fitting 3 folds for each of 144 candidates, totalling 432 fits
                         precision    recall  f1-score   support

        Clear and Sunny       0.95      1.00      0.97       122
    Cloudy and Overcast       0.91      0.75      0.82        53
    Light Precipitation       0.86      0.74      0.79        68
Light Rain with Thunder       1.00      0.12      0.22        16
            Mist or Fog       0.98      0.82      0.89        50
 Moderate to Heavy Rain       1.00      0.58      0.74        12
          Partly Cloudy       0.66      0.98      0.79       122
           Rain Showers       1.00      0.52      0.69        21
          Thunderstorms       1.00      0.20      0.33        15

               accuracy                           0.83       479
              macro avg       0.93      0.64      0.69       479
           weighted avg       0.87      0.83      0.81       479



In [11]:
X_missing = data_missing_condition[features]
data_missing_condition['condition_text_encoded'] = best_model.predict(X_missing)
data_missing_condition['condition_text'] = label_encoder_condition.inverse_transform(data_missing_condition['condition_text_encoded'])

# Combine the datasets
data_combined = pd.concat([data_with_condition, data_missing_condition.drop(columns=['condition_text_encoded'])], ignore_index=True)
data_combined = data_combined.sort_values(by='day_id')
data_combined[['day_id', 'condition_text']].to_csv('submission_c_11.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_missing_condition['condition_text_encoded'] = best_model.predict(X_missing)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_missing_condition['condition_text'] = label_encoder_condition.inverse_transform(data_missing_condition['condition_text_encoded'])
