In [7]:
import pandas as pd  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore

In [8]:
df = pd.read_csv('./../processed_data/filtered_data_with_classes.csv')
print(df.head(1))

  day_id city_id  temperature_celsius  condition_text  wind_kph  wind_degree  \
0  D0003    C001              0.46875               0       0.0     0.016854   

   pressure_mb  precip_mm  humidity  cloud  feels_like_celsius  visibility_km  \
0     0.470588   0.229592       1.0   0.75            0.371257       0.308176   

   uv_index  gust_kph  air_quality_us-epa-index  sunrise_num  sunset_num  
0       0.0  0.187697                       0.0     0.302469    0.527638  


In [9]:
from sklearn.preprocessing import LabelEncoder  # type: ignore
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.metrics import accuracy_score  # type: ignore
from xgboost import XGBClassifier  # type: ignore
from sklearn.model_selection import train_test_split

In [10]:
# Encoding categorical variables (if 'condition_text' is categorical)
label_encoder = LabelEncoder()
df['condition_text'] = label_encoder.fit_transform(df['condition_text'])
df['city_id'] = label_encoder.fit_transform(df['city_id'])

# Separating features and target variable
X = df.drop(['condition_text', "day_id"], axis=1)  # Features
y = df['condition_text']  # Target

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)

In [16]:
model = XGBClassifier(
    eval_metric='mlogloss',
    learning_rate=0.01,
    max_depth=5,
    n_estimators=200,
    subsample=0.9,
    gamma=0,
    min_child_weight=1,
    reg_alpha=0.09,
    reg_lambda=0.85,
    colsample_bytree=0.8,
)

model.fit(X_train, y_train)

{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'eval_metric': 'mlogloss', 'gamma': 0, 'grow_policy': None, 'interaction_constraints': None, 'learning_rate': 0.01, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 5, 'max_leaves': None, 'min_child_weight': 1, 'monotone_constraints': None, 'multi_strategy': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': 0.09, 'reg_lambda': 0.85, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.9, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [14]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier(
    eval_metric='mlogloss',
    learning_rate=0.01,
    max_depth=5,
    n_estimators=200,
    subsample=0.9,
    gamma=0,
    min_child_weight=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
)

# the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used per tree
    'min_child_weight': [1, 2, 3], # Minimum sum of instance weight needed in a child
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0.08, 0.09, 0.15, 0.2, 0.25],  # L1 regularization term on weights
    'reg_lambda': [0.9, 0.85, 0.8, 0.75]  # L2 regularization term on weights
}

# Instantiate the GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


KeyboardInterrupt: 

In [17]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8125


In [8]:
df_input = pd.read_csv('./../processed_data/filtered_input.csv')
print(df_input.head(1))

  day_id city_id  temperature_celsius  wind_kph  wind_degree  pressure_mb  \
0  D0001    C001             0.617021  0.035411     0.582173     0.444444   

   precip_mm  humidity  cloud  feels_like_celsius  visibility_km  uv_index  \
0        0.0  0.510638   0.75            0.414634         0.3125  0.555556   

   gust_kph  air_quality_us-epa-index  sunrise_num  sunset_num  
0  0.149533                       0.2     0.311765    0.542714  


In [9]:
df_input_processed = df_input.drop(['day_id'], axis=1)
df_input_processed['city_id'] = label_encoder.fit_transform(df_input_processed['city_id'])

# Make predictions (assuming model is already trained and available)
# predictions = model.predict(df_input_processed)

In [10]:
# probabilities = model.predict_proba(df_input_processed)
# predicted_classes = np.argmax(probabilities, axis=1)

In [11]:
lower_bound = 0.6
upper_bound = 0.8
step = 0.05

# Generate confidence levels
confidence_levels = np.arange(lower_bound, upper_bound + step, step).tolist()
# confidence_levels = [95]

for confidence_threshold in confidence_levels:
    probabilities = model.predict_proba(df_input_processed)
    predicted_classes = np.argmax(probabilities, axis=1)
    # Filter records based on the current confidence threshold
    max_probabilities = np.max(probabilities, axis=1)
    sure_indices = np.where(max_probabilities >= confidence_threshold)[0]
    print("sure count", sure_indices.shape)
    sure_records_df = df_input_processed.iloc[sure_indices].copy()
    sure_labels = predicted_classes[sure_indices]
    sure_records_df['predicted_labels'] = sure_labels

    # Split the new dataset into training and testing subsets
    X_sure = sure_records_df.drop('predicted_labels', axis=1)
    y_sure = sure_records_df['predicted_labels']

    # Merge with previous training data
    X_added = pd.concat([X_train, X_sure], ignore_index=True)
    y_added = pd.concat([y_train, y_sure], ignore_index=True)

    # Retrain the model on this new training subset
    model_retrained = XGBClassifier(
        eval_metric='mlogloss',
        learning_rate=0.01,
        max_depth=5,
        n_estimators=200,
        subsample=0.9,
        gamma=0,
        min_child_weight=1,
        reg_alpha=0.09,
        reg_lambda=0.85,
        colsample_bytree=0.8,
    )
    model_retrained.fit(X_added, y_added)
    y_pred = model_retrained.predict(X_test)
    # Evaluate and print accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Confidence Threshold: {confidence_threshold: .2f}, Accuracy: {accuracy:.4f}")

sure count (1471,)
Confidence Threshold:  0.60, Accuracy: 0.8125
sure count (1204,)
Confidence Threshold:  0.65, Accuracy: 0.8125
sure count (828,)
Confidence Threshold:  0.70, Accuracy: 0.8125
sure count (470,)
Confidence Threshold:  0.75, Accuracy: 0.8125
sure count (0,)
Confidence Threshold:  0.80, Accuracy: 0.8125
sure count (0,)
Confidence Threshold:  0.85, Accuracy: 0.8125


In [12]:
confidence_threshold = 0.65
probabilities = model.predict_proba(df_input_processed)
predicted_classes = np.argmax(probabilities, axis=1)
max_probabilities = np.max(probabilities, axis=1)

sure_indices = np.where(max_probabilities >= confidence_threshold)[0]
print("sure count", sure_indices.shape)
sure_records_df = df_input_processed.iloc[sure_indices].copy()
sure_labels = predicted_classes[sure_indices]
sure_records_df['predicted_labels'] = sure_labels

# Split the new dataset into training and testing subsets
X_sure = sure_records_df.drop('predicted_labels', axis=1)
y_sure = sure_records_df['predicted_labels']

# Merge with previous training data
X_added = pd.concat([X, X_sure], ignore_index=True)
y_added = pd.concat([y, y_sure], ignore_index=True)

# Retrain the model on this new training subset
model_retrained = XGBClassifier(
    eval_metric='mlogloss',
    learning_rate=0.01,
    max_depth=5,
    n_estimators=200,
    subsample=0.9,
    gamma=0,
    min_child_weight=1,
    reg_alpha=0.09,
    reg_lambda=0.85,
    colsample_bytree=0.8,
)
model_retrained.fit(X_added, y_added)
y_pred = model_retrained.predict(X_test)

# Evaluate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Confidence Threshold: {confidence_threshold: .2f}, Accuracy: {accuracy:.4f}")

sure count (1204,)
Confidence Threshold:  0.65, Accuracy: 0.9167


In [13]:
y_pred = model_retrained.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9167


In [14]:
print(sure_records_df.count())
print(df_input_processed.count())

city_id                     1204
temperature_celsius         1204
wind_kph                    1204
wind_degree                 1204
pressure_mb                 1204
precip_mm                   1204
humidity                    1204
cloud                       1204
feels_like_celsius          1204
visibility_km               1204
uv_index                    1204
gust_kph                    1204
air_quality_us-epa-index    1204
sunrise_num                 1204
sunset_num                  1204
predicted_labels            1204
dtype: int64
city_id                     2414
temperature_celsius         2414
wind_kph                    2414
wind_degree                 2414
pressure_mb                 2414
precip_mm                   2414
humidity                    2414
cloud                       2414
feels_like_celsius          2414
visibility_km               2414
uv_index                    2414
gust_kph                    2414
air_quality_us-epa-index    2414
sunrise_num                 24

In [15]:
import json
with open('./../unique_dict.json', 'r') as file:
    prediction_encoder = json.load(file)

print(prediction_encoder)
prediction_decoder = {v: k for k, v in prediction_encoder.items()}
print(prediction_decoder)

{'Light Rain with Thunder': 0, 'Clear and Sunny': 1, 'Partly Cloudy': 2, 'Light Precipitation': 3, 'Cloudy and Overcast': 4, 'Mist or Fog': 5, 'Rain Showers': 6, 'Moderate to Heavy Rain': 7, 'Thunderstorms': 8}
{0: 'Light Rain with Thunder', 1: 'Clear and Sunny', 2: 'Partly Cloudy', 3: 'Light Precipitation', 4: 'Cloudy and Overcast', 5: 'Mist or Fog', 6: 'Rain Showers', 7: 'Moderate to Heavy Rain', 8: 'Thunderstorms'}


In [16]:
predictions = model_retrained.predict(df_input_processed)

In [17]:
predictions[:10]

array([2, 2, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int64)

In [18]:
# Decode predictions (assuming you have a dict named 'prediction_decoder' for this purpose)
decoded_predictions = [prediction_decoder[pred] for pred in predictions]

# Add decoded predictions back to df_input
df_input['predictions'] = decoded_predictions
daily_data = pd.read_csv('./../data/daily_data.csv')
daily_data['condition_flag'] = daily_data['condition_text'].isna().astype(int)

temp_df = df_input[['city_id', 'day_id', 'predictions']].copy()

result = pd.merge(daily_data, temp_df, on=['city_id', 'day_id'], how='left')

result['condition_text'] = result['condition_text'].fillna(result['predictions'])

result.drop('predictions', axis=1, inplace=True)

In [19]:
result.to_csv('./../output/predictions_with_data.csv', index=False)

In [20]:
result.to_csv('./../output/predictions_with_data.csv', index=False)
result[["day_id", "condition_text"]].to_csv('./../output/submit.csv', index=False)


In [21]:
print(result["condition_text"].value_counts())

condition_text
Partly Cloudy              1241
Clear and Sunny            1217
Light Precipitation         167
Mist or Fog                 102
Cloudy and Overcast          94
Rain Showers                 24
Thunderstorms                17
Light Rain with Thunder      16
Moderate to Heavy Rain       15
Name: count, dtype: int64
