In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


class CityEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, drop='first'):
        self.drop = drop
        self.encoder = OneHotEncoder(drop=drop, sparse=False, handle_unknown='ignore')
        
    def fit(self, X, y=None):
        self.encoder.fit(X)
        return self
    
    def transform(self, X):
        return self.encoder.transform(X)
    
    def get_feature_names_out(self, input_features=None):
        return self.encoder.get_feature_names_out(input_features)


def Main(i):
    # Load the data
    df = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable-part-2/daily_data.csv')

    # Feature engineering
    df['sunrise_hour'] = pd.to_datetime(df['sunrise']).dt.hour + pd.to_datetime(df['sunrise']).dt.minute / 60
    df['sunset_hour'] = pd.to_datetime(df['sunset']).dt.hour + pd.to_datetime(df['sunset']).dt.minute / 60
    df['day_number'] = df['day_id'].str.extract('(\d+)').astype(int)
    df['day_of_week'] = df['day_number'] % 7
    df['season'] = pd.cut(df['day_number'] % 365, bins=[0, 90, 180, 270, 365], labels=['Winter', 'Spring', 'Summer', 'Fall'])

    # Interaction terms
    df['temp_humidity'] = df['temperature_celsius'] * df['humidity']
    df['wind_pressure'] = df['wind_kph'] * df['pressure_mb']
    
    #Features from Akindu
    df['temp_humidity_interaction'] = df['temperature_celsius'] - (0.4 * (1 - df['humidity'] / 100) * (df['temperature_celsius'] - 14))
    df['Rain'] = (df['cloud'] * 4 + (df['temperature_celsius'] - df['feels_like_celsius'] )* 3 + df['precip_mm'] * 4 - df['pressure_mb'] * 3 - df['visibility_km'] * 5 + df['air_quality_us-epa-index'] * 3)**2 * df['cloud'] 
    df['Air'] = df['precip_mm'] * df['air_quality_us-epa-index'] * 0.1
    df['windsp'] = df['wind_degree'] / df['air_quality_us-epa-index']
    df['Clear_Sky'] = df['visibility_km'] / ((df['uv_index'] *df['cloud'] * df['cloud'])+1)
    df['Not_Froggy'] = (df['gust_kph'] * df['visibility_km'] *df['wind_kph'])
    
    # Define a function to get the most frequent condition_text
    def most_frequent(x):
        if not x.dropna().empty:  # Check if there are non-NA values
            return x.mode()[0] if not x.mode().empty else np.nan
        else:
            return np.nan

    # Create the new column with the most frequent condition_text for each city_id
    df['most_frequent_condition_text_for_city'] = df.groupby('city_id')['condition_text'].transform(most_frequent)
    
    # Define a function to get the least frequent condition_text
    def least_frequent(x):
        if not x.dropna().empty:  # Check if there are non-NA values
            return x.value_counts().idxmin() if not x.value_counts().idxmin()=='nan' else np.nan
        else:
            return np.nan

    # Create the new column with the least frequent condition_text for each city_id
    df['least_frequent_condition_text_for_city'] = df.groupby('city_id')['condition_text'].transform(least_frequent)

    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Clear and Sunny',0)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Partly Cloudy',1)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Light Precipitation',2)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Cloudy and Overcast',3)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Mist or Fog',4)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Rain Showers',5)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Light Rain with Thunder',6)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Thunderstorms',7)
    df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Moderate to Heavy Rain',8)

    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Clear and Sunny',0)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Partly Cloudy',1)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Light Precipitation',2)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Cloudy and Overcast',3)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Mist or Fog',4)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Rain Showers',5)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Light Rain with Thunder',6)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Thunderstorms',7)
    df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Moderate to Heavy Rain',8)
    
     # Prepare features and target
    numerical_features = ['temperature_celsius', 'wind_kph', 'wind_degree', 'pressure_mb', 
                          'precip_mm', 'humidity', 'cloud', 'feels_like_celsius', 
                          'visibility_km', 'uv_index','gust_kph', 'air_quality_us-epa-index',
                          'sunrise_hour', 'sunset_hour', 'day_number','temp_humidity_interaction',
                          'Rain','Air','windsp','Clear_Sky','Not_Froggy']

    categorical_features = ['city_id','day_of_week' ,'season']

    all_features = numerical_features + categorical_features

    # Only use rows where condition_text is not null for training
    df_train = df.dropna(subset=['condition_text'])

    X = df_train[all_features]
    y = df_train['condition_text']

    # Encode the target variable
    le = LabelEncoder()
    y = le.fit_transform(y)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', CityEncoder(drop='first'), categorical_features)
        ])

    # Define the models
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=i),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=i),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=i),
        'LightGBM': LGBMClassifier(n_estimators=100, random_state=i)
    }

    # Train and evaluate each model
    best_model = None
    best_accuracy = 0

    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        print(f"{name} Accuracy: {accuracy:.4f}")
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = pipeline

    print(f"\nBest Model: {type(best_model.named_steps['classifier']).__name__}")
    print(f"Best Accuracy: {best_accuracy:.4f}")

    # Use the best model for predictions and evaluation
    y_pred = best_model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Predict for null values
    null_condition_rows = df[df['condition_text'].isnull()]
    predictions = best_model.predict(null_condition_rows[all_features])
    predicted_conditions = le.inverse_transform(predictions)

    # Update the original dataframe
    df.loc[df['condition_text'].isnull(), 'condition_text'] = predicted_conditions

    # Print a sample of the predictions
    print("\nSample of Predictions:")
    print(df[df['condition_text'].isin(predicted_conditions)].sample(10))

    # Feature importance (if the best model supports it)
    if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        feature_names = (numerical_features + 
                         best_model.named_steps['preprocessor']
                         .named_transformers_['cat'].encoder
                         .get_feature_names_out(categorical_features).tolist())

        importances = best_model.named_steps['classifier'].feature_importances_
        for feature, importance in sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True):
            print(f"{feature}: {importance}")

    # Check predictions for C100 and C003
    print("\nPredictions for C100 and C003:")
    print(df[df['city_id'].isin(['C100', 'C003'])][['city_id', 'condition_text']].head(10))

    return df, best_accuracy

# Run the main function
df1, best_accuracy = Main(42)

# Uncomment the following loop if you want to find the best random state
# best_accuracy = 0
# best_df = None
# for i in range(1, 51):
#     df1, acc1 = Main(i)
#     if acc1 > best_accuracy:
#         best_accuracy = acc1
#         best_df = df1
#         print(f"New best accuracy: {best_accuracy:.4f} with random state {i}")

print(f"\nFinal Best Accuracy: {best_accuracy:.4f}")

  df['sunrise_hour'] = pd.to_datetime(df['sunrise']).dt.hour + pd.to_datetime(df['sunrise']).dt.minute / 60
  df['sunrise_hour'] = pd.to_datetime(df['sunrise']).dt.hour + pd.to_datetime(df['sunrise']).dt.minute / 60
  df['sunset_hour'] = pd.to_datetime(df['sunset']).dt.hour + pd.to_datetime(df['sunset']).dt.minute / 60
  df['sunset_hour'] = pd.to_datetime(df['sunset']).dt.hour + pd.to_datetime(df['sunset']).dt.minute / 60
  df['most_frequent_condition_text_for_city'] = df['most_frequent_condition_text_for_city'].replace('Moderate to Heavy Rain',8)
  df['least_frequent_condition_text_for_city'] = df['least_frequent_condition_text_for_city'].replace('Moderate to Heavy Rain',8)


RandomForest Accuracy: 0.7222




GradientBoosting Accuracy: 0.6944




XGBoost Accuracy: 0.7222
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 335, number of used features: 30
[LightGBM] [Info] Start training from score -1.325494
[LightGBM] [Info] Start training from score -2.203213
[LightGBM] [Info] Start training from score -2.029941
[LightGBM] [Info] Start training from score -3.329224
[LightGBM] [Info] Start training from score -2.258782
[LightGBM] [Info] Start training from score -3.616906
[LightGBM] [Info] Start training from score -1.359783
[LightGBM] [Info] Start training from score -3.249181
[LightGBM] [Info] Start training from score -3.511545
LightGBM Accuracy: 0.7222

Best Model: RandomForestClassifier
Best Accuracy: 0.7222

Classification Report:
                         precision    recall  f1-score   support

        Clear and Sunny     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
df1['condition_text'].value_counts()

condition_text
Partly Cloudy              1219
Clear and Sunny            1213
Light Precipitation         160
Cloudy and Overcast         107
Mist or Fog                 104
Rain Showers                 35
Light Rain with Thunder      20
Thunderstorms                20
Moderate to Heavy Rain       15
Name: count, dtype: int64

In [53]:
# Create a new DataFrame with only day_id and condition_text
df_output = df1[['day_id', 'condition_text']]

# Save the updated DataFrame to a new CSV file
output_filename = 'weather_predictions_submissions6_day3_without_removing_columns.csv'
df_output.to_csv(output_filename, index=False)
print(f"\nPredictions saved to {output_filename}")

# Print the first few rows of the saved data
print("\nFirst few rows of the saved data:")
print(df_output.head())


Predictions saved to weather_predictions_submissions6_day3_without_removing_columns.csv

First few rows of the saved data:
  day_id           condition_text
0  D0001            Partly Cloudy
1  D0002            Partly Cloudy
2  D0003  Light Rain with Thunder
3  D0004          Clear and Sunny
4  D0005          Clear and Sunny
