In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from scipy import stats
import pickle

# Suppress every warning category for the remainder of the session
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme plus larger default text sizes
sns.set_style("whitegrid")
plt.rcParams.update({
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
})

In [3]:
# Read the train and test splits from the local data directory
print("Loading datasets...")
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

Loading datasets...


In [4]:
# Report the dimensions of both splits
train_shape = df_train.shape
test_shape = df_test.shape
print(f"Train data shape: {train_shape}")
print(f"Test data shape: {test_shape}")

# Per-column count of missing entries in the training split
missing_per_column = df_train.isnull().sum()
print("\n--- Missing Values ---")
print(missing_per_column)

# Number of training rows that are exact repeats of an earlier row
duplicate_count = df_train.duplicated().sum()
print("\n--- Duplicate Rows ---")
print(f"Number of duplicate rows: {duplicate_count}")

Train data shape: (2190, 13)
Test data shape: (730, 12)

--- Missing Values ---
id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

--- Duplicate Rows ---
Number of duplicate rows: 0


In [5]:
# Descriptive statistics for the training columns
train_summary = df_train.describe()
print("\n--- Statistical Summary ---")
print(train_summary)


--- Statistical Summary ---
                id          day     pressure      maxtemp  temparature  \
count  2190.000000  2190.000000  2190.000000  2190.000000  2190.000000   
mean   1094.500000   179.948402  1013.602146    26.365799    23.953059   
std     632.342866   105.203592     5.655366     5.654330     5.222410   
min       0.000000     1.000000   999.000000    10.400000     7.400000   
25%     547.250000    89.000000  1008.600000    21.300000    19.300000   
50%    1094.500000   178.500000  1013.000000    27.800000    25.500000   
75%    1641.750000   270.000000  1017.775000    31.200000    28.400000   
max    2189.000000   365.000000  1034.600000    36.000000    31.500000   

           mintemp     dewpoint     humidity        cloud     sunshine  \
count  2190.000000  2190.000000  2190.000000  2190.000000  2190.000000   
mean     22.170091    20.454566    82.036530    75.721918     3.744429   
std       5.059120     5.288406     7.800654    18.026498     3.626327   
min     

In [6]:
# Share of each rainfall class in the training split, expressed in percent
class_share = df_train["rainfall"].value_counts(normalize=True)
print("\n--- Target Variable Distribution ---")
print(class_share * 100)


--- Target Variable Distribution ---
rainfall
1    75.342466
0    24.657534
Name: proportion, dtype: float64


In [7]:
# Function to identify outliers using IQR method
def identify_outliers(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to analyse.
    multiplier : float, default 1.5
        Number of IQRs the fences sit beyond the quartiles. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``df`` strictly below the lower fence or strictly above the
        upper fence.
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    # Strict comparisons: values sitting exactly on a fence are not outliers
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Numerical input columns analysed, capped and scaled throughout the notebook
numerical_features = [
    'pressure',
    # temperature readings
    'maxtemp',
    'temparature',
    'mintemp',
    # moisture
    'dewpoint',
    'humidity',
    # sky conditions
    'cloud',
    'sunshine',
    # wind
    'winddirection',
    'windspeed',
]

In [8]:
# Collect the IQR outlier count, share and bounds for every numerical feature
print("\n--- Outlier Analysis ---")
outlier_stats = {}
for column_name in numerical_features:
    flagged_rows, fence_low, fence_high = identify_outliers(df_train, column_name)
    flagged_count = len(flagged_rows)
    flagged_share = flagged_count / len(df_train) * 100
    # The bounds are stored so later preprocessing can cap values at them
    outlier_stats[column_name] = {
        "count": flagged_count,
        "percentage": flagged_share,
        "lower_bound": fence_low,
        "upper_bound": fence_high
    }
    print(f"{column_name}: {flagged_count} outliers ({flagged_share:.2f}%)")


--- Outlier Analysis ---
pressure: 4 outliers (0.18%)
maxtemp: 0 outliers (0.00%)
temparature: 0 outliers (0.00%)
mintemp: 1 outliers (0.05%)
dewpoint: 26 outliers (1.19%)
humidity: 28 outliers (1.28%)
cloud: 129 outliers (5.89%)
sunshine: 0 outliers (0.00%)
winddirection: 0 outliers (0.00%)
windspeed: 28 outliers (1.28%)


In [10]:
# Pairwise correlations among the numerical features and the target
print("\n--- Feature Correlation Analysis ---")
correlation_matrix = df_train[numerical_features + ['rainfall']].corr()
high_corr_features = []

# Walk the strict upper triangle so each pair appears once and the diagonal is skipped
print("Highly correlated feature pairs:")
corr_columns = correlation_matrix.columns
for i, first_name in enumerate(corr_columns):
    for j in range(i + 1, len(corr_columns)):
        second_name = corr_columns[j]
        coefficient = correlation_matrix.iloc[i, j]
        if abs(coefficient) > 0.8:  # Threshold for high correlation
            print(f"{first_name} & {second_name}: {coefficient:.3f}")
            high_corr_features.append((first_name, second_name, coefficient))



--- Feature Correlation Analysis ---
Highly correlated feature pairs:
pressure & maxtemp: -0.800
pressure & temparature: -0.817
pressure & mintemp: -0.814
pressure & dewpoint: -0.817
maxtemp & temparature: 0.983
maxtemp & mintemp: 0.966
maxtemp & dewpoint: 0.907
temparature & mintemp: 0.987
temparature & dewpoint: 0.934
mintemp & dewpoint: 0.941
cloud & sunshine: -0.805


In [11]:
# Rank every column by its correlation with the rainfall target, strongest positive first
rainfall_column = correlation_matrix['rainfall']
target_correlation = rainfall_column.sort_values(ascending=False)
print("\nFeature correlation with target (rainfall):")
print(target_correlation)


Feature correlation with target (rainfall):
rainfall         1.000000
cloud            0.641191
humidity         0.454213
windspeed        0.111625
dewpoint         0.081965
winddirection   -0.006939
mintemp         -0.026841
temparature     -0.049660
pressure        -0.049886
maxtemp         -0.079304
sunshine        -0.555287
Name: rainfall, dtype: float64


In [12]:
# Data preprocessing function
def preprocess_data(df, is_train=True, cap_outliers=None):
    """Cap outliers and standardize the numerical features of ``df``.

    Relies on two notebook-level names that must already exist when this is
    called: ``numerical_features`` (list of column names) and
    ``outlier_stats`` (per-feature dict holding ``lower_bound`` and
    ``upper_bound``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``numerical_features``.
        It is copied; the caller's frame is not modified.
    is_train : bool, default True
        When True, a new ``StandardScaler`` is fitted on ``df`` and pickled to
        ``scaler.pkl``. When False, ``scaler.pkl`` is loaded and applied; if
        the file does not exist a new scaler is fitted on ``df`` itself.
    cap_outliers : bool or None, default None
        Whether to clip each numerical feature to the bounds stored in
        ``outlier_stats``. ``None`` keeps the original behaviour (cap only
        when ``is_train`` is True). Pass ``True`` with ``is_train=False`` to
        apply the same caps to non-training data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose numerical features are (optionally) capped and
        then standardized.
    """
    # Create a copy to avoid modifying the original dataframe
    processed_df = df.copy()

    # Default: handle outliers for training data only, as before
    if cap_outliers is None:
        cap_outliers = is_train

    if cap_outliers:
        for feature in numerical_features:
            # Named `bounds` rather than `stats` so the file-level
            # `from scipy import stats` import is not shadowed in here
            bounds = outlier_stats[feature]
            # Cap outliers at bounds instead of removing them
            processed_df[feature] = processed_df[feature].clip(
                lower=bounds["lower_bound"],
                upper=bounds["upper_bound"],
            )

    # Standardize numerical features
    if is_train:
        scaler = StandardScaler()
        processed_df[numerical_features] = scaler.fit_transform(processed_df[numerical_features])
        # Save the scaler for later use with test data
        with open("scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)
    else:
        try:
            # Load the scaler fitted on training data.
            # NOTE(review): pickle.load can execute arbitrary code; only safe
            # while scaler.pkl is a file this notebook wrote itself.
            with open("scaler.pkl", "rb") as f:
                scaler = pickle.load(f)
        except FileNotFoundError:
            # Best-effort fallback kept from the original.
            # NOTE(review): this fits on the data being transformed, so the
            # result is not on the same scale as the training features.
            print("Scaler file not found. Using new scaler instance.")
            scaler = StandardScaler()
            processed_df[numerical_features] = scaler.fit_transform(processed_df[numerical_features])
        else:
            processed_df[numerical_features] = scaler.transform(processed_df[numerical_features])

    return processed_df

In [13]:
# Run preprocessing on both splits.
# The training call has to come first: it writes scaler.pkl, which the
# test call then reads back.
print("\n--- Preprocessing Data ---")
df_train_processed = preprocess_data(df=df_train, is_train=True)
df_test_processed = preprocess_data(df=df_test, is_train=False)


--- Preprocessing Data ---


In [14]:
# Inputs for the importance-based feature selection:
# target from the raw training frame, features from the processed one
print("\n--- Feature Selection using Random Forest ---")
y = df_train['rainfall']
X = df_train_processed[numerical_features]


--- Feature Selection using Random Forest ---


In [17]:
# Fit a Random Forest on all candidate features so they can be ranked by importance
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selector.fit(X, y)

# Pair each feature name with its importance score, highest score first
importances = rf_selector.feature_importances_
importance_columns = {'Feature': numerical_features, 'Importance': importances}
feature_importance = pd.DataFrame(importance_columns)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(feature_importance)

         Feature  Importance
6          cloud    0.291328
7       sunshine    0.186434
5       humidity    0.097538
4       dewpoint    0.067204
1        maxtemp    0.066039
0       pressure    0.065979
9      windspeed    0.063994
3        mintemp    0.060184
2    temparature    0.057388
8  winddirection    0.043913


In [18]:
# Keep only the features whose importance is strictly above the threshold
importance_threshold = 0.05  # Features with importance > 5%
is_above_threshold = feature_importance['Importance'] > importance_threshold
selected_features = feature_importance[is_above_threshold]['Feature'].tolist()
print(f"\nSelected features: {selected_features}")

# Final modelling frames restricted to the selected features
y_train = df_train['rainfall']
X_train = df_train_processed[selected_features]
X_test = df_test_processed[selected_features]


Selected features: ['cloud', 'sunshine', 'humidity', 'dewpoint', 'maxtemp', 'pressure', 'windspeed', 'mintemp', 'temparature']


In [19]:
# Hold out 20% of the training rows for validation, stratified on the target
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)
X_train_split, X_val, y_train_split, y_val = split_parts

# Report the resulting dimensions
train_split_shape = X_train_split.shape
val_shape = X_val.shape
test_shape_final = X_test.shape
print("\n--- Final datasets shapes ---")
print(f"Training data: {train_split_shape}")
print(f"Validation data: {val_shape}")
print(f"Test data: {test_shape_final}")


--- Final datasets shapes ---
Training data: (1752, 9)
Validation data: (438, 9)
Test data: (730, 9)


In [20]:
# Write the selected-feature frames and the target to CSV, without the index column
for export_frame, export_path in (
    (X_train, "processed_X_train.csv"),
    (pd.DataFrame(y_train), "processed_y_train.csv"),
    (X_test, "processed_X_test.csv"),
):
    export_frame.to_csv(export_path, index=False)


In [21]:
# Keep the raw test ids so a submission file can be assembled later
test_ids = df_test['id']
test_ids.to_csv(path_or_buf="test_ids.csv", index=False)

print("\nPreprocessed data saved to CSV files.")


Preprocessed data saved to CSV files.
