### 0.87089 ROC-AUC 
score on https://www.kaggle.com/competitions/playground-series-s5e3/overview

Model: `CatBoostClassifier` with hyperparameters tuned via grid search.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the train and test CSV files from the local data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [None]:
# Preview the first rows of the raw training data.
train_data.head()
# Idea: combine the maxtemp, mintemp and temperature columns into a single feature.

In [None]:
# Summarize column dtypes and non-null counts of the training data.
train_data.info()

In [None]:
# Pairwise correlations between all columns, annotated to two decimals.
corr_matrix = train_data.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', fmt='.2f', annot=True)

In [None]:
class Preproccessing:
    """Feature-engineering helper: derives ``temp_diff`` and drops columns.

    The DataFrame handed to the constructor is never modified in place;
    ``transform`` builds new frames and stores the result on ``self.data``.
    """

    def __init__(self, data):
        """Store the DataFrame to preprocess.

        Args:
            data: DataFrame that contains ``maxtemp`` and ``mintemp`` columns.
        """
        self.data = data

    def transform(self, columns_to_drop=None) -> pd.DataFrame:
        """Add ``temp_diff`` (``maxtemp - mintemp``) and optionally drop columns.

        Args:
            columns_to_drop: Column labels to remove after ``temp_diff`` has
                been computed. ``None`` or an empty list keeps every column.

        Returns:
            The transformed DataFrame (also stored on ``self.data``).

        Raises:
            KeyError: If ``maxtemp``/``mintemp`` or any label in
                ``columns_to_drop`` is missing from the frame.
        """
        # ``assign`` returns a new frame, so the caller's DataFrame is left
        # untouched (plain item assignment would add the column in place).
        self.data = self.data.assign(
            temp_diff=self.data["maxtemp"] - self.data["mintemp"]
        )

        if columns_to_drop:
            self.data = self.data.drop(columns=columns_to_drop)

        return self.data


In [None]:
# Derive temp_diff on the training data, then drop the columns it replaces
# along with the other listed columns.
dropped_columns = ["maxtemp", "mintemp", "dewpoint", "id", "day"]
train_data = Preproccessing(train_data).transform(columns_to_drop=dropped_columns)

In [None]:
# Preview the training data after preprocessing.
train_data.head()

In [None]:
# Correlation heatmap of the preprocessed training data.
corr_matrix = train_data.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', fmt='.2f', annot=True)

In [None]:
# Histogram of the target column to inspect how the classes are distributed.
target = train_data['rainfall']
plt.hist(target, bins=3)

In [None]:
# Pairwise scatter plots of every column, coloured by the target value.
sns.pairplot(data=train_data, hue="rainfall")

The target classes are imbalanced, the feature distributions are not uniform, and no linear dependencies between features are apparent.

In [None]:
# One box plot per column, to spot outliers visually.
fig, axs = plt.subplots(2, 5, figsize=(20, 10))

columns = train_data.columns.values
print(columns)
# Pair axes with columns in row-major (reading) order instead of relying on
# a hard-coded column count, so the cell keeps working if columns change.
for ax, column in zip(axs.flat, columns):
    sns.boxplot(train_data[column], ax=ax)
    ax.set_title(column)
# Hide any grid cells left without a column to display.
for unused_ax in axs.ravel()[len(columns):]:
    unused_ax.set_visible(False)
plt.show()

Columns with outliers: cloud, pressure, windspeed, humidity, temp_diff

In [None]:
# Replace values beyond fixed thresholds with the column mean.
# Rules run in order and each mean is taken from the column's current
# state, so the second temp_diff rule sees the result of the first.
# NOTE(review): thresholds look hand-picked from the box plots — confirm.
outlier_rules = [
    ("pressure", lambda values: values > 1030),
    ("cloud", lambda values: values < 20),
    ("windspeed", lambda values: values > 47),
    ("humidity", lambda values: values < 63),
    ("temp_diff", lambda values: values > 8),
    ("temp_diff", lambda values: values < 0.5),
]
for column_name, is_outlier in outlier_rules:
    current_values = train_data[column_name]
    train_data.loc[is_outlier(current_values), column_name] = current_values.mean()

In [None]:
# Re-draw one box plot per column to inspect the current value ranges.
fig, axs = plt.subplots(2, 5, figsize=(20, 10))

columns = train_data.columns.values
print(columns)
# Pair axes with columns in row-major (reading) order instead of relying on
# a hard-coded column count, so the cell keeps working if columns change.
for ax, column in zip(axs.flat, columns):
    sns.boxplot(train_data[column], ax=ax)
    ax.set_title(column)
# Hide any grid cells left without a column to display.
for unused_ax in axs.ravel()[len(columns):]:
    unused_ax.set_visible(False)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, f1_score


In [None]:
# Apply the same feature engineering and column removal to the test data.
dropped_columns = ["maxtemp", "mintemp", "dewpoint", "id", "day"]
test_data = Preproccessing(test_data).transform(columns_to_drop=dropped_columns)

In [None]:
from collections import Counter

# Inverse-frequency class weights: the most frequent class gets 1.0 and
# rarer classes get proportionally larger weights, ordered by class label.
counts = Counter(train_data['rainfall'])
majority = max(counts.values())
weights = [majority / class_count for _, class_count in sorted(counts.items())]
weights

In [None]:
# Base classifier; learning rate, depth and L2 regularization are left to
# the grid search.
catboost = CatBoostClassifier(
    loss_function='Logloss',
    iterations=1000,
    class_weights=weights,
    random_seed=42,
    verbose=0,
)

In [None]:
# Hyperparameter search space: 3 learning rates x 5 depths x 3 L2 strengths.
params = dict(
    learning_rate=[0.01, 0.1, 0.3],
    depth=np.arange(5, 10, dtype=int),
    l2_leaf_reg=[0.1, 1, 10],
)

In [None]:
# Search the parameter grid with 5-fold cross-validation on the training data.
X_train = train_data.drop(columns=['rainfall'])
y_train = train_data['rainfall']
catboost.grid_search(
    param_grid=params,
    X=X_train,
    y=y_train,
    cv=5,
    train_size=0.8,
    plot=True,
    verbose=False,
)

In [None]:
# Fit on the full training set, with the target column split off from the features.
# NOTE(review): grid_search may already refit the best model — confirm this extra fit is needed.
train_features = train_data.drop(columns=['rainfall'])
train_target = train_data['rainfall']
catboost.fit(train_features, train_target, verbose=0)

In [None]:
# Use the positive-class probability rather than hard 0/1 labels: ROC-AUC
# ranks samples by score, and thresholded predictions would collapse the
# curve to a single operating point and misstate the metric.
preds = catboost.predict_proba(train_data.drop(columns=['rainfall']))[:, 1]

In [None]:
# ROC-AUC of `preds` against the training labels. This is measured on the
# training rows, not on held-out data.
y_train_true = train_data['rainfall']
score = roc_auc_score(y_true=y_train_true, y_score=preds)

In [None]:
# Display the ROC-AUC value stored in `score`.
score

In [None]:
# Per-class probability predictions for the test data.
prediction = catboost.predict_proba(test_data)

In [None]:
# Fill the sample submission with column 1 of the predicted probabilities
# (the second class) and write it to data/submission.csv.
my_submission = pd.read_csv('data/sample_submission.csv')
# Bracket assignment always writes a real column; attribute assignment would
# only set a Python attribute if the 'rainfall' column were ever missing.
my_submission['rainfall'] = prediction[:, 1]
my_submission.to_csv('data/submission.csv', index=False)