In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
# Load the labelled training set and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it points at a shared/configurable data directory.
train_csv_path = r'c:\Users\omarf\Downloads\playground-series-s5e3\train.csv'
train = pd.read_csv(train_csv_path)
train.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [4]:
# Print a structural summary of the training frame: columns, non-null counts, dtypes, memory.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2190 entries, 0 to 2189
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             2190 non-null   int64  
 1   day            2190 non-null   int64  
 2   pressure       2190 non-null   float64
 3   maxtemp        2190 non-null   float64
 4   temparature    2190 non-null   float64
 5   mintemp        2190 non-null   float64
 6   dewpoint       2190 non-null   float64
 7   humidity       2190 non-null   float64
 8   cloud          2190 non-null   float64
 9   sunshine       2190 non-null   float64
 10  winddirection  2190 non-null   float64
 11  windspeed      2190 non-null   float64
 12  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(3)
memory usage: 222.6 KB


In [5]:
# Count missing entries per column (`isna` is the pandas alias of `isnull`).
train.isna().sum()

id               0
day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

In [6]:
# Load the unlabelled test set and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it points at a shared/configurable data directory.
test_csv_path = r'c:\Users\omarf\Downloads\playground-series-s5e3\test.csv'
test = pd.read_csv(test_csv_path)
test.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Stack the train features (target removed) on top of the test rows so both
# go through the same preprocessing; ignore_index gives one continuous index
# with the train rows first.
df = pd.concat([train.drop(columns=['rainfall']), test], ignore_index=True)

# Keep the numeric columns and fill gaps with the per-column mean.
# The mean is computed on the *selected* numeric frame (not on `df`), so the
# imputation keeps working if a non-numeric column ever appears in the data:
# `df.mean()` on a mixed-dtype frame raises in recent pandas versions.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
df_imputed = numeric_df.fillna(numeric_df.mean())
assert not df_imputed.isna().any().any(), "imputation left missing values behind"

# Standardize every column to zero mean / unit variance before PCA.
# NOTE(review): `id` and `day` are still part of this input even though the
# modelling cell drops them as irrelevant; confirm whether the PCA should
# exclude them as well.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_imputed)

# Full PCA: no n_components given, so every component is kept.
pca = PCA()
pca_data = pca.fit_transform(scaled_data)

# Wrap the component scores in a frame with columns PC1..PCk for inspection.
pca_df = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12
0,-1.630651,0.833814,-2.398448,-0.267003,-0.422082,-0.174096,-0.161073,0.409853,-0.012802,0.333877,0.249478,-0.000995
1,-3.300306,1.355496,-2.375545,-0.22703,-0.252109,-0.317218,0.77832,0.240525,-0.048277,0.241379,0.300778,0.110998
2,-2.897559,-3.041935,-2.494526,-0.041413,-0.094765,0.172785,0.856516,0.564757,0.005101,-0.340581,0.046536,-0.081707
3,-2.722696,1.86845,-2.155256,-0.174297,1.184143,-0.583479,0.542189,-0.493244,0.0682,0.196433,0.178662,0.029571
4,-2.735276,-3.953058,-2.17871,-0.231033,0.779283,1.196522,-1.868527,0.096821,-0.975295,-0.32272,-0.048935,0.108724


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


def _fit_and_score(model, fit_X, fit_y, eval_X, eval_y):
    """Fit `model` on the fit split and score it on the eval split.

    Returns a tuple of (positive-class probabilities for eval_X, ROC AUC of
    those probabilities against eval_y).
    """
    model.fit(fit_X, fit_y)
    positive_proba = model.predict_proba(eval_X)[:, 1]
    return positive_proba, roc_auc_score(eval_y, positive_proba)


# Feature matrix: every imputed column except the two treated as irrelevant.
X = df_imputed.drop(columns=['id', 'day'])
# Labels exist only for the training rows.
y = train['rainfall']

# df_imputed holds the train rows first (train was concatenated before test),
# so the leading len(train) rows of X line up with y. Hold out 20% for scoring.
X_train, X_test, y_train, y_test = train_test_split(
    X[:len(train)], y, test_size=0.2, random_state=42
)

# Three candidate classifiers, all seeded for repeatable results.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)

# Fit each candidate on the training split and score it on the hold-out split.
log_reg_preds, log_reg_auc = _fit_and_score(log_reg, X_train, y_train, X_test, y_test)
dt_preds, dt_auc = _fit_and_score(decision_tree, X_train, y_train, X_test, y_test)
rf_preds, rf_auc = _fit_and_score(random_forest, X_train, y_train, X_test, y_test)

# Report the hold-out AUC of each model.
print(f"Logistic Regression AUC: {log_reg_auc:.4f}")
print(f"Decision Tree AUC: {dt_auc:.4f}")
print(f"Random Forest AUC: {rf_auc:.4f}")

Logistic Regression AUC: 0.8724
Decision Tree AUC: 0.7283
Random Forest AUC: 0.8615


In [12]:
# Build the test feature matrix from the combined, imputed frame rather than
# from the raw `test` frame: the model was trained on imputed features, so the
# test rows must go through the same preprocessing. The test rows are the tail
# of df_imputed because train was concatenated first.
test_features = df_imputed.iloc[len(train):].drop(columns=['id', 'day'])
assert len(test_features) == len(test), "test rows do not line up with df_imputed"

# Create a DataFrame for submission.
# The models in this notebook are compared by ROC AUC on predicted
# probabilities, so submit the positive-class probability instead of hard 0/1
# labels; hard labels throw away the ranking that AUC measures.
# NOTE(review): logistic regression scored a higher hold-out AUC than the
# random forest used here; confirm the model choice is intentional.
submission = pd.DataFrame({
    'id': test['id'],
    'rainfall': random_forest.predict_proba(test_features)[:, 1]
})

# Save the submission file
submission.to_csv(r'c:\Users\omarf\Downloads\playground-series-s5e3\submission.csv', index=False)