In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Absolute path of the parent of the current working directory; the data
# files read below are located relative to it.
root_path = Path().resolve().parent

In [None]:
# Mapping from the CSV's original headers to the snake_case names used in
# the rest of the notebook.
column_names = {
    'Date': 'date',
    'Location': 'location',
    'MinTemp': 'min_temp',
    'MaxTemp': 'max_temp',
    'Rainfall': 'rainfall',
    'Evaporation': 'evaporation',
    'Sunshine': 'sunshine',
    'WindGustDir': 'wind_gust_dir',
    'WindGustSpeed': 'wind_gust_speed',
    'WindDir9am': 'wind_dir_9am',
    'WindDir3pm': 'wind_dir_3pm',
    'WindSpeed9am': 'wind_speed_9am',
    'WindSpeed3pm': 'wind_speed_3pm',
    'Humidity9am': 'humidity_9am',
    'Humidity3pm': 'humidity_3pm',
    'Pressure9am': 'pressure_9am',
    'Pressure3pm': 'pressure_3pm',
    'Cloud9am': 'cloud_9am',
    'Cloud3pm': 'cloud_3pm',
    'Temp9am': 'temp_9am',
    'Temp3pm': 'temp_3pm',
    'RainToday': 'rain_today',
    'RainTomorrow': 'rain_tomorrow',
}

# Only rows whose location is one of these values are kept.
# NOTE(review): 'Brisbane City' may not match any value in the location
# column (the city may be recorded as 'Brisbane') - verify with
# df['location'].unique() before relying on this filter.
kept_locations = ['Canberra', 'Sydney', 'Darwin', 'Melbourne', 'Brisbane City']

csv_path = os.path.join(root_path, 'data', 'csv', 'weatherAUS.csv')
df = pd.read_csv(csv_path).rename(columns=column_names)
df = df.loc[df['location'].isin(kept_locations)]

In [None]:
# Preview the first rows of the filtered, renamed frame.
df.head()

In [None]:
# (rows, columns) remaining after the location filter.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Parse the date strings into datetime values.
# `assign` returns a new frame instead of writing a column into `df`, which
# at this point is a boolean-filtered selection of the loaded data; this
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Rainfall against date. Rows from every kept location are drawn as one
# series, since no grouping by location is applied here.
plot = df.plot(x='date', y='rainfall')

In [None]:
import matplotlib.pyplot as plt

# Overlay the min and max temperature histograms on the same axes; the
# column name doubles as the legend label.
for temperature_column in ('min_temp', 'max_temp'):
    df[temperature_column].hist(bins=20, alpha=0.5, label=temperature_column)

plt.title('Distribution of Min and Max Temperatures')
plt.xlabel('Temperature')
plt.ylabel('Nb occurrences')
plt.legend()
plt.show()

In [None]:
# One box per location for the wind gust speed; tick labels are rotated so
# the location names stay readable.
df.boxplot(by='location', column='wind_gust_speed', rot=90)
plt.title('Wind Gust Speed by Location')
plt.ylabel('Wind Gust Speed')
plt.show()

In [None]:
# Relationship between the 3 PM temperature and the 3 PM humidity.
afternoon_temperature = df['temp_3pm']
afternoon_humidity = df['humidity_3pm']

plt.scatter(afternoon_temperature, afternoon_humidity, alpha=0.5)
plt.title('Temperature vs. Humidity at 3 PM')
plt.xlabel('Temperature at 3 PM')
plt.ylabel('Humidity at 3 PM')
plt.show()

In [None]:
import seaborn as sns

# Numeric measurements included in the correlation matrix.
numerical_columns = [
    'min_temp', 'max_temp', 'rainfall',
    'wind_gust_speed', 'wind_speed_9am', 'wind_speed_3pm',
    'humidity_9am', 'humidity_3pm',
    'pressure_9am', 'pressure_3pm',
    'cloud_9am', 'cloud_3pm',
    'temp_9am', 'temp_3pm',
]

# Pairwise correlations, annotated with two decimals.
plt.figure(figsize=(10, 8))
corr = df[numerical_columns].corr()
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Discard rows where either rain flag is missing: rain_tomorrow is the
# target below and rain_today is used as a feature.
df = df.dropna(subset=['rain_today', 'rain_tomorrow'])

In [None]:
# Split the column names by dtype: numeric columns vs. object columns.
numerical_columns = df.select_dtypes(include='number').columns.tolist()
categorical_columns = df.select_dtypes(include='object').columns.tolist()

print(numerical_columns)
print(categorical_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and the date.
X = df.drop(columns=['rain_tomorrow', 'date'])
y = df['rain_tomorrow']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=48
)

for split_label, split_features in (("Train Set:", X_train), ("Test Set:", X_test)):
    print(split_label, split_features.shape)

In [None]:
# Give every split a fresh 0..n-1 index, discarding the original row labels.
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [None]:
from sklearn.impute import SimpleImputer

# Numeric gaps are filled with the median; the imputer is fitted on the
# training split only and then applied to the test split.
imputer_numerical = SimpleImputer(strategy='median')
imputer_numerical.fit(X_train[numerical_columns])
X_train.loc[:, numerical_columns] = imputer_numerical.transform(X_train[numerical_columns])
X_test.loc[:, numerical_columns] = imputer_numerical.transform(X_test[numerical_columns])

# Object columns still present in the features are filled with their most
# frequent training value.
categorical_columns_restricted = X_train.select_dtypes(include='object').columns.tolist()

imputer_object = SimpleImputer(strategy='most_frequent')
imputer_object.fit(X_train[categorical_columns_restricted])
X_train.loc[:, categorical_columns_restricted] = imputer_object.transform(X_train[categorical_columns_restricted])
X_test.loc[:, categorical_columns_restricted] = imputer_object.transform(X_test[categorical_columns_restricted])

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode `location` as integer codes learned from the training split.
location_encoder = LabelEncoder().fit(X_train['location'])
X_train['location'] = location_encoder.transform(X_train['location'])
X_test['location'] = location_encoder.transform(X_test['location'])

location_labels = location_encoder.classes_
location_mapping = dict(zip(location_labels, location_encoder.transform(location_labels)))
print(location_mapping)

# Same treatment for the `rain_today` flag.
rain_today_encoder = LabelEncoder().fit(X_train['rain_today'])
X_train['rain_today'] = rain_today_encoder.transform(X_train['rain_today'])
X_test['rain_today'] = rain_today_encoder.transform(X_test['rain_today'])

rain_labels = rain_today_encoder.classes_
rain_mapping = dict(zip(rain_labels, rain_today_encoder.transform(rain_labels)))
print(rain_mapping)


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Wind direction columns are one-hot encoded; the first category of each
# column is dropped.
columns_to_encode = ['wind_gust_dir', 'wind_dir_9am', 'wind_dir_3pm']

oneh_encoder = OneHotEncoder(drop='first', sparse_output=False)
oneh_encoder.fit(X_train[columns_to_encode])

X_train_encoded_cols = oneh_encoder.transform(X_train[columns_to_encode])
X_test_encoded_cols = oneh_encoder.transform(X_test[columns_to_encode])

encoded_names = oneh_encoder.get_feature_names_out(columns_to_encode)
X_train_encoded_df = pd.DataFrame(X_train_encoded_cols, columns=encoded_names)
X_test_encoded_df = pd.DataFrame(X_test_encoded_cols, columns=encoded_names)

# Swap the raw direction columns for their encoded counterparts. The encoded
# frames carry a default 0..n-1 index, so the splits are expected to have one
# as well for the column-wise concat to line up.
X_train = pd.concat([X_train.drop(columns=columns_to_encode), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=columns_to_encode), X_test_encoded_df], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns using statistics learned from the
# training split only.
sc = StandardScaler()
sc.fit(X_train[numerical_columns])

X_train[numerical_columns] = sc.transform(X_train[numerical_columns])
X_test[numerical_columns] = sc.transform(X_test[numerical_columns])

In [None]:
# Preview the fully preprocessed training features.
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings.
reglog = LogisticRegression().fit(X_train, y_train)

# Accuracy on both splits.
train_accuracy = reglog.score(X_train, y_train)
test_accuracy = reglog.score(X_test, y_test)
print('Train score', train_accuracy)
print('Test score', test_accuracy)

In [None]:
from sklearn.metrics import classification_report

# Confusion table and per-class metrics for the logistic regression.
y_pred = reglog.predict(X_test)

confusion = pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])
display(confusion)

print(classification_report(y_test, y_pred))

In [None]:
from sklearn import tree

# Decision tree on the full feature set.
# The seed is fixed (same value as the other seeded trees in this notebook)
# so that the scores, and the feature importances read from this model
# afterwards, are reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

print('Train score', clf.score(X_train, y_train))
print('Test score', clf.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report

# Confusion table and per-class metrics for the decision tree.
y_pred = clf.predict(X_test)

confusion = pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])
display(confusion)

print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Feature importances of the current `clf`, largest first, as a bar chart.
feat_importances = pd.DataFrame(
    {"Importance": clf.feature_importances_},
    index=X_train.columns,
).sort_values(by='Importance', ascending=False)
feat_importances.plot(kind='bar', figsize=(8,6))

In [None]:
# Retrain the tree on a three-feature subset only.
selected_features = ['humidity_3pm', 'wind_gust_speed', 'pressure_3pm']
X_train_new = X_train[selected_features]
X_test_new = X_test[selected_features]

clf = tree.DecisionTreeClassifier(random_state=42).fit(X_train_new, y_train)

# Accuracy on the training split, then on the test split.
train_accuracy = clf.score(X_train_new, y_train)
test_accuracy = clf.score(X_test_new, y_test)
print(train_accuracy)
print(test_accuracy)

In [None]:
from sklearn.metrics import classification_report

# Confusion table and per-class metrics for the three-feature tree.
y_pred = clf.predict(X_test_new)

confusion = pd.crosstab(y_test, y_pred, rownames=['Reality'], colnames=['Prediction'])
display(confusion)

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree 

# Shallow tree (depth 3) on the three selected features, small enough to draw.
clf = tree.DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train_new, y_train)

fig, ax = plt.subplots(figsize=(20, 20))

# plot_tree expects class_names in the same ascending order as clf.classes_
# and feature_names in the order of the columns the model was fitted on.
# Both are read from the fitted objects so the node labels cannot drift out
# of sync with the model (a hand-written ['Yes', 'No'] would swap the labels,
# because the sorted classes are 'No' then 'Yes').
plot_tree(clf,
          feature_names=list(X_train_new.columns),
          class_names=[str(label) for label in clf.classes_],
          filled=True,
          rounded=True,
          ax=ax)

plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the full feature set. The seed is fixed so the bootstrap
# samples and feature draws, and therefore the scores, are reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print('Train score', rf.score(X_train, y_train))
print('Test score', rf.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report

# Confusion table and per-class metrics for the random forest.
y_pred = rf.predict(X_test)

confusion = pd.crosstab(y_test, y_pred)
display(confusion)

print(classification_report(y_test, y_pred))

In [None]:
# Share of each class in the test target, to check how balanced it is.
y_test.value_counts(normalize=True)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training split. The seed is fixed so the resampled
# set, and every model trained on it afterwards, is the same on each run.
rOs = RandomOverSampler(random_state=42)
X_ro, y_ro = rOs.fit_resample(X_train, y_train)
print('Oversampled :', dict(pd.Series(y_ro).value_counts(normalize = True)))

In [None]:
# Logistic regression trained on the oversampled training data.
reglog = LogisticRegression()
reglog.fit(X_ro, y_ro)

# The train score is measured on the data the model was fitted on (X_ro),
# as in the other oversampled-model cells, so the numbers are comparable.
print('Train score', reglog.score(X_ro, y_ro))
print('Test score', reglog.score(X_test, y_test))

y_pred = reglog.predict(X_test)

display(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Decision tree trained on the oversampled training data; seeded so the
# reported scores are reproducible.
clf = tree.DecisionTreeClassifier(random_state=42)
clf.fit(X_ro, y_ro)

print('Train score', clf.score(X_ro, y_ro))
print('Test score', clf.score(X_test, y_test))

y_pred = clf.predict(X_test)

display(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Random forest trained on the oversampled training data; seeded so the
# reported scores are reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_ro, y_ro)

print('Train score', rf.score(X_ro, y_ro))
print('Test score', rf.score(X_test, y_test))

y_pred = rf.predict(X_test)

display(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters shared by both penalty types.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'max_iter': [100, 1000, 10000]
}

# The default solver (lbfgs) does not support the l1 penalty, so those
# candidates would fail to fit and be scored as NaN. The l1 candidates are
# therefore paired with the liblinear solver, while the l2 candidates keep
# the default solver.
param_grid = [
    {**shared_grid, 'penalty': ['l2']},
    {**shared_grid, 'penalty': ['l1'], 'solver': ['liblinear']},
]

reglog = LogisticRegression()

# NOTE(review): X_ro contains duplicated rows from random oversampling, so
# the same row can land in both the fitting and the validation fold of a CV
# split; the cross-validated scores are probably optimistic.
grid_search = GridSearchCV(estimator=reglog, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_ro, y_ro)

print(grid_search.best_params_)

In [None]:
# Logistic regression refitted on the oversampled data with hand-entered
# hyperparameters (presumably the grid search result - confirm against
# grid_search.best_params_).
reglog = LogisticRegression(C=10, penalty='l2', max_iter=100).fit(X_ro, y_ro)

train_accuracy = reglog.score(X_ro, y_ro)
test_accuracy = reglog.score(X_test, y_test)
print('Train score', train_accuracy)
print('Test score', test_accuracy)

y_pred = reglog.predict(X_test)

confusion = pd.crosstab(y_test, y_pred)
display(confusion)
print(classification_report(y_test, y_pred))

In [None]:
# Candidate random forest hyperparameters.
param_grid = {
    'max_depth': [5, 9, None],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Seeded so that score differences between candidates come from the
# hyperparameters rather than from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state=42)

# NOTE(review): X_ro contains duplicated rows from random oversampling, so
# the same row can land in both the fitting and the validation fold of a CV
# split; the cross-validated scores are probably optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_ro, y_ro)

print(grid_search.best_params_)

In [None]:
# Random forest refitted on the oversampled data with hand-entered
# hyperparameters (presumably the grid search result - confirm against
# grid_search.best_params_). Seeded so the reported scores are reproducible.
rf = RandomForestClassifier(max_depth=9, min_samples_leaf=1, min_samples_split=2, n_estimators=200,
                            random_state=42)
rf.fit(X_ro, y_ro)

print('Train score', rf.score(X_ro, y_ro))
print('Test score', rf.score(X_test, y_test))

y_pred = rf.predict(X_test)

display(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [36]:
import pandas as pd
from pathlib import Path
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import warnings

# Silences every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics; consider
# narrowing the filter to the specific warning categories that are noisy.
warnings.filterwarnings('ignore')

# Load data
root_path = Path().resolve().parent
df = pd.read_csv(os.path.join(root_path, 'data', 'csv', 'weatherAUS.csv'))

# Rename columns
df = df.rename(columns={'Date': 'date',
                        'Location': 'location',
                        'MinTemp': 'min_temp',
                        'MaxTemp': 'max_temp',
                        'Rainfall': 'rainfall',
                        'Evaporation': 'evaporation',
                        'Sunshine': 'sunshine',
                        'WindGustDir': 'wind_gust_dir',
                        'WindGustSpeed': 'wind_gust_speed',
                        'WindDir9am': 'wind_dir_9am',
                        'WindDir3pm': 'wind_dir_3pm',
                        'WindSpeed9am': 'wind_speed_9am',
                        'WindSpeed3pm': 'wind_speed_3pm',
                        'Humidity9am': 'humidity_9am',
                        'Humidity3pm': 'humidity_3pm',
                        'Pressure9am': 'pressure_9am',
                        'Pressure3pm': 'pressure_3pm',
                        'Cloud9am': 'cloud_9am',
                        'Cloud3pm': 'cloud_3pm',
                        'Temp9am': 'temp_9am',
                        'Temp3pm': 'temp_3pm',
                        'RainToday': 'rain_today',
                        'RainTomorrow': 'rain_tomorrow'})

# Filter locations
# NOTE(review): 'Brisbane City' may not match any value in the location
# column (the city may be recorded as 'Brisbane') - verify with
# df['location'].unique().
df = df[df['location'].isin(['Canberra', 'Sydney', 'Darwin', 'Melbourne', 'Brisbane City'])]

# Drop rows with a missing rain_today or rain_tomorrow value.
# The result is rebound instead of using inplace=True: `df` is a filtered
# selection at this point, and mutating it in place can raise
# SettingWithCopyWarning and makes the cell harder to re-run.
df = df.dropna(subset=['rain_today', 'rain_tomorrow'])

# Define features and target
X = df.drop(['rain_tomorrow', 'date'], axis=1)
y = df['rain_tomorrow']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define columns
numerical_columns = X_train.select_dtypes(include='number').columns.tolist()
categorical_columns = X_train.select_dtypes(include='object').columns.tolist()
# NOTE(review): the two lists below are not used by the pipeline defined in
# this cell, which one-hot encodes every column in `categorical_columns`.
categorical_label_columns = ['location', 'rain_today']
categorical_onehot_columns = ['wind_gust_dir', 'wind_dir_9am', 'wind_dir_3pm']

# Preprocessing pipelines
# Numeric columns: median imputation followed by standardisation.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Object columns: most-frequent imputation followed by one-hot encoding.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_columns),
    ('categorical', categorical_pipeline, categorical_columns)
])

# Define the model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter tuning
# Every entry holds a single value, so exactly one configuration is evaluated.
param_grid = {
    'classifier__n_estimators': [100],
    'classifier__max_depth': [2],
    'classifier__min_samples_split': [2],
    'classifier__min_samples_leaf': [1]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy', error_score='raise')
grid_search.fit(X_train, y_train)

# Save the best model
best_model_pipeline = os.path.join(root_path, 'model', 'random_forest_model.joblib')
# joblib.dump does not create missing directories; make sure the target
# folder exists so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(best_model_pipeline), exist_ok=True)
joblib.dump(grid_search.best_estimator_, best_model_pipeline)

['/Users/tanguyboulard/Programmation/DataScientest-project-DEC23/data/model/random_forest_model.joblib']

In [37]:
# Load the model
loaded_pipeline = joblib.load(best_model_pipeline)
loaded_pipeline

In [38]:
# Evaluate model
print('Train score:', loaded_pipeline.score(X_train, y_train))
print('Test score:', loaded_pipeline.score(X_test, y_test))

y_pred = loaded_pipeline.predict(X_test)

# Display results
display(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

Train score: 0.8037430967478012
Test score: 0.7942740286298569


col_0,No,Yes
rain_tomorrow,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1837,11
Yes,492,105


              precision    recall  f1-score   support

          No       0.79      0.99      0.88      1848
         Yes       0.91      0.18      0.29       597

    accuracy                           0.79      2445
   macro avg       0.85      0.58      0.59      2445
weighted avg       0.82      0.79      0.74      2445



In [39]:
import numpy as np

# A single hand-written observation used to try out the saved pipeline.
# NOTE(review): humidity_9am / humidity_3pm are above 100, which looks out of
# range if these are percentages, and 'Albury' is not one of the locations
# kept for training - confirm this sample is intended.
new_data = pd.DataFrame({
    'date': ['2008-12-12'],
    'location': ['Albury'],
    'min_temp': [15.9],
    'max_temp': [21.7],
    'evaporation': [np.nan],
    'sunshine': [np.nan],
    'rainfall': [5.0],
    'wind_gust_dir': ['NNE'],
    'wind_gust_speed': [31],
    'wind_dir_9am': ['NE'],
    'wind_speed_9am': [15],
    'wind_dir_3pm': ['ENE'],
    'wind_speed_3pm': [13],
    'humidity_9am': [120],
    'humidity_3pm': [150],
    'pressure_9am': [1010.5],
    'pressure_3pm': [1004.2],
    'cloud_9am': [8],
    'cloud_3pm': [8],
    'temp_9am': [15.9],
    'temp_3pm': [17]
})

# Derive rain_today row by row from the rainfall amount, so the cell stays
# correct if more rows are added (a scalar computed from row 0 would be
# broadcast to every row). A missing rainfall compares False and maps to 'No'.
# NOTE(review): the 1 mm threshold is inclusive here; confirm it matches the
# definition used for rain_today in the training data.
new_data['rain_today'] = new_data['rainfall'].ge(1).map({True: 'Yes', False: 'No'})
new_data

Unnamed: 0,date,location,min_temp,max_temp,evaporation,sunshine,rainfall,wind_gust_dir,wind_gust_speed,wind_dir_9am,...,wind_speed_3pm,humidity_9am,humidity_3pm,pressure_9am,pressure_3pm,cloud_9am,cloud_3pm,temp_9am,temp_3pm,rain_today
0,2008-12-12,Albury,15.9,21.7,,,5.0,NNE,31,NE,...,13,120,150,1010.5,1004.2,8,8,15.9,17,Yes


In [40]:
# Run the reloaded pipeline on the sample and report the label predicted
# for its first (and only) row.
predicted_rain_tomorrow = loaded_pipeline.predict(new_data)
first_prediction = predicted_rain_tomorrow[0]
print('Predicted rain_tomorrow:', first_prediction)

Predicted rain_tomorrow: No
