In [1]:
# Section 1: Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('weatherAUS.csv')

# Drop columns with too many missing values
df.drop(['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm'], axis=1, inplace=True)

# Drop the Date column as it's not needed for the model
df.drop(['Date'], axis=1, inplace=True)

# Handle missing values using mean for numerical columns and most frequent for categorical columns
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Apply imputers
df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Ensure the target column exists and is correctly encoded if necessary
if 'RainTomorrow' in df.columns:
    df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

# One-hot encode all categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
if not categorical_cols.empty:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split the dataset
X = df.drop(['RainTomorrow'], axis=1)
y = df['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Section 2: PCA
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Section 3: Simple Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_pca, y_train)
y_pred_log_reg = log_reg.predict(X_test_pca)
log_reg_acc = accuracy_score(y_test, y_pred_log_reg)
lrp = precision_score(y_test, y_pred_log_reg)
lrf1 = f1_score(y_test, y_pred_log_reg)
lrr = recall_score(y_test, y_pred_log_reg)

# Decision Tree
tree = DecisionTreeClassifier()
tree.fit(X_train_pca, y_train)
y_pred_tree = tree.predict(X_test_pca)
tree_acc = accuracy_score(y_test, y_pred_tree)
treep = precision_score(y_test, y_pred_tree)
treer = recall_score(y_test, y_pred_tree)
treef1 = f1_score(y_test, y_pred_tree)

# Section 4: Ensemble Model
from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_pca, y_train)
y_pred_rf = rf.predict(X_test_pca)
rf_acc = accuracy_score(y_test, y_pred_rf)
rfp = precision_score(y_test, y_pred_rf)
rf1 = f1_score(y_test, y_pred_rf)
rfrr = recall_score(y_test, y_pred_rf)

# Section 5: Deployment
# A simple function for prediction
def predict_rain(features):
    processed_features = scaler.transform([features])
    pca_features = pca.transform(processed_features)
    prediction = rf.predict(pca_features)
    return 'Yes' if prediction[0] == 1 else 'No'

# Section 6: Prediction
# Example prediction
example_features = X_test[0]
prediction = predict_rain(example_features)
print(f'Prediction for the given features: {prediction}')

# Section 7: Comparison among Models
print("Logistic Regression Accuracy:", log_reg_acc)
print("Decision Tree Accuracy:", tree_acc)
print("Random Forest Accuracy:", rf_acc)

# Section 8: Report
report = f"""
Machine Learning Model Comparison:
1. Logistic Regression: 
- Accuracy: {log_reg_acc:.3f}
- Precision: {lrp:.3f}
- Recall: {lrr:.3f}
- F1 Score: {lrf1:.3f}
2. Decision Tree:
- Accuracy: {tree_acc:.3f}
- Precision: {treep:.3f}
- Recall: {treer:.3f}
- F1 Score: {treef1:.3f}
3. Random Forest:
- Accuracy: {rf_acc:.3f}
- Precision: {rfp:.3f}
- Recall: {rfrr:.3f}
- F1 Score: {rf1:.3f}
"""
print(report)

# Section 9: Model Saving
import joblib
import os

# Create a directory to store the model
os.makedirs('./models', exist_ok=True)

# Save the trained model
with open('./models/logreg.joblib', 'wb') as model_file:
    joblib.dump(log_reg, model_file)
print("Model saved successfully as 'logreg.joblib' in the './models' directory.")


Prediction for the given features: No
Logistic Regression Accuracy: 0.8179568266190018
Decision Tree Accuracy: 0.7554310463357624
Random Forest Accuracy: 0.8256565378798295

Machine Learning Model Comparison:
1. Logistic Regression: 
- Accuracy: 0.818
- Precision: 0.673
- Recall: 0.341
- F1 Score: 0.453
2. Decision Tree:
- Accuracy: 0.755
- Precision: 0.447
- Recall: 0.456
- F1 Score: 0.452
3. Random Forest:
- Accuracy: 0.826
- Precision: 0.675
- Recall: 0.404
- F1 Score: 0.506

Model saved successfully as 'logreg.joblib' in the './models' directory.


