In [7]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\shahr\Downloads\Rainfall.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Summary statistics of numerical columns
print(df.describe())

# Check data types of columns
print(df.dtypes)


         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [8]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = r'C:\Users\shahr\Downloads\Rainfall.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Handle missing values
# Check which columns have missing values
print(df.isnull().sum())

# Impute missing values for numerical columns with mean
numerical_cols = df.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# Impute missing values for categorical columns with mode
categorical_cols = df.select_dtypes(include='object').columns
imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer.fit_transform(df[categorical_cols])

# Convert categorical variables into numerical representations if necessary
label_encoder = LabelEncoder()
for col in categorical_cols:
    if df[col].dtype == 'object':
        df[col] = label_encoder.fit_transform(df[col])

# Verify if there are any remaining missing values
print(df.isnull().sum())

# Verify changes and final dataset
print(df.head())


Date                0
Location            0
MinTemp            75
MaxTemp            60
Rainfall          240
Evaporation      3512
Sunshine         3994
WindGustDir       991
WindGustSpeed     991
WindDir9am        829
WindDir3pm        308
WindSpeed9am       76
WindSpeed3pm      107
Humidity9am        59
Humidity3pm       102
Pressure9am      1309
Pressure3pm      1312
Cloud9am         2421
Cloud3pm         2455
Temp9am            56
Temp3pm            96
RainToday         240
RainTomorrow      239
dtype: int64
Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64
   Date  Location  MinTemp  Max

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming df is already prepared from previous steps
# Ensure target variable RainTomorrow is encoded if necessary

# Feature Selection/Engineering
# Drop unnecessary columns if needed
# Example: Dropping Date and Location columns for simplicity
df = df.drop(['Date', 'Location'], axis=1)

# Splitting the dataset into features (X) and target (y)
X = df.drop('RainTomorrow', axis=1)
y = df['RainTomorrow']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (6740, 20)
Shape of X_test: (1685, 20)
Shape of y_train: (6740,)
Shape of y_test: (1685,)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming X_train, X_test, y_train, y_test are already prepared from previous steps

# Initialize the model (Logistic Regression in this example)
model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8468842729970326

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      1314
           1       0.71      0.51      0.60       371

    accuracy                           0.85      1685
   macro avg       0.79      0.73      0.75      1685
weighted avg       0.84      0.85      0.84      1685


Confusion Matrix:
[[1237   77]
 [ 181  190]]


In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1.0, 10.0],  # Regularization parameter
    'penalty': ['l1', 'l2']  # Regularization penalty ('l1' for Lasso, 'l2' for Ridge)
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=LogisticRegression(max_iter=1000, random_state=42),
                           param_grid=param_grid,
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # Use accuracy as the metric for evaluation
                           verbose=1,
                           n_jobs=-1)  # Use all available CPU cores

# Perform grid search to find the best parameters
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print("\nAccuracy (Best Model):", accuracy_best)

# Classification report for the best model
print("\nClassification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

# Confusion matrix for the best model
print("\nConfusion Matrix (Best Model):")
print(confusion_matrix(y_test, y_pred_best))


Fitting 5 folds for each of 6 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shahr\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shahr\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\shahr\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Best Cross-validation Accuracy: 0.8219584569732937

Accuracy (Best Model): 0.8468842729970326

Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      1314
           1       0.71      0.51      0.59       371

    accuracy                           0.85      1685
   macro avg       0.79      0.73      0.75      1685
weighted avg       0.84      0.85      0.84      1685


Confusion Matrix (Best Model):
[[1238   76]
 [ 182  189]]


In [12]:
import joblib

# Assuming best_model is the best estimator obtained from GridSearchCV

# Save the best model to a file
model_filename = 'best_logistic_regression_model.pkl'
joblib.dump(best_model, model_filename)

print(f"Best model saved to {model_filename}")


Best model saved to best_logistic_regression_model.pkl


In [13]:
# Load the saved model
loaded_model = joblib.load(model_filename)

# Example: Predict using the loaded model (assuming X_new is new data)
# y_pred_new = loaded_model.predict(X_new)
