In [1]:
import pandas as pd
# Load the raw weatherAUS observations into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is replaced by a path relative to a configurable
# data directory. Left unchanged here so the existing environment keeps working.
weather_data = pd.read_csv('C:/Users/shell/Downloads/weatherAUS.csv')

# Quick look at the first rows.
print(weather_data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
weather_data.info()

# Per-column count of missing values, showing only columns that have any.
missing_values = weather_data.isnull().sum()
print(missing_values[missing_values > 0])


         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity3pm  Pressure9am  \
0           W           44.0          W  ...        22.0       1007.7   
1         WNW           44.0        NNW  ...        25.0       1010.6   
2         WSW           46.0          W  ...        30.0       1007.6   
3          NE           24.0         SE  ...        16.0       1017.6   
4           W           41.0        ENE  ...        33.0       1010.8   

   Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  RISK_MM  \
0       1007.1       8

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Preprocess the raw frame: parse dates, fill missing feature values, encode
# the Yes/No columns as integers and one-hot encode the multi-valued
# categorical columns.
# NOTE(review): this cell rebinds `weather_data` in place, so it cannot be
# re-run without first re-running the cell that loads the CSV.

# Parse the date strings into a proper datetime column.
weather_data['Date'] = pd.to_datetime(weather_data['Date'])

# Rows without a target label are dropped instead of being filled in below:
# imputing 'RainTomorrow' with its most frequent value would fabricate labels.
# reset_index hands back a fresh frame with a contiguous index.
weather_data = weather_data.dropna(subset=['RainTomorrow']).reset_index(drop=True)

# Numeric (float64) columns: replace NaN with the column mean.
# NOTE(review): the imputers are fit on the full frame, before any train/test
# split, so test-set statistics influence the imputed training values.
numerical_cols = weather_data.select_dtypes(include=['float64']).columns
imputer = SimpleImputer(strategy='mean')
weather_data[numerical_cols] = imputer.fit_transform(weather_data[numerical_cols])

# Object (string) feature columns: replace NaN with the most frequent value.
# The target is excluded explicitly so it is never touched by an imputer.
categorical_cols = (
    weather_data.select_dtypes(include=['object'])
    .columns
    .drop('RainTomorrow', errors='ignore')
)
imputer = SimpleImputer(strategy='most_frequent')
weather_data[categorical_cols] = imputer.fit_transform(weather_data[categorical_cols])

# LabelEncoder assigns integer codes in sorted class order. The same encoder
# object is refit for the second column, so afterwards it only remembers the
# classes of 'RainToday'.
label_encoder = LabelEncoder()
weather_data['RainTomorrow'] = label_encoder.fit_transform(weather_data['RainTomorrow'])
weather_data['RainToday'] = label_encoder.fit_transform(weather_data['RainToday'])

# One-hot encode the remaining categorical columns; drop_first removes one
# indicator per column to avoid perfectly collinear dummies.
weather_data = pd.get_dummies(weather_data, columns=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], drop_first=True)

print(weather_data.head())




        Date  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0 2008-12-01     13.4     22.9       0.6     5.469824  7.624853   
1 2008-12-02      7.4     25.1       0.0     5.469824  7.624853   
2 2008-12-03     12.9     25.7       0.0     5.469824  7.624853   
3 2008-12-04      9.2     28.0       0.0     5.469824  7.624853   
4 2008-12-05     17.5     32.3       1.0     5.469824  7.624853   

   WindGustSpeed  WindSpeed9am  WindSpeed3pm  Humidity9am  ...  \
0           44.0          20.0          24.0         71.0  ...   
1           44.0           4.0          22.0         44.0  ...   
2           46.0          19.0          26.0         38.0  ...   
3           24.0          11.0           9.0         45.0  ...   
4           41.0           7.0          20.0         82.0  ...   

   WindDir3pm_NNW  WindDir3pm_NW  WindDir3pm_S  WindDir3pm_SE  WindDir3pm_SSE  \
0               0              0             0              0               0   
1               0              0      

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train and evaluate a logistic-regression classifier for 'RainTomorrow'.

# Split the data into features and target.
# 'Date' is dropped because a raw datetime column cannot be fed to the model.
# 'RISK_MM' is dropped as well: per the dataset documentation it is the amount
# of rain recorded on the following day, i.e. the quantity the target is
# derived from, so keeping it as a feature leaks the answer into the model.
# errors='ignore' keeps the cell working on dataset versions without it.
X = (
    weather_data
    .drop(columns=['Date', 'RainTomorrow'])
    .drop(columns=['RISK_MM'], errors='ignore')
)
y = weather_data['RainTomorrow']

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model; max_iter is raised well above the default
# because the features are unscaled and the solver needs more iterations.
model = LogisticRegression(max_iter=4000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print each metric on its own line; a single space-separated print runs the
# accuracy, the matrix and the multi-line report together.
print(accuracy, conf_matrix, class_report, sep='\n')
