In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# Location of the raw weather observations.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists. Consider a path relative to the notebook.
WEATHER_CSV_PATH = 'C:/Users/Vaibhav/Desktop/Project2/weather.csv'

weather_data = pd.read_csv(WEATHER_CSV_PATH)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
weather_data.info()

# Summary statistics of the numeric columns.
print(weather_data.describe())

# Number of missing values per column, before any cleaning.
print(weather_data.isnull().sum())

# Keep only the columns in which at least 80% of the rows are non-null, i.e.
# drop every column with more than 20% of its values missing.
# `thresh` is the minimum number of non-null values a column needs in order to
# be kept and is documented as an integer, so the fractional row count is
# rounded up (count >= 0.8 * n  is the same test as  count >= ceil(0.8 * n)).
min_non_null = int(np.ceil(len(weather_data) * 0.8))
weather_data = weather_data.dropna(thresh=min_non_null, axis=1)

# Numeric columns: replace missing cells with the column mean.
weather_data = weather_data.fillna(weather_data.mean(numeric_only=True))


def _fill_with_mode(column):
    """Return `column` with missing cells replaced by its most frequent value.

    The mode is computed once per column. A column without any mode (for
    example one that is entirely missing) is returned unchanged.
    """
    modes = column.mode()
    if modes.empty:
        return column
    return column.fillna(modes.iloc[0])


# Remaining gaps (the non-numeric columns): replace missing cells with the
# most frequent value of the corresponding column.
weather_data = weather_data.apply(_fill_with_mode)

# Missing values per column after imputation.
print(weather_data.isnull().sum())

# Names of every numeric column currently in the frame. This list drives the
# outlier capping below; pick from `numerical_variables` when choosing columns
# for a pair plot.
numerical_variables = list(weather_data.select_dtypes(include='number').columns)
print("All numerical variables:", numerical_variables)

# Cap outliers column by column: anything further than three standard
# deviations from the column mean is pulled back to that boundary.
for column in numerical_variables:
    observed = weather_data[column]
    center = observed.mean()
    three_sigma = 3 * observed.std()
    upper_limit = center + three_sigma
    lower_limit = center - three_sigma

    # Limits come from the untouched column; the capped result is stored as float.
    weather_data[column] = observed.astype(float).clip(lower=lower_limit, upper=upper_limit)

# Binary-encode the two Yes/No rain columns: 'No' -> 0, 'Yes' -> 1.
# Any value other than these two labels is turned into NaN by Series.map.
yes_no_codes = {'No': 0, 'Yes': 1}
for rain_flag in ('RainToday', 'RainTomorrow'):
    weather_data[rain_flag] = weather_data[rain_flag].map(yes_no_codes)

# Note: the code below relies on this 0/1 encoding (predictions are mapped back
# from 0/1 to 'No'/'Yes'), so make sure this step has been run first.



# Classification target, with every column listed in numerical_variables
# (other than the target itself) used as a predictor.
# NOTE(review): the predictors include same-day measurements such as Rainfall;
# if RainToday is derived from Rainfall this is target leakage - confirm.
current_target_variable1 = 'RainToday'
current_predictor_variables1 = [
    name for name in numerical_variables if name != current_target_variable1
]

# Feature matrix / label vector, then a reproducible 80/20 split into
# X_trainset1, X_testset1, y_trainset1 and y_testset1.
X1, y1 = weather_data[current_predictor_variables1], weather_data[current_target_variable1]
(X_trainset1, X_testset1,
 y_trainset1, y_testset1) = train_test_split(X1, y1, test_size=0.2, random_state=42)

# model1: logistic regression fitted on the training split. max_iter is raised
# to 1000 to avoid the convergence warning.
model1 = LogisticRegression(max_iter=1000).fit(X_trainset1, y_trainset1)

# Predicted 0/1 labels for the held-out rows.
y_predict1 = model1.predict(X_testset1)


# Side-by-side table of the actual and the predicted labels for the test rows.
results1 = pd.DataFrame({'Actual': y_testset1, 'Predicted': y_predict1})

# Convert the codes back to labels: 0 -> 'No' and 1 -> 'Yes'.
rain_labels = {0: 'No', 1: 'Yes'}
for label_column in ('Actual', 'Predicted'):
    results1[label_column] = results1[label_column].map(rain_labels)

# Shift every index value by 2 and give the index column a heading.
# NOTE(review): presumably +2 turns the 0-based index into the row number seen
# in a spreadsheet (1-based, plus the header row) - confirm.
results1.index = results1.index + 2
results1.index.name = 'Row no.'

# The display.max_rows setting in effect before printing; kept under this name
# in case other cells refer to it.
original_max_rows = pd.get_option('display.max_rows')

# Print every row. option_context lifts the row limit only inside the `with`
# block and restores the previous setting afterwards, even if printing raises,
# so other dataframes are not affected later.
with pd.option_context('display.max_rows', None):
    print(results1)


# Score the predictions against the held-out labels: the confusion matrix and
# the overall accuracy.
conf_matrix1 = confusion_matrix(y_testset1, y_predict1)
accuracy1 = accuracy_score(y_testset1, y_predict1)

# Report: accuracy on one line, then the heading and the matrix on the lines after it.
print(f'Accuracy: {accuracy1}')
print('Confusion Matrix:', conf_matrix1, sep='\n')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        366 non-null    float64
 1   MaxTemp        366 non-null    float64
 2   Rainfall       366 non-null    float64
 3   Evaporation    366 non-null    float64
 4   Sunshine       363 non-null    float64
 5   WindGustDir    363 non-null    object 
 6   WindGustSpeed  364 non-null    float64
 7   WindDir9am     335 non-null    object 
 8   WindDir3pm     365 non-null    object 
 9   WindSpeed9am   359 non-null    float64
 10  WindSpeed3pm   366 non-null    int64  
 11  Humidity9am    366 non-null    int64  
 12  Humidity3pm    366 non-null    int64  
 13  Pressure9am    366 non-null    float64
 14  Pressure3pm    366 non-null    float64
 15  Cloud9am       366 non-null    int64  
 16  Cloud3pm       366 non-null    int64  
 17  Temp9am        366 non-null    float64
 18  Temp3pm   