In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer

# Location of the raw weather CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at wherever
# weather.csv lives on the machine running the notebook.
WEATHER_CSV_PATH = 'C:/Users/Vaibhav/Desktop/Project2/weather.csv'

# Load the raw dataset into a DataFrame.
weather_data = pd.read_csv(WEATHER_CSV_PATH)

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
weather_data.info()

# Summary statistics of the numeric columns.
print(weather_data.describe())

# Number of missing values in each column, before any cleaning.
print(weather_data.isnull().sum())

# Keep only the columns in which at least 80% of the rows are non-missing
# (`thresh` is the minimum number of non-NA values a column needs to be kept),
# i.e. columns with more than 20% missing values are dropped.
required_non_null = len(weather_data)*0.8
weather_data = weather_data.dropna(axis=1, thresh=required_non_null)

# Numeric columns: fill the remaining gaps with the mean of each column.
numeric_column_means = weather_data.mean(numeric_only=True)
weather_data = weather_data.fillna(numeric_column_means)


def _fill_missing_with_mode(series):
    """Return `series` with its missing cells replaced by its first mode.

    If the series has no mode at all, NaN is used as the fill value, which
    leaves the missing cells missing.
    """
    modes = series.mode()
    fill_value = modes[0] if len(modes) > 0 else np.nan
    return series.fillna(fill_value)


# Apply the mode-based fill to every column; this covers the gaps that the
# mean-based fill above did not touch (the non-numeric columns).
weather_data = weather_data.apply(_fill_missing_with_mode)

# Re-check the per-column missing-value counts after imputation.
print(weather_data.isnull().sum())

# Collect the names of all numeric columns in `numerical_variables`; pick from
# this list when choosing the variables for pair-plot analysis.
numeric_only_frame = weather_data.select_dtypes(include=np.number)
numerical_variables = list(numeric_only_frame.columns)
print("All numerical variables:", numerical_variables)

# Cap outliers column by column: anything further than three standard
# deviations from the column mean is pulled back to that boundary.
for column in numerical_variables:
    # Boundaries are derived from the column's own mean and spread.
    column_mean = weather_data[column].mean()
    column_std = weather_data[column].std()
    upper_limit = column_mean + 3 * column_std
    lower_limit = column_mean - 3 * column_std

    # First cap from above, then cap that result from below.
    capped_high = np.where(weather_data[column] > upper_limit, upper_limit, weather_data[column])
    weather_data[column] = np.where(capped_high < lower_limit, lower_limit, capped_high)

# Binary-encode the two Yes/No rain indicators (RainToday and RainTomorrow):
# 'No' becomes 0 and 'Yes' becomes 1.
yes_no_codes = {'No': 0, 'Yes': 1}
for rain_flag in ('RainToday', 'RainTomorrow'):
    weather_data[rain_flag] = weather_data[rain_flag].map(yes_no_codes)

# Note: Make sure to run the code above before continuing.



# Pick the column to predict; every other entry of numerical_variables
# (the numeric column names collected earlier) is used as a predictor.
current_target_variable = 'MaxTemp'
current_predictor_variables = [
    column_name
    for column_name in numerical_variables
    if column_name != current_target_variable
]

# Feature matrix (X) and target vector (y).
y = weather_data[current_target_variable]
X = weather_data[current_predictor_variables]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the linear regression model and fit it on the training split
# (fit() returns the fitted estimator, so `model` is the trained model).
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

# Put the held-out targets next to the model's predictions for inspection.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Shift every index label up by 2 and give the index column a heading.
# NOTE(review): presumably the +2 lines the labels up with spreadsheet row
# numbers (header row plus 1-based counting) — confirm.
shifted_index = results.index + 2
shifted_index.name = 'Row no.'
results.index = shifted_index

print(results)

# Score the predictions against the true test values.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        366 non-null    float64
 1   MaxTemp        366 non-null    float64
 2   Rainfall       366 non-null    float64
 3   Evaporation    366 non-null    float64
 4   Sunshine       363 non-null    float64
 5   WindGustDir    363 non-null    object 
 6   WindGustSpeed  364 non-null    float64
 7   WindDir9am     335 non-null    object 
 8   WindDir3pm     365 non-null    object 
 9   WindSpeed9am   359 non-null    float64
 10  WindSpeed3pm   366 non-null    int64  
 11  Humidity9am    366 non-null    int64  
 12  Humidity3pm    366 non-null    int64  
 13  Pressure9am    366 non-null    float64
 14  Pressure3pm    366 non-null    float64
 15  Cloud9am       366 non-null    int64  
 16  Cloud3pm       366 non-null    int64  
 17  Temp9am        366 non-null    float64
 18  Temp3pm   