In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Read the CSV file into the working DataFrame.
file_path = 'weatherAUS 2.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the leading rows of the frame under a short heading.
print("First few rows of the dataframe:", df.head(), sep="\n")

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes);
# useful here for spotting columns that contain missing values.
df.info()

In [None]:
# Parse the Date column into datetimes, then expose its calendar
# components as separate numeric columns.
df['Date'] = pd.to_datetime(df['Date'])
date_accessor = df['Date'].dt
df['Year'], df['Month'], df['Day'] = (
    date_accessor.year,
    date_accessor.month,
    date_accessor.day,
)

In [None]:
# Encode the two rain flag columns as integer labels.
#
# LabelEncoder does not skip missing values: depending on the scikit-learn
# version, NaN either becomes an extra class of its own or raises a
# TypeError. Either way the result is not the binary encoding intended here,
# and the missing-value handling later in the notebook never sees the gaps.
# Missing flags are therefore filled with the column's most frequent value
# first (the same rule this notebook uses for non-numeric columns).
#
# NOTE(review): for the RainTomorrow column, dropping rows with a missing
# label may be preferable to imputing it; confirm against the modelling plan.
label_encoder = LabelEncoder()
for flag_col in ['RainToday', 'RainTomorrow']:
    flag_values = df[flag_col]
    flag_modes = flag_values.mode()
    if not flag_modes.empty:  # an all-missing column has no mode to fill with
        flag_values = flag_values.fillna(flag_modes.iloc[0])
    # As before, the encoder is left fitted on the last column (RainTomorrow).
    df[flag_col] = label_encoder.fit_transform(flag_values)

In [None]:
# Replace each wind-direction column with one indicator column per category.
categorical_cols = [
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
]
df = pd.get_dummies(data=df, columns=categorical_cols)

In [None]:
# Measure the skewness of every numeric column; the bare expression at the
# end displays the resulting Series as the cell output.
num_cols = df.select_dtypes(include='number').columns
skew_vals = df.loc[:, num_cols].skew()
skew_vals

In [None]:
# Reduce strong right-skew in numeric measurement columns with log1p.
# Skewness is reported before and after so the effect can be compared.
SKEW_THRESHOLD = 0.80  # columns with skew above this are transformed

skewness = df[num_cols].skew().sort_values(ascending=False)
print(skewness)

# Encoded labels and calendar parts are codes, not magnitudes: taking their
# log would turn 0/1 labels into non-integer floats, so they are never
# transformed regardless of their skew.
encoded_cols = {'RainToday', 'RainTomorrow', 'Year', 'Month', 'Day'}

high_skew = pd.Index([
    col for col in skewness[skewness > SKEW_THRESHOLD].index
    if col not in encoded_cols
    and df[col].nunique(dropna=True) > 2  # skip 0/1 indicator columns
    and df[col].min() > -1                # log1p is undefined at or below -1
])

for col in high_skew:
    # Vectorized call; equivalent to applying np.log1p element by element.
    df[col] = np.log1p(df[col])

new_skewness = df[high_skew].skew().sort_values(ascending=False)
print(new_skewness)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
print(summary_stats)

In [None]:
# Count the missing entries in each column and report them.
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

In [None]:
# Impute missing values: column mean for numeric columns, most frequent
# value for everything else.
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

non_numeric_cols = df.select_dtypes(exclude=['number']).columns
for column in non_numeric_cols:
    column_modes = df[column].mode()
    if column_modes.empty:
        # A column with no observed values has no mode; leave it untouched
        # instead of failing on the lookup of its first element.
        continue
    # .iloc[0] takes the first mode by position (ties resolve to the first).
    df[column] = df[column].fillna(column_modes.iloc[0])

In [None]:
# Recount missing entries per column to confirm the result of the fill step.
missing_values_after = df.isna().sum()
print(missing_values_after)

In [None]:
# Pairwise correlations between the numeric columns, printed as text.
corr_matrix = df.loc[:, numeric_cols].corr()
print("\nCorrelation matrix of the numerical columns:", corr_matrix, sep="\n")

In [None]:
# Draw the numeric-column correlation matrix as an annotated heatmap,
# using an explicit figure/axes pair instead of the implicit current axes.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df[numeric_cols].corr(), annot=True, cmap='coolwarm', ax=ax)
plt.show()