## 0. Import libraries

In [None]:
# data analysis related library
import pandas as pd
import numpy as np

# some visualization related library
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
from matplotlib.colors import LogNorm
import seaborn as sns

# category_encoders and warning related library
from warnings import simplefilter
simplefilter(action='ignore')
import category_encoders 

# sklearn related ML library
from sklearn import metrics
from sklearn.metrics import roc_curve, RocCurveDisplay
from sklearn.metrics import mean_absolute_error
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc

# other useful libraries
import missingno as msno
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from pylab import subplots_adjust

## 1. Read Data & Quality Analysis 

In [None]:
# Load the raw data and summarise its structure and missing values.
weather_data = pd.read_csv("weatherAUS.csv")

# A notebook cell only echoes its *last* expression, and this cell ends with
# plt.show(), so every summary below has to be printed explicitly or it is
# silently discarded.
weather_data.info()
print(weather_data.shape)
print(weather_data.head())
print(weather_data.describe())

# Total number of missing cells, then the per-column count and fraction,
# both sorted so the most incomplete columns come first.
print(weather_data.isna().sum().sum())
print(weather_data.isnull().sum().sort_values(ascending=False))
print((weather_data.isnull().sum() / weather_data.isnull().count()).sort_values(ascending=False))

# missingno views of the missing-value pattern.
msno.dendrogram(weather_data)
plt.show()
msno.heatmap(weather_data)
plt.show()
msno.matrix(weather_data, color=(47/255, 127/255, 255/255))
plt.show()
# The bar chart is drawn on a random 1000-row sample.
# RGB channels are fractions of 255 (the original green channel used 225).
msno.bar(weather_data.sample(1000), color=(255/255, 151/255, 0/255))
plt.show()

## PCA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

# Read the raw observations from disk.
file_path = 'weatherAUS.csv'
weather_data = pd.read_csv(file_path)

# Remove the four columns with a high amount of missing values.
weather_data.drop(columns=["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"], inplace=True)

# Impute missing values: column mean for numeric columns ...
numeric_columns = weather_data.select_dtypes(include=['float64', 'int64']).columns
weather_data[numeric_columns] = weather_data[numeric_columns].fillna(
    weather_data[numeric_columns].mean()
)

# ... and the most frequent value (first mode) for object columns.
object_columns = weather_data.select_dtypes(include=['object']).columns
most_frequent = {name: weather_data[name].mode()[0] for name in object_columns}
weather_data = weather_data.fillna(value=most_frequent)

# Replace each categorical column by integer class labels. The same encoder
# object is refit per column, so afterwards it only remembers the last one.
label_encoder = LabelEncoder()
categorical_columns = ['Location', 'RainToday', 'RainTomorrow', 'WindDir3pm', 'WindGustDir', 'WindDir9am']
for name in categorical_columns:
    weather_data[name] = label_encoder.fit_transform(weather_data[name])

# 'Date' is non-numeric and is not used by the PCA.
weather_data = weather_data.drop(columns=['Date'])

# Split off the target and standardise the remaining features.
target = weather_data['RainTomorrow']
features = weather_data.drop(columns=['RainTomorrow'])
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Full PCA (all components) on the original, imbalanced data;
# explained variance is kept as a percentage for the scree plot.
pca_original = PCA()
principal_components_original = pca_original.fit_transform(features_scaled)
explained_variance_original = 100 * pca_original.explained_variance_ratio_

# Balance the classes: resample the RainTomorrow == 1 rows with replacement
# until there are as many of them as RainTomorrow == 0 rows.
rain_tomorrow = weather_data['RainTomorrow']
normal_weather_data = weather_data[rain_tomorrow == 0]
abnormal_weather_data = weather_data[rain_tomorrow == 1]
upsampled_data = resample(
    abnormal_weather_data,
    replace=True,
    n_samples=len(normal_weather_data),
    random_state=123,
)
weather_data_balanced = pd.concat([normal_weather_data, upsampled_data])

# Standardise the balanced data. Note that this refits the shared `scaler`,
# so from here on it holds the statistics of the balanced data.
target_balanced = weather_data_balanced['RainTomorrow']
features_balanced = weather_data_balanced.drop(columns=['RainTomorrow'])
features_balanced_scaled = scaler.fit_transform(features_balanced)

# Full PCA on the balanced data.
pca_balanced = PCA()
principal_components_balanced = pca_balanced.fit_transform(features_balanced_scaled)
explained_variance_balanced = 100 * pca_balanced.explained_variance_ratio_

# Scree plots (percentage of explained variance per component), original and
# balanced datasets side by side.
def _draw_scree(ax, explained_variance, title):
    """Draw one scree plot (bars plus connecting line) on *ax*.

    explained_variance: 1-D array of percentages, one per principal component.
    title: title of the panel.
    """
    n_components = len(explained_variance)
    sns.barplot(x=np.arange(1, n_components + 1), y=explained_variance, color='b', ax=ax)
    # seaborn treats the x values as categories and draws the bars at ordinal
    # positions 0..n-1 (labelled 1..n). The line therefore has to use those
    # positions too; plotting it at 1..n shifts it one bar to the right.
    ax.plot(np.arange(n_components), explained_variance, 'ko-')
    ax.set_xlabel('Dimensions')
    ax.set_ylabel('Percentage of explained variance')
    ax.set_title(title)


scree_fig, (scree_ax_original, scree_ax_balanced) = plt.subplots(1, 2, figsize=(20, 8))
_draw_scree(scree_ax_original, explained_variance_original, 'Scree Plot - Original Dataset')
_draw_scree(scree_ax_balanced, explained_variance_balanced, 'Scree Plot - Balanced Dataset')

plt.tight_layout()
plt.show()

# Two-component PCA on the original data. The rows of `components_` are the
# PC1 and PC2 loading vectors; they are tabulated with one row per feature
# and printed under the heading "Contribution".
pca_2d_original = PCA(n_components=2)
principal_components_2d_original = pca_2d_original.fit_transform(features_scaled)
pca_components_original = pca_2d_original.components_
contribution_df_original = pd.DataFrame(
    {'PC1': pca_components_original[0], 'PC2': pca_components_original[1]},
    index=features.columns,
)
print("Contribution of features to the first two principal components (Original Dataset):")
print(contribution_df_original)

# Same table for the balanced data (it has the same feature columns).
pca_2d_balanced = PCA(n_components=2)
principal_components_2d_balanced = pca_2d_balanced.fit_transform(features_balanced_scaled)
pca_components_balanced = pca_2d_balanced.components_
contribution_df_balanced = pd.DataFrame(
    {'PC1': pca_components_balanced[0], 'PC2': pca_components_balanced[1]},
    index=features.columns,
)
print("Contribution of features to the first two principal components (Balanced Dataset):")
print(contribution_df_balanced)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import FancyArrowPatch
import numpy as np

# Colormap shared by the biplots below.
cmap = plt.cm.plasma


def draw_arrow(ax, start, end, color):
    """Add a '->' arrow from *start* to *end* to *ax*.

    ax: matplotlib Axes that receives the arrow.
    start, end: (x, y) positions of the tail and the head.
    color: colour of the arrow.
    """
    ax.add_artist(
        FancyArrowPatch(
            posA=start,
            posB=end,
            arrowstyle='->',
            mutation_scale=15,
            linewidth=1.5,
            color=color,
        )
    )

# Biplot for Original Dataset: one arrow per feature, pointing at the
# feature's (PC1, PC2) loadings, coloured by the length of that loading vector.
fig, ax = plt.subplots(figsize=(10, 10))

ax.set_facecolor('white')

# Length of each feature's loading vector in the PC1/PC2 plane.
arrow_lengths_original = np.sqrt(np.sum(pca_2d_original.components_**2, axis=0))
max_arrow_length_original = np.max(arrow_lengths_original)
# Rescale so that the longest arrow has length 0.9 and stays inside the
# unit circle drawn below.
scaling_factor_original = 0.9 / max_arrow_length_original

# The colour value is the same length, min-max normalised to [0, 20].
contributions_original = arrow_lengths_original
contribution_span_original = np.max(contributions_original) - np.min(contributions_original)
if contribution_span_original > 0:
    contribution_normalized = 20 * (contributions_original - np.min(contributions_original)) / contribution_span_original
else:
    # All arrows have the same length: avoid 0/0 and use the bottom of the scale.
    contribution_normalized = np.zeros_like(contributions_original)

# A single Normalize drives both the arrow colours and the colorbar, so the
# colorbar actually describes the arrows (previously the arrows used
# value / 25 while the colorbar spanned 0..20).
color_norm_original = plt.Normalize(vmin=0, vmax=20)

for i, feature in enumerate(features.columns):

    contribution = contribution_normalized[i]
    scaled_x = pca_2d_original.components_[0, i] * scaling_factor_original
    scaled_y = pca_2d_original.components_[1, i] * scaling_factor_original
    color = cmap(float(color_norm_original(contribution)))

    draw_arrow(ax, (0, 0), (scaled_x, scaled_y), color=color)

    # Place the label a little beyond the arrow head, then nudge it back
    # towards the origin by a fixed offset.
    offset_x = -0.08 * np.sign(scaled_x)
    offset_y = -0.08 * np.sign(scaled_y)
    plt.text(scaled_x * 1.15 + offset_x, scaled_y * 1.15 + offset_y, feature,
             color=color, fontsize=9, ha='center', va='center', fontweight='medium', zorder=10)

plt.xlabel(f'Dim1 ({pca_2d_original.explained_variance_ratio_[0] * 100:.1f}%)', fontsize=12)
plt.ylabel(f'Dim2 ({pca_2d_original.explained_variance_ratio_[1] * 100:.1f}%)', fontsize=12)
plt.title('Variables - PCA', fontsize=14)

# Axes through the origin and the unit circle as visual reference.
plt.axhline(0, color='black', linestyle='--', linewidth=0.6)
plt.axvline(0, color='black', linestyle='--', linewidth=0.6)
plt.gca().add_patch(plt.Circle((0, 0), 1, color='black', fill=False, linestyle='--', linewidth=0.6))
plt.gca().set_aspect('equal', adjustable='datalim')
plt.xlim(-1, 1)
plt.ylim(-1, 1)

plt.grid(True, color='lightgray', linestyle='-', linewidth=0.5)

# Colorbar for the normalised arrow lengths (same norm as the arrows).
sm = plt.cm.ScalarMappable(cmap=cmap, norm=color_norm_original)
sm.set_array([])
cbar = plt.colorbar(sm, ax=plt.gca(), orientation='vertical', fraction=0.03, pad=0.04)
cbar.set_label('contrib', fontsize=12)

plt.tight_layout()
plt.show()

# Biplot for Balanced Dataset: one arrow per feature, pointing at the
# feature's (PC1, PC2) loadings, coloured by the length of that loading vector.
fig, ax = plt.subplots(figsize=(10, 10))

ax.set_facecolor('white')

# Length of each feature's loading vector in the PC1/PC2 plane.
arrow_lengths_balanced = np.sqrt(np.sum(pca_2d_balanced.components_**2, axis=0))
max_arrow_length_balanced = np.max(arrow_lengths_balanced)
# Rescale so that the longest arrow has length 0.9 and stays inside the
# unit circle drawn below.
scaling_factor_balanced = 0.9 / max_arrow_length_balanced

# The colour value is the same length, min-max normalised to [0, 20].
contributions_balanced = arrow_lengths_balanced
contribution_span_balanced = np.max(contributions_balanced) - np.min(contributions_balanced)
if contribution_span_balanced > 0:
    contribution_normalized_balanced = 20 * (contributions_balanced - np.min(contributions_balanced)) / contribution_span_balanced
else:
    # All arrows have the same length: avoid 0/0 and use the bottom of the scale.
    contribution_normalized_balanced = np.zeros_like(contributions_balanced)

# A single Normalize drives both the arrow colours and the colorbar, so the
# colorbar actually describes the arrows (previously the arrows used
# value / 25 while the colorbar spanned 0..20).
color_norm_balanced = plt.Normalize(vmin=0, vmax=20)

for i, feature in enumerate(features.columns):

    contribution = contribution_normalized_balanced[i]
    scaled_x = pca_2d_balanced.components_[0, i] * scaling_factor_balanced
    scaled_y = pca_2d_balanced.components_[1, i] * scaling_factor_balanced
    color = cmap(float(color_norm_balanced(contribution)))

    draw_arrow(ax, (0, 0), (scaled_x, scaled_y), color=color)

    # Place the label a little beyond the arrow head, then nudge it back
    # towards the origin by a fixed offset.
    offset_x = -0.1 * np.sign(scaled_x)
    offset_y = -0.05 * np.sign(scaled_y)
    plt.text(scaled_x * 1.15 + offset_x, scaled_y * 1.15 + offset_y, feature,
             color=color, fontsize=9, ha='center', va='center', fontweight='medium', zorder=10)

plt.xlabel(f'Dim1 ({pca_2d_balanced.explained_variance_ratio_[0] * 100:.1f}%)', fontsize=12)
plt.ylabel(f'Dim2 ({pca_2d_balanced.explained_variance_ratio_[1] * 100:.1f}%)', fontsize=12)
plt.title('Variables - PCA', fontsize=14)

# Axes through the origin and the unit circle as visual reference.
plt.axhline(0, color='black', linestyle='--', linewidth=0.6)
plt.axvline(0, color='black', linestyle='--', linewidth=0.6)
plt.gca().add_patch(plt.Circle((0, 0), 1, color='black', fill=False, linestyle='--', linewidth=0.6))
plt.gca().set_aspect('equal', adjustable='datalim')
plt.xlim(-1, 1)
plt.ylim(-1, 1)

plt.grid(True, color='lightgray', linestyle='-', linewidth=0.5)

# Colorbar for the normalised arrow lengths (same norm as the arrows).
sm = plt.cm.ScalarMappable(cmap=cmap, norm=color_norm_balanced)
sm.set_array([])
cbar = plt.colorbar(sm, ax=plt.gca(), orientation='vertical', fraction=0.03, pad=0.04)
cbar.set_label('contrib', fontsize=12)

plt.tight_layout()
plt.show()

## T-sne

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

sns.set_style("white")

# Reload the data and repeat the preprocessing so this cell can run on its own.
file_path = 'weatherAUS.csv'
weather_data = pd.read_csv(file_path)

weather_data.drop(["Evaporation", "Sunshine", "Cloud9am", "Cloud3pm"], inplace=True, axis=1)

# Impute: column mean for numeric columns, first mode for object columns.
for col in weather_data.select_dtypes(include=['float64', 'int64']).columns:
    weather_data[col] = weather_data[col].fillna(weather_data[col].mean())

for col in weather_data.select_dtypes(include=['object']).columns:
    weather_data[col] = weather_data[col].fillna(weather_data[col].mode()[0])

# Integer-encode the categorical columns (the encoder is refit per column).
label_encoder = LabelEncoder()
for col in ['Location', 'RainToday', 'RainTomorrow', 'WindDir3pm', 'WindGustDir', 'WindDir9am']:
    weather_data[col] = label_encoder.fit_transform(weather_data[col])

weather_data = weather_data.drop(['Date'], axis=1)

features = weather_data.drop(['RainTomorrow'], axis=1)
target = weather_data['RainTomorrow']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Project the original data onto the first two principal components.
pca_original = PCA(n_components=2)
principal_components_original = pca_original.fit_transform(features_scaled)

# cos2 of an individual = squared length of its projection onto the PC1/PC2
# plane divided by its squared length in the full (centred) feature space,
# i.e. how well the plane represents that point (between 0 and 1).
# The previous formula divided by the grand total over *all* rows, which gives
# each row's share of the in-plane inertia rather than a squared cosine.
plane_sq_norm_original = (principal_components_original ** 2).sum(axis=1)
full_sq_norm_original = ((features_scaled - pca_original.mean_) ** 2).sum(axis=1)
cos2_original = np.divide(
    plane_sq_norm_original,
    full_sq_norm_original,
    out=np.zeros_like(plane_sq_norm_original),
    where=full_sq_norm_original > 0,  # a row sitting exactly on the mean gets 0
)

# Balance the classes by resampling the RainTomorrow == 1 rows with replacement.
normal_weather_data = weather_data[weather_data.RainTomorrow == 0]
abnormal_weather_data = weather_data[weather_data.RainTomorrow == 1]
upsampled_data = resample(abnormal_weather_data, replace=True, random_state=123, n_samples=len(normal_weather_data))
weather_data_balanced = pd.concat([normal_weather_data, upsampled_data])

features_balanced = weather_data_balanced.drop(['RainTomorrow'], axis=1)
target_balanced = weather_data_balanced['RainTomorrow']
# Refits the shared scaler on the balanced data.
features_balanced_scaled = scaler.fit_transform(features_balanced)

pca_balanced = PCA(n_components=2)
principal_components_balanced = pca_balanced.fit_transform(features_balanced_scaled)

# Same cos2 definition for the balanced data.
plane_sq_norm_balanced = (principal_components_balanced ** 2).sum(axis=1)
full_sq_norm_balanced = ((features_balanced_scaled - pca_balanced.mean_) ** 2).sum(axis=1)
cos2_balanced = np.divide(
    plane_sq_norm_balanced,
    full_sq_norm_balanced,
    out=np.zeros_like(plane_sq_norm_balanced),
    where=full_sq_norm_balanced > 0,
)

# 2x2 grid of PCA "individuals" plots: top row = original data, bottom row =
# balanced data; left column coloured by cos², right column by RainTomorrow.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
fig.patch.set_facecolor('white')


def _individuals_panel(ax, scores, pca, hue, title, legend_title, scatter_kwargs, legend_kwargs):
    """Draw one scatter panel of the first two PC scores and style its legend.

    ax: Axes to draw on.
    scores: array whose first two columns are the PC1 / PC2 scores.
    pca: fitted PCA providing the explained-variance ratios for the axis labels.
    hue: values used to colour the points.
    title, legend_title: panel title and legend title.
    scatter_kwargs, legend_kwargs: panel-specific extras forwarded to
        sns.scatterplot and ax.legend respectively.

    Returns the (scatterplot result, legend) pair.
    """
    ax.set_facecolor('white')
    scatter = sns.scatterplot(
        x=scores[:, 0],
        y=scores[:, 1],
        hue=hue,
        palette='RdYlBu',
        alpha=0.6,
        edgecolor=None,
        ax=ax,
        **scatter_kwargs,
    )
    ax.set_xlabel(f'Dim1 ({pca.explained_variance_ratio_[0] * 100:.1f}%)')
    ax.set_ylabel(f'Dim2 ({pca.explained_variance_ratio_[1] * 100:.1f}%)')
    ax.set_title(title)
    # Dashed reference lines through the origin.
    ax.axhline(0, linestyle='--', color='black', linewidth=0.5)
    ax.axvline(0, linestyle='--', color='black', linewidth=0.5)

    # Enlarged legend: 22 pt entries, 24 pt title.
    legend = ax.legend(title=legend_title, fontsize=22, title_fontsize=24, **legend_kwargs)
    for entry in legend.get_texts():
        entry.set_fontsize(22)
    legend.set_title(legend_title, prop={'size': 24})
    return scatter, legend


ax1, ax2 = axes[0]
ax3, ax4 = axes[1]

# Legend settings per panel type.
cos2_legend = {'markerscale': 3, 'loc': 'best'}
# The group panels draw tiny points (s=2), hence the large legend marker scale.
group_legend = {'markerscale': 15}
group_markers = {0: 'o', 1: 'X'}

# ----------------------------
# 1: original data, coloured by cos²
scatter1, legend1 = _individuals_panel(
    ax1, principal_components_original, pca_original, cos2_original,
    'Individuals - PCA (Original Dataset)', 'cos²',
    scatter_kwargs={}, legend_kwargs=cos2_legend,
)

# ----------------------------
# 2: original data, coloured and marked by RainTomorrow
scatter2, legend2 = _individuals_panel(
    ax2, principal_components_original, pca_original, target,
    'Individuals - PCA (Original Dataset, Groups)', 'RainTomorrow',
    scatter_kwargs={'style': target, 'markers': group_markers, 's': 2},
    legend_kwargs=group_legend,
)

# ----------------------------
# 3: balanced data, coloured by cos²
scatter3, legend3 = _individuals_panel(
    ax3, principal_components_balanced, pca_balanced, cos2_balanced,
    'Individuals - PCA (Balanced Dataset)', 'cos²',
    scatter_kwargs={}, legend_kwargs=cos2_legend,
)

# ----------------------------
# 4: balanced data, coloured and marked by RainTomorrow
scatter4, legend4 = _individuals_panel(
    ax4, principal_components_balanced, pca_balanced, target_balanced,
    'Individuals - PCA (Balanced Dataset, Groups)', 'RainTomorrow',
    scatter_kwargs={'style': target_balanced, 'markers': group_markers, 's': 2},
    legend_kwargs=group_legend,
)

plt.tight_layout()

# plt.savefig('pca_plots.png', transparent=True)

plt.show()