In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore, spearmanr, pearsonr
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing


In [None]:
# Load the raw dataset.
# Replace 'data.csv' with the actual path to your dataset.
data = pd.read_csv('data.csv')

# Structural overview: column names, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()

# Summary statistics of the numeric columns.
print(data.describe())

# Quick look at the first few rows (rendered as the cell output).
data.head()


In [None]:
# Fill gaps in the numeric columns with time-weighted linear interpolation.
# interpolate(method='time') only works on a DatetimeIndex, so the numeric
# columns are temporarily re-indexed by the parsed 'Timestamp' column. The
# 'Timestamp' column itself is left untouched because later cells convert and
# index on it.
numeric_columns = data.select_dtypes(include=[np.number]).columns
observation_times = pd.DatetimeIndex(pd.to_datetime(data['Timestamp']))
numeric_by_time = data[numeric_columns].set_axis(observation_times, axis=0)
numeric_filled = numeric_by_time.interpolate(method='time')
# Restore the original row labels so the assignment aligns row-for-row.
numeric_filled.index = data.index
data[numeric_columns] = numeric_filled

# Detect outliers with per-column Z-scores and keep only rows where every
# numeric value lies within 3 standard deviations of its column mean.
# nan_policy='omit' keeps a single leftover NaN (e.g. a leading gap that
# interpolation cannot fill) from turning the whole column's scores into NaN;
# rows that still contain a NaN fail the `< 3` comparison and are dropped.
z_scores = np.abs(zscore(data[numeric_columns].to_numpy(dtype=float), nan_policy='omit'))
# .copy() makes data_cleaned an independent frame, so later column
# assignments do not raise SettingWithCopyWarning.
data_cleaned = data[(z_scores < 3).all(axis=1)].copy()

# Check the size of the dataset after cleaning
print(f"Original dataset size: {data.shape}")
print(f"Cleaned dataset size: {data_cleaned.shape}")


In [None]:
# Work on an independent copy: data_cleaned was produced by boolean filtering,
# and assigning columns onto such a slice triggers SettingWithCopyWarning.
data_cleaned = data_cleaned.copy()

# Standardize the numeric columns (zero mean, unit variance).
# NOTE: this cell overwrites the numeric columns of data_cleaned, so running it
# twice transforms already-transformed values; re-run the cleaning cell first.
scaler = StandardScaler()
numerical_cols = data_cleaned.select_dtypes(include=[np.number]).columns
data_cleaned[numerical_cols] = scaler.fit_transform(data_cleaned[numerical_cols])

# Reduce skewness with a Yeo-Johnson power transform (not a plain log
# transform: Yeo-Johnson also accepts zero and negative values, which the
# standardized data contains).
power_transformer = PowerTransformer(method='yeo-johnson')
data_cleaned[numerical_cols] = power_transformer.fit_transform(data_cleaned[numerical_cols])


In [None]:
# Pearson correlation (linear relationships) between the numeric columns.
# Non-numeric columns such as the raw 'Timestamp' strings are excluded
# explicitly: recent pandas versions raise if DataFrame.corr() is given
# columns it cannot convert to float.
numeric_features = data_cleaned.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()

# Example: Spearman rank correlation, which captures monotonic (not
# necessarily linear) relationships.
temp = data_cleaned['Temperature']
humidity = data_cleaned['Humidity']
corr, _ = spearmanr(temp, humidity)
print(f"Spearman correlation between Temperature and Humidity: {corr:.2f}")


In [None]:
# Ensure the data is time-indexed.
# The guard keeps the cell re-runnable: once 'Timestamp' has become the index
# it is no longer a column, and converting it again would raise KeyError.
if 'Timestamp' in data_cleaned.columns:
    data_cleaned = (
        data_cleaned
        .assign(Timestamp=pd.to_datetime(data_cleaned['Timestamp']))
        .set_index('Timestamp')
    )
# Time-series models assume observations are in chronological order.
data_cleaned = data_cleaned.sort_index()

temperature = data_cleaned['Temperature']

# Test stationarity (Augmented Dickey-Fuller test)
result = adfuller(temperature)
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')

# ARIMA(1, 1, 1) model for forecasting
arima_model = ARIMA(temperature, order=(1, 1, 1))
arima_fit = arima_model.fit()
print(arima_fit.summary())

# Forecast the next 30 steps.
forecast = arima_fit.forecast(steps=30)

# If the index has no usable frequency (for example because outlier rows were
# removed), statsmodels returns the forecast on an integer index, which cannot
# be drawn on the same datetime axis as the observations. In that case rebuild
# a datetime index that continues from the last observation, using the median
# spacing between observations as the step.
if isinstance(forecast, pd.Series) and not isinstance(forecast.index, pd.DatetimeIndex):
    typical_step = temperature.index.to_series().diff().median()
    if pd.notna(typical_step) and typical_step > pd.Timedelta(0):
        forecast.index = pd.date_range(
            start=temperature.index[-1] + typical_step,
            periods=len(forecast),
            freq=typical_step,
        )

# Plot forecast
plt.plot(temperature, label="Actual")
plt.plot(forecast, label="Forecast", color='red')
plt.legend()
plt.show()


In [None]:
# Target and predictors shared by both regression models.
# Swap in other column names to model a different relationship.
y = data_cleaned['Temperature']
X = data_cleaned[['Pressure', 'Humidity']]

# Ordinary least squares fit; fit() returns the estimator itself.
reg = LinearRegression().fit(X, y)
print(f"Linear Regression Coefficients: {reg.coef_}")

# Ridge fit: the L2 penalty (alpha=1.0) stabilizes the coefficients when the
# predictors are correlated with each other.
ridge = Ridge(alpha=1.0).fit(X, y)
print(f"Ridge Regression Coefficients: {ridge.coef_}")


In [None]:
# Project the numeric features onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data_cleaned[numerical_cols])
print(f"Explained Variance by Components: {pca.explained_variance_ratio_}")

# Scatter the observations in component space; transposing the (n, 2) result
# yields the two coordinate arrays expected by scatter().
plt.scatter(*pca_result.T, c='blue', alpha=0.5)
plt.title('PCA Results')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


In [None]:
# Partition the observations into three groups with K-Means (e.g. weather
# types). random_state pins the centroid initialization for reproducibility.
kmeans = KMeans(n_clusters=3, random_state=0)
clusters = kmeans.fit(data_cleaned[numerical_cols]).labels_
data_cleaned['Cluster'] = clusters

# Pairwise feature plots, colored by the assigned cluster label.
sns.pairplot(data=data_cleaned, hue='Cluster', palette='Set1')
plt.show()


In [None]:
# Anomaly detection using rolling Z-scores: a point is flagged when it lies
# more than 2 rolling standard deviations away from the rolling mean of the
# 10 most recent observations (current point included). The first 9 rows have
# no complete window, so their statistics are NaN and they are never flagged.
temperature_window = data_cleaned['Temperature'].rolling(window=10)
rolling_mean = temperature_window.mean()
rolling_std = temperature_window.std()
deviation = np.abs(data_cleaned['Temperature'] - rolling_mean)
data_cleaned['Anomaly'] = (deviation > 2 * rolling_std).astype(int)

# Plot the series and mark only the flagged points, so the "Anomalies" legend
# entry refers to actual anomalies rather than to every observation.
anomalies = data_cleaned[data_cleaned['Anomaly'] == 1]
plt.plot(data_cleaned.index, data_cleaned['Temperature'], label='Temperature')
plt.scatter(anomalies.index, anomalies['Temperature'], color='red', zorder=3, label='Anomalies')
plt.legend()
plt.show()
