In [1]:
# Average Timeseries

from datetime import datetime

# Sample input: wall-clock timestamps as strings (swap in real data here).
timestamps = [
    "2022-01-15 08:00:00",
    "2022-01-15 12:30:00",
    "2022-01-15 18:45:00",
]

# Parse every string into a naive datetime and express it as epoch seconds.
numeric_values = list(
    map(lambda ts: datetime.strptime(ts, "%Y-%m-%d %H:%M:%S").timestamp(), timestamps)
)

# Arithmetic mean of the epoch seconds.
average_numeric = sum(numeric_values) / len(numeric_values)

# Map the mean back to a datetime and render it in the same format as the input.
average_timestamp = datetime.fromtimestamp(average_numeric).strftime(
    "%Y-%m-%d %H:%M:%S"
)

In [None]:
# using numpy
import numpy as np

# Sample input: times of day as "HH:MM:SS" strings (swap in real data here).
timestamps = ["08:00:00", "12:30:00", "18:45:00"]

# Express each time of day as whole seconds since midnight; every string is
# split on ":" once and the first three fields are read as hours/minutes/seconds.
total_seconds = [
    int(fields[0]) * 3600 + int(fields[1]) * 60 + int(fields[2])
    for fields in (ts.split(":") for ts in timestamps)
]

# Mean number of seconds since midnight.
average_seconds = np.mean(total_seconds)

# Break the mean back down into hours / minutes / seconds (fractions truncated).
whole_hours, leftover = divmod(average_seconds, 3600)
whole_minutes, whole_secs = divmod(leftover, 60)
average_timestamp = "{:02}:{:02}:{:02}".format(
    int(whole_hours), int(whole_minutes), int(whole_secs)
)

print("Average Timestamp:", average_timestamp)

In [None]:
# Synthetic Data
import torch
from torch import nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torch.nn.init as init
import pandas as pd
import numpy as np
from torch.utils.data import Dataset


# Building block for the generator: one fully connected layer plus ReLU.
def FC_Layer_blockGen(input_dim, output_dim):
    """Return ``nn.Sequential(Linear(input_dim, output_dim), ReLU())``."""
    layers = [nn.Linear(input_dim, output_dim), nn.ReLU()]
    return nn.Sequential(*layers)
    
# Generator network
class Generator(nn.Module):
    """Fully connected generator.

    Maps a ``latent_dim``-wide input through hidden layers of width
    256, 512 and 512 (each followed by ReLU) to an ``output_dim``-wide
    output squashed by Tanh, so every output value lies in [-1, 1].
    """

    def __init__(self, latent_dim, output_dim):
        super().__init__()
        stack = []
        width_in = latent_dim
        # Hidden layers: Linear + ReLU, in the order 256 -> 512 -> 512.
        for width_out in (256, 512, 512):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            width_in = width_out
        # Output projection followed by Tanh.
        stack.append(nn.Linear(width_in, output_dim))
        stack.append(nn.Tanh())
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.model(x)
        
# Building block for the discriminator: fully connected layer, ReLU, dropout.
def FC_Layer_BlockDisc(input_dim, output_dim):
    """Return ``nn.Sequential(Linear(input_dim, output_dim), ReLU(), Dropout(0.4))``."""
    linear = nn.Linear(input_dim, output_dim)
    activation = nn.ReLU()
    dropout = nn.Dropout(0.4)
    return nn.Sequential(linear, activation, dropout)
    
# Discriminator network

class Discriminator(nn.Module):
    """Fully connected discriminator.

    Takes ``input_dim``-wide rows, passes them through hidden layers of
    width 512, 512 and 256 (each followed by ReLU and Dropout(0.4)) and
    ends in a single Sigmoid unit, so the output is one value in [0, 1]
    per row.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = []
        width_in = input_dim
        # Hidden layers: Linear + ReLU + Dropout, in the order 512 -> 512 -> 256.
        for width_out in (512, 512, 256):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            stack.append(nn.Dropout(0.4))
            width_in = width_out
        # Single-unit head with Sigmoid.
        stack.append(nn.Linear(width_in, 1))
        stack.append(nn.Sigmoid())
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.model(x)
        
        
# Training hyperparameters
batch_size = 128
num_epochs = 500
lr = 0.0002          # learning rate shared by both Adam optimizers below
num_features = 6     # width of the generator output and the discriminator input
latent_dim = 20      # width of the noise vector fed to the generator

# MODEL INITIALIZATION
# The generator input width is the latent dimension. The previous code passed
# `noise_dim`, a name that is never defined, and raised NameError.
generator = Generator(latent_dim, num_features)
discriminator = Discriminator(num_features)

# LOSS FUNCTION AND OPTIMIZERS
# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lr)
disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lr)

In [None]:
# IMPORTING DATA
file_path = 'SamplingData7.xlsx'
data = pd.read_excel(file_path)
X = data.values

# Min-max scale every column into [-1, 1], the range produced by the
# generator's final Tanh layer.
col_min = X.min(axis=0)
col_range = X.max(axis=0) - col_min
# A constant column has zero range; dividing by it would fill the column with
# NaN. Use a divisor of 1 instead so such a column maps to -1 everywhere.
col_range[col_range == 0] = 1
X_normalized = torch.FloatTensor((X - col_min) / col_range * 2 - 1)
real_data = X_normalized

# Creating a dataset

class MyDataset(Dataset):
    """Row-wise dataset over a DataFrame.

    Every item is a dict with an ``'input'`` and a ``'label'`` tensor; both
    hold the same row of ``dataframe``. Tensors are returned as float32 so
    they can be fed directly to ``nn.Linear`` layers, whose weights are
    float32 by default (the float64 tensors produced previously raised a
    dtype-mismatch error in the forward pass).
    """

    def __init__(self, dataframe):
        # Both arrays are independent float copies of the frame's values.
        self.data = dataframe.values.astype(float)
        self.labels = dataframe.values.astype(float)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'input': row, 'label': row}`` as float32 tensors."""
        sample = {
            'input': torch.tensor(self.data[idx], dtype=torch.float32),
            'label': torch.tensor(self.labels[idx], dtype=torch.float32)
        }
        return sample

# Create an instance of the dataset.
# Train on the min-max scaled values (real_data, in [-1, 1]) rather than the
# raw spreadsheet values: the generator ends in Tanh, so its samples lie in
# [-1, 1] and the discriminator has to see real rows on that same scale.
normalized_frame = pd.DataFrame(real_data.numpy(), columns=data.columns)
dataset = MyDataset(normalized_frame)

# Create DataLoader.
# drop_last=True discards a trailing partial batch, so every batch has
# exactly batch_size rows.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

def weights_init(m):
    """Initialise ``nn.Linear`` modules in place; leave every other module alone.

    Weights get Xavier-uniform values and the bias, when present, is set to 0.
    Intended to be passed to ``Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    init.xavier_uniform_(m.weight)
    if m.bias is not None:
        init.constant_(m.bias, 0)

# Either restore both networks from a saved checkpoint or start from freshly
# initialised weights.
pretrained = False
if not pretrained:
    # Module.apply visits every submodule and returns the module itself, so
    # the networks are re-initialised in place.
    for network in (generator, discriminator):
        network.apply(weights_init)
else:
    # The checkpoint is read as a dict with 'generator' and 'discriminator'
    # state dicts.
    pre_dict = torch.load('pretrained_model.pth')
    generator.load_state_dict(pre_dict['generator'])
    discriminator.load_state_dict(pre_dict['discriminator'])

In [None]:
model_save_freq = 100  # NOTE(review): defined but never read in this cell

latent_dim =20
for epoch in range(num_epochs):
    for batch in dataloader:
        # Make sure the batch is float32 to match the nn.Linear weights; a
        # float64 batch raises a dtype-mismatch error in the forward pass.
        real_data_batch = batch['input'].float()
        # Size the label and noise tensors from the batch actually received
        # instead of assuming it always holds batch_size rows.
        current_batch_size = real_data_batch.size(0)

        # Train discriminator on real data (smoothed "real" labels in [0.9, 1.0))
        real_labels = torch.FloatTensor(np.random.uniform(0.9, 1.0, (current_batch_size, 1)))
        disc_optimizer.zero_grad()
        output_real = discriminator(real_data_batch)
        loss_real = criterion(output_real, real_labels)
        loss_real.backward()

        # Train discriminator on generated data (smoothed "fake" labels in [0, 0.1))
        fake_labels = torch.FloatTensor(np.random.uniform(0, 0.1, (current_batch_size, 1)))
        noise = torch.FloatTensor(np.random.normal(0, 1, (current_batch_size, latent_dim)))
        generated_data = generator(noise)
        # detach() keeps this backward pass from reaching the generator.
        output_fake = discriminator(generated_data.detach())
        loss_fake = criterion(output_fake, fake_labels)
        loss_fake.backward()

        disc_optimizer.step()

        # Train generator: it is rewarded when the discriminator scores its
        # samples as real.
        valid_labels = torch.FloatTensor(np.random.uniform(0.9, 1.0, (current_batch_size, 1)))
        gen_optimizer.zero_grad()
        output_g = discriminator(generated_data)
        loss_g = criterion(output_g, valid_labels)
        loss_g.backward()
        gen_optimizer.step()

    # Print progress (losses are those of the last batch of the epoch)
    print(f"Epoch {epoch}, D Loss Real: {loss_real.item()}, D Loss Fake: {loss_fake.item()}, G Loss: {loss_g.item()}")

In [None]:
import seaborn as sns

# Generate synthetic data: one generated row per real row.
# The noise width must be the generator's input width, latent_dim; the
# previous code used `noise_dim`, which is never defined (NameError).
# no_grad() skips building an autograd graph, which is not needed for sampling.
with torch.no_grad():
    synthetic_data = generator(torch.FloatTensor(np.random.normal(0, 1, (real_data.shape[0], latent_dim))))

# Overlay the per-column histograms of synthetic and real data on a 2x3 grid.
fig, axs = plt.subplots(2, 3, figsize=(12, 8))
fig.suptitle('Real and Synthetic Data Distributions', fontsize=16)

# axs.flat walks the grid row by row, so panel k shows column k.
for col, ax in enumerate(axs.flat):
    synthetic_column = synthetic_data[:, col].detach().numpy()
    real_column = real_data[:, col].numpy()
    sns.histplot(synthetic_column, bins=50, alpha=0.5, label='Synthetic Data', ax=ax, color='blue')
    sns.histplot(real_column, bins=50, alpha=0.5, label='Real Data', ax=ax, color='orange')
    ax.set_title(f'Parameter {col + 1}', fontsize=12)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.legend()

# Leave room at the top for the figure title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


# Create a 2x3 grid of subplots
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Comparison of Real and Synthetic Data', fontsize=16)

# Define parameter names
param_names = ['Parameter 1', 'Parameter 2', 'Parameter 3', 'Parameter 4', 'Parameter 5', 'Parameter 6']

# Scatter plots: every panel plots the first column (x) against one column (y),
# for the real and the generated data.
for i in range(2):
    for j in range(3):
        param_index = i * 3 + j
        # x and y are passed by keyword: recent seaborn releases accept only
        # `data` positionally and raise TypeError for positional x/y.
        sns.scatterplot(x=real_data[:, 0].numpy(), y=real_data[:, param_index].numpy(), label='Real Data', alpha=0.5, ax=axs[i, j])
        sns.scatterplot(x=synthetic_data[:, 0].detach().numpy(), y=synthetic_data[:, param_index].detach().numpy(), label='Generated Data', alpha=0.5, ax=axs[i, j])
        axs[i, j].set_title(param_names[param_index], fontsize=12)
        # Label each axis with the column actually drawn on it (x is always
        # the first column); both series share the axes, so no "Real Data" prefix.
        axs[i, j].set_xlabel(param_names[0])
        axs[i, j].set_ylabel(param_names[param_index])
        axs[i, j].legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


* "I've been researching the complexity of generating synthetic data. While creating it from scratch is challenging, I've explored GAN methods, specifically using PyTorch. I plan to experiment with this approach, although its applicability to our dataset is yet to be determined."

* "Setting a threshold for isolation forest has been a challenge. Following Ken's suggestion, I've removed the contamination level and approached the code from different angles to address this difficulty."

* "Although we temporarily dropped the duration column from our dataset, I'm considering using the mean value of duration as a feature for isolation forest. This might reveal interesting insights, and I'll be working on implementing this."

* "I've recognized the importance of setting hypotheses and considering potential attacking scenarios for generating synthetic data. I've summarized some scenarios relevant to our dataset that we should be mindful of."

* "Regarding task prioritization, I'm feeling a bit lost about where to start. While I understand we need to address all discussed tasks, I would appreciate guidance on the order of priority. Any insights on what should be tackled first would be helpful."

merge 이해하기

Isolation Forest 직접 구현

합성데이터 직접 구현

https://towardsdatascience.com/generative-ai-synthetic-data-generation-with-gans-using-pytorch-2e4dde8a17dd

https://www.youtube.com/watch?v=Gg0gH3-Q4Wk&ab_channel=Unit8




### 01/19 Update

https://blog.eunsukim.me/posts/what-is-accuracy-recall-precision-and-f1-score 

https://www.youtube.com/watch?v=puVdwi5PjVA&ab_channel=%EA%B3%A0%EB%A0%A4%EB%8C%80%ED%95%99%EA%B5%90%EC%82%B0%EC%97%85%EA%B2%BD%EC%98%81%EA%B3%B5%ED%95%99%EB%B6%80DSBA%EC%97%B0%EA%B5%AC%EC%8B%A4
 

https://www.youtube.com/watch?v=VZWQfQHsGGY&ab_channel=%E2%80%8D%EA%B9%80%EC%84%B1%EB%B2%94%5B%EA%B5%90%EC%88%98%2F%EC%82%B0%EC%97%85%EA%B2%BD%EC%98%81%EA%B3%B5%ED%95%99%EB%B6%80%5D 



In [None]:
# Calculate metrics
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_score, recall_score, f1_score

# Step 1: Train the Isolation Forest Model
# A fixed random_state makes the fitted forest (and the metrics) reproducible.
iso_forest = IsolationForest(random_state=10)
iso_forest.fit(X)

# Step 2: Make Predictions
# IsolationForest.predict returns +1 for inliers and -1 for outliers. Recode
# to 1 = outlier, 0 = inlier so the binary metrics below score outlier
# detection as the positive class.
predictions = (iso_forest.predict(X) == -1).astype(int)

# Step 3: Calculate Metrics
# NOTE(review): `y_true` is not defined in the visible notebook. This assumes
# it holds 1 for anomalies and 0 for normal rows — confirm before relying on
# these numbers.
f1 = f1_score(y_true, predictions)
recall = recall_score(y_true, predictions)
precision = precision_score(y_true, predictions)

print(f'F1-score: {f1}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')

In [None]:
# optimal threshold
from sklearn.metrics import f1_score, precision_recall_curve

# NOTE(review): assumes `y_true` is defined upstream with 1 = anomaly — confirm.
# precision_recall_curve sweeps a threshold over a continuous score, so it is
# fed the forest's anomaly score rather than hard labels. score_samples is
# lower for more abnormal rows, hence the negation (higher = more anomalous).
anomaly_scores = -iso_forest.score_samples(X)
precision, recall, thresholds = precision_recall_curve(y_true, anomaly_scores)

# precision and recall have one more entry than thresholds (the final point
# has no threshold), so drop it before indexing thresholds. Where precision
# and recall are both 0 the F1 ratio is 0/0; report 0 there instead of NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
optimal_threshold = thresholds[np.argmax(f1_scores)]

print(f'Optimal Threshold (F1): {optimal_threshold}')


In [None]:
# precision and recall each carry one trailing entry that has no matching
# threshold (precision 1.0 / recall 0.0). It is excluded before taking the
# argmax; otherwise the index can point one past the end of `thresholds`.

# Optimal Threshold for Precision
optimal_threshold_precision = thresholds[np.argmax(precision[:-1])]
print(f'Optimal Threshold (Precision): {optimal_threshold_precision}')

# Optimal Threshold for Recall
optimal_threshold_recall = thresholds[np.argmax(recall[:-1])]
print(f'Optimal Threshold (Recall): {optimal_threshold_recall}')


In [None]:
# Update 2/15/2024

from pyod.models.iforest import IForest


# The keyword is `contamination`; the misspelt `contatimation` raised
# TypeError (unexpected keyword argument).
iforest = IForest(n_estimators=300, contamination=0.05, random_state=10)

iforest.fit(X)

# Outlier labels for the training data: 0 = inlier, 1 = outlier.
# Reading labels_ avoids fitting the model a second time via fit_predict.
labels = iforest.labels_

outliers = X[labels == 1]

print(outliers.shape)


In [None]:

# Outlier probabilities
# Alternative to choosing a contamination level: threshold the outlier probability.
iforest = IForest(random_state=10)
iforest.fit(X)

# One row of probabilities per sample
probs = iforest.predict_proba(X)

# Second column is used as the outlier probability
outlier_probs = probs[:, 1]

# Keep the rows whose outlier probability exceeds 70%
is_probable_outlier = outlier_probs > .7
outliers = X[is_probable_outlier]

print(len(outliers))

In [None]:
# KNN

from pyod.models.knn import KNN

# Configure KNN and fit it to `X`
knn_params = dict(contamination=0.005, n_neighbors=20, n_jobs=-1)
knn = KNN(**knn_params)
knn.fit(X)

# Boolean index: True where the fitted model labelled the row 1
is_outlier = (knn.labels_ == 1)

# Rows selected by that index
outliers = X[is_outlier]

print(len(outliers))

In [None]:

# KNN detector with 20 neighbours, fitted to `X`
knn = KNN(n_jobs=-1, n_neighbors=20)
knn.fit(X)

# One row of probabilities per sample
probs = knn.predict_proba(X)

# Boolean mask: second-column probability above 55%
outlier_probability = probs[:, 1]
is_outlier = outlier_probability > .55
# Use the boolean mask to filter the outliers
outliers = X[is_outlier]

print(len(outliers))


In [None]:
def evaluate_outlier_classifier(model, data, threshold=.75):
    """Fit ``model`` on ``data`` and return the rows kept as inliers.

    The second column of ``model.predict_proba(data)`` is treated as the
    outlier probability; rows whose value is at most ``threshold`` are kept.
    """
    model.fit(data)

    outlier_probability = model.predict_proba(data)[:, 1]
    keep_mask = outlier_probability <= threshold
    return data[keep_mask]

def evaluate_regressor(inliers, target="weightkg"):
    """Fit a linear regression on ``inliers`` and return its test RMSE.

    Parameters
    ----------
    inliers : DataFrame
        Rows to model; must contain the ``target`` column.
    target : str, default "weightkg"
        Name of the column to predict; every other column is a feature.

    Returns
    -------
    RMSE on a held-out 20% split (fixed ``random_state=10``), rounded to
    3 decimals.
    """
    # Imported locally: these names are not imported in the visible part of
    # the notebook, so the function raised NameError when called.
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    features, y = inliers.drop(target, axis=1), inliers[[target]]
    X_train, X_test, y_train, y_test = train_test_split(features, y, random_state=10, train_size=0.8)

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    preds = regressor.predict(X_test)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    rmse = mean_squared_error(y_test, preds) ** 0.5

    return round(rmse, 3)

In [None]:
# Tuning n_neighbors
# NOTE(review): `females_transformed` is not defined in the visible notebook —
# confirm where it comes from.

# Candidate values for n_neighbors
n_neighbors = [5, 10, 20]
scores = {}

for k in n_neighbors:
    # KNN detector using the current k
    knn = KNN(n_jobs=-1, n_neighbors=k)

    # Rows kept as inliers at a 50% probability threshold
    inliers = evaluate_outlier_classifier(model=knn, data=females_transformed, threshold=.50)

    # RMSE of the regressor fitted on those inliers, keyed by k
    scores[k] = evaluate_regressor(inliers)

print(scores)

In [None]:
# Tuning aggregation method
'''
Once the optimal number of neighbors is found, it's time to tune the distance aggregating method. If n_neighbors is 10, each datapoint will have ten distance measurements to its nearest neighbors. KNN uses three methods to aggregate those distances: largest, mean, and median.
'''

# `product` was used without being imported anywhere in the visible notebook,
# which raised NameError.
from itertools import product

n_neighbors = [5, 20]
methods = ['largest', 'mean', 'median']
scores = dict()

# Try every (n_neighbors, aggregation method) combination.
for k, m in product(n_neighbors,methods):
    # Create a KNN instance
    knn = KNN(n_neighbors=k,method=m,n_jobs=-1)
    
    # Find the inliers with the current KNN
    # NOTE(review): `females_transformed` is not defined in the visible
    # notebook — confirm where it comes from.
    inliers = evaluate_outlier_classifier(knn,females_transformed, .5)

    # Calculate and store RMSE into scores
    scores[(k, m)] = evaluate_regressor(inliers)
    
print(scores)

In [None]:
#LOF
# Import LOF from its relevant module
from pyod.models.lof import LOF

# Configure LOF and fit it to X_transformed
lof_params = dict(contamination=.003, n_jobs=-1)
lof = LOF(**lof_params)
lof.fit(X_transformed)

# Boolean index: True where the fitted model labelled the row 1
is_outlier = (lof.labels_ == 1)

# Rows selected by that index
outliers = X_transformed[is_outlier]

print(len(outliers))

In [None]:
# LOF detector with 20 neighbours, fitted to females_transformed
# NOTE(review): `females_transformed` is not defined in the visible notebook —
# confirm where it comes from.
lof = LOF(n_neighbors=20)
lof.fit(females_transformed)

# One row of probabilities per sample
probs = lof.predict_proba(females_transformed)

# Boolean mask: second-column probability above 50%
outlier_probability = probs[:, 1]
is_outlier = outlier_probability > .5

# Rows selected by the mask
outliers = females_transformed[is_outlier]

print(len(outliers))

In [None]:
# Data Scaling

from sklearn.preprocessing import StandardScaler

# Initialize a StandardScaler
ss = StandardScaler()

# Extract feature and target arrays.
# The target must be read BEFORE it is dropped from X; the previous order
# dropped it first and then raised KeyError on X[['target']].
# NOTE(review): this needs X to be a DataFrame with a 'target' column, whereas
# an earlier cell binds X to a NumPy array (data.values) — confirm the input.
y = X[['target']]
X = X.drop('target',axis=1)

# Fit/transform X (returns a NumPy array)
X_transformed = ss.fit_transform(X)

# Write the scaled values back into X so the column names are preserved;
# reusing X_transformed avoids fitting the scaler a second time.
X.loc[:,:] = X_transformed

In [None]:
# QuantileTransformer
'''
Standardization is prone to the same pitfalls as z-scores. Both use the mean and standard deviation in their calculations, which makes them highly sensitive to extreme values.

To get around this problem, you should use QuantileTransformer which uses quantiles. Quantiles of a distribution stay the same regardless of the magnitude of outliers.

You should use StandardScaler when the data is normally distributed (which can be checked with a histogram). For other distributions, QuantileTransformer is a better choice.

'''
from sklearn.preprocessing import QuantileTransformer

# Instantiate an instance that casts to normal
qt = QuantileTransformer(output_distribution='normal')

# Fit and transform the feature array
X.loc[:,:] = qt.fit_transform(X)

# Plot a histogram of one transformed column.
# The scaling cell drops 'target' from X, so X['target'] raised KeyError here.
# Fall back to the first remaining column when 'target' is absent.
# NOTE(review): confirm which column this histogram is meant to show.
plot_column = 'target' if 'target' in X.columns else X.columns[0]
plt.hist(X[plot_column], color='red')

plt.xlabel("Target" if plot_column == 'target' else str(plot_column))
plt.show()

In [None]:
#Handling Timeseries

# NOTE(review): `apple` must already be loaded with a 'Date' column here; the
# only visible load (in a later cell) makes 'Date' the index — confirm order.
# The second, verbatim copy of this code was removed: it recomputed the same
# columns and printed the same frame twice.

# Convert the Date column to DateTime
apple['Date'] = pd.to_datetime(apple['Date'])

# Create a column for the day of the week
apple['day_of_week'] = apple['Date'].dt.day_of_week

# Create a column for the month
apple['month'] = apple['Date'].dt.month
# Create a column for the day of the month
apple['day_of_month'] = apple['Date'].dt.day

print(apple[['day_of_week', 'month', 'day_of_month']])

In [None]:
# Read the Apple stock CSV, parsing 'Date' and using it as the index
apple = pd.read_csv('aapl.csv', parse_dates=['Date'], index_col='Date')

# Derive three calendar features from the index, added in this order
calendar_features = {
    'day_of_week': apple.index.day_of_week,
    'month': apple.index.month,
    'day_of_month': apple.index.day,
}
for column_name, column_values in calendar_features.items():
    apple[column_name] = column_values

In [None]:
#Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the 2010-2012 slice of Volume with a period of 365 observations
volume_2010_2012 = apple['Volume']["2010": "2012"]
results = seasonal_decompose(volume_2010_2012, period=365)

# Plot only the seasonal component
seasonal_component = results.seasonal
seasonal_component.plot(color="red", figsize=(12, 4))

plt.show()


In [None]:
# Decompose the full Volume series with a period of 365 observations
results = seasonal_decompose(apple['Volume'], period=365)

# Residual component as a single-column 2-D array (one row per observation)
residuals = results.resid.values.reshape(-1, 1)

In [None]:
# Aggregating, thresholding and probs

# NOTE(review): `probability_scores` is not defined in the visible notebook —
# confirm where it comes from.

# Average the scores within each row (axis=1)
mean_probs = np.mean(probability_scores, axis=1)

# Boolean mask: mean above the 75% threshold
is_outlier = (mean_probs > .75)

# Rows of apple selected by the mask
outliers = apple[is_outlier]

print(len(outliers))