# Storage Metrics Data Analysis

This notebook analyzes the storage metrics data generated from October 1, 2024, to March 14, 2025. We'll explore patterns, trends, and insights in the data through various visualizations and analyses.

## 1. Data Loading and Initial Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Set plot styling
# NOTE(review): sns.set() applies seaborn's own theme after the ggplot style
# has been selected, so it may override parts of it — confirm the order is intended.
plt.style.use('ggplot')
sns.set(font_scale=1.2)
plt.rcParams['figure.figsize'] = [12, 6]

# Load the data
file_path = 'storage_metrics_history.csv'
df = pd.read_csv(file_path)

# Convert timestamp to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Put the rows in chronological order. Later cells take the `.last()` row of
# each day/month/directory group and run a per-directory `cumsum()`, both of
# which depend on row order. A stable sort keeps the file's original order for
# rows that share a timestamp, so an already-sorted file is left unchanged.
df = df.sort_values('timestamp', kind='stable').reset_index(drop=True)

# Create date and hour columns for time-based analysis
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.day_name()
df['month'] = df['timestamp'].dt.month_name()

# Display basic information
print(f"Data spans from {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Total records: {len(df)}")
print(f"Directories: {', '.join(df['directory'].unique())}")

# Show first few rows
df.head()

In [None]:
# Descriptive statistics for the loaded frame (pandas' default describe()).
summary_stats = df.describe()
summary_stats

## 2. Storage Growth Analysis

In [None]:
# Daily snapshot of space usage: for every (date, directory) pair keep the
# last recorded `current_space_gb`, then pivot directories into columns.
daily_space = df.groupby(['date', 'directory'])['current_space_gb'].last().unstack()

# Plot storage growth over time.
# The axes are created explicitly and handed to pandas: DataFrame.plot()
# without `ax` opens its own figure, which would leave a blank 14x7 figure
# behind and draw the chart at the default size instead.
fig, ax = plt.subplots(figsize=(14, 7))
daily_space.plot(ax=ax)
ax.set_title('Storage Usage Growth Over Time', fontsize=16)
ax.set_ylabel('Storage Used (GB)', fontsize=14)
ax.set_xlabel('Date', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Calculate month-over-month growth rates.
# `year_month` is stored on df because the export cell groups on it again.
df['year_month'] = df['timestamp'].dt.to_period('M')

# Last recorded space value of each month, one column per directory
monthly_space = df.groupby(['year_month', 'directory'])['current_space_gb'].last().unstack()

# Percentage change versus the previous month (the first month has no
# predecessor, so its row is NaN and shows up as an empty slot in the chart)
monthly_growth = monthly_space.pct_change() * 100

# Display growth rates.
# The axes are passed to pandas explicitly; DataFrame.plot() without `ax`
# would open a second figure and leave the 14x7 one blank.
fig, ax = plt.subplots(figsize=(14, 7))
monthly_growth.plot(kind='bar', ax=ax)
ax.set_title('Month-over-Month Storage Growth Rate (%)', fontsize=16)
ax.set_ylabel('Growth Rate (%)', fontsize=14)
ax.set_xlabel('Month', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 3. File Operation Analysis

In [None]:
# File operations per directory: daily totals smoothed with a 7-day window.

# Sum the three operation counters for every (date, directory) pair
daily_ops = (
    df.groupby(['date', 'directory'])[['files_added', 'files_deleted', 'files_modified']]
    .sum()
    .reset_index()
)

# Total operations = added + deleted + modified
daily_ops = daily_ops.assign(
    total_operations=lambda ops: ops['files_added'] + ops['files_deleted'] + ops['files_modified']
)

# 7-day rolling mean computed independently within each directory
# (the first six days of every directory are NaN)
rolling_ops = (
    daily_ops.set_index('date')
    .groupby('directory')[['total_operations']]
    .rolling(7)
    .mean()
    .reset_index()
)

# One line per directory
fig, ax = plt.subplots(figsize=(14, 7))
for dir_name in daily_ops['directory'].unique():
    smoothed = rolling_ops.loc[rolling_ops['directory'] == dir_name]
    ax.plot(smoothed['date'], smoothed['total_operations'], label=dir_name)

ax.set_title('7-Day Rolling Average of Daily File Operations', fontsize=16)
ax.set_ylabel('Operations Count (7-day avg)', fontsize=14)
ax.set_xlabel('Date', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Stacked bar chart: total added / deleted / modified counts per directory
ops_by_dir = df.groupby('directory')[['files_added', 'files_deleted', 'files_modified']].sum()

# pandas creates the figure here (figsize is passed to it) and returns the axes
ax = ops_by_dir.plot.bar(stacked=True, figsize=(12, 6))
ax.set_title('Total File Operations by Directory', fontsize=16)
ax.set_ylabel('Number of Operations', fontsize=14)
ax.set_xlabel('Directory', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Operation Type', fontsize=12)
ax.figure.tight_layout()
plt.show()

## 4. Temporal Patterns Analysis

In [None]:
# Operations by hour of day, summed over the whole period per directory
hourly_ops = (
    df.groupby(['hour', 'directory'])[['files_added', 'files_deleted', 'files_modified']]
    .sum()
    .reset_index()
)
hourly_ops = hourly_ops.assign(
    total_operations=lambda ops: ops['files_added'] + ops['files_deleted'] + ops['files_modified']
)

# One marked line per directory across the 24 hours
fig, ax = plt.subplots(figsize=(14, 8))
for dir_name in hourly_ops['directory'].unique():
    by_hour = hourly_ops.loc[hourly_ops['directory'] == dir_name]
    ax.plot(by_hour['hour'], by_hour['total_operations'], marker='o', label=dir_name)

ax.set_title('File Operations by Hour of Day', fontsize=16)
ax.set_ylabel('Total Operations', fontsize=14)
ax.set_xlabel('Hour of Day', fontsize=14)
ax.set_xticks(range(0, 24))
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
fig.tight_layout()
plt.show()

In [None]:
# Analyze operations by day of week.
# An ordered categorical makes the grouped rows (and therefore the heatmap
# rows) come out Monday..Sunday instead of alphabetically.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df['day_of_week'] = pd.Categorical(df['day_of_week'], categories=days_order, ordered=True)

# `observed=False` is spelled out because the grouping key is categorical:
# it keeps a row for every weekday even if one has no records, and it does not
# depend on the pandas default, which newer releases warn is going to change.
day_ops = df.groupby(['day_of_week', 'directory'], observed=False).agg({
    'files_added': 'sum',
    'files_deleted': 'sum',
    'files_modified': 'sum'
}).reset_index()

day_ops['total_operations'] = day_ops['files_added'] + day_ops['files_deleted'] + day_ops['files_modified']

# Weekday x directory matrix of total operations
day_ops_pivot = day_ops.pivot(index='day_of_week', columns='directory', values='total_operations')

# Heatmap of operations by day of week and directory
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(day_ops_pivot, annot=True, fmt=",.0f", cmap="YlGnBu", ax=ax)
ax.set_title('File Operations by Day of Week and Directory', fontsize=16)
ax.set_ylabel('Day of Week', fontsize=14)
ax.set_xlabel('Directory', fontsize=14)
fig.tight_layout()
plt.show()

## 5. Storage Efficiency Analysis

In [None]:
# Storage per file: latest recorded values of each directory, space / file count
latest_per_dir = df.groupby('directory').last()
latest_stats = latest_per_dir.reset_index()
latest_stats['GB_per_file'] = latest_stats['current_space_gb'].div(latest_stats['total_files'])

# Bar chart of the average file size per directory
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=latest_stats, x='directory', y='GB_per_file', ax=ax)
ax.set_title('Average File Size by Directory (GB/file)', fontsize=16)
ax.set_ylabel('Average Size (GB/file)', fontsize=14)
ax.set_xlabel('Directory', fontsize=14)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Net storage change per record (added minus deleted) and its running total
# within each directory, in row order.
df['net_size_change_gb'] = df['size_added_gb'].sub(df['size_deleted_gb'])
df['cumulative_change'] = df.groupby('directory')['net_size_change_gb'].cumsum()

# Plot the running total per directory, one point per calendar day
fig, ax = plt.subplots(figsize=(14, 7))
for dir_name in df['directory'].unique():
    dir_rows = df.loc[df['directory'] == dir_name]
    # Keep only the last cumulative value of each day for a smoother line
    daily_last = dir_rows.set_index('timestamp')['cumulative_change'].resample('D').last()
    ax.plot(daily_last.index, daily_last, label=dir_name)

ax.set_title('Cumulative Net Storage Change Over Time', fontsize=16)
ax.set_ylabel('Cumulative Net Change (GB)', fontsize=14)
ax.set_xlabel('Date', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
fig.tight_layout()
plt.show()

## 6. Directory Comparison and Anomaly Detection

In [None]:
# Daily totals per directory and detection of unusually busy days.

# A day is flagged when its operation count exceeds
# mean + OUTLIER_STD_MULTIPLIER * std of that directory's daily counts.
OUTLIER_STD_MULTIPLIER = 2

daily_total = df.groupby(['date', 'directory']).agg({
    'files_added': 'sum',
    'files_deleted': 'sum',
    'files_modified': 'sum',
    'size_added_gb': 'sum',
    'size_deleted_gb': 'sum'
}).reset_index()

daily_total['total_files_ops'] = daily_total['files_added'] + daily_total['files_deleted'] + daily_total['files_modified']
daily_total['total_size_ops'] = daily_total['size_added_gb'] + daily_total['size_deleted_gb']

# Identify outlier days. Only the high side is checked: unusually quiet days
# are not reported. Directories without any flagged day get no dict entry.
outliers = {}
for directory in daily_total['directory'].unique():
    dir_data = daily_total[daily_total['directory'] == directory]
    ops = dir_data['total_files_ops']
    threshold = ops.mean() + OUTLIER_STD_MULTIPLIER * ops.std()

    outlier_days = dir_data[ops > threshold]
    if not outlier_days.empty:
        outliers[directory] = outlier_days[['date', 'total_files_ops']]

# Display outliers
for directory, outlier_data in outliers.items():
    print(f"Outlier days for {directory}:")
    print(outlier_data)
    print("---")

# Plot daily operations with outliers highlighted
fig, ax = plt.subplots(figsize=(14, 8))
for directory in daily_total['directory'].unique():
    dir_data = daily_total[daily_total['directory'] == directory]
    ax.plot(dir_data['date'], dir_data['total_files_ops'], alpha=0.7, label=directory)

    # Mark all of this directory's outliers with one vectorized scatter call
    # instead of one call per row.
    if directory in outliers:
        outlier_data = outliers[directory]
        ax.scatter(outlier_data['date'], outlier_data['total_files_ops'], color='red', s=100, zorder=5)

ax.set_title('Daily File Operations with Outliers Highlighted (Red)', fontsize=16)
ax.set_ylabel('Total File Operations', fontsize=14)
ax.set_xlabel('Date', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(title='Directory', fontsize=12)
fig.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Correlation between the daily operation metrics, one heatmap per directory.
# Works on the per-day, per-directory totals in `daily_total`.
corr_data = daily_total[['directory', 'files_added', 'files_deleted', 'files_modified',
                         'size_added_gb', 'size_deleted_gb']]

for dir_name in corr_data['directory'].unique():
    metrics = corr_data.loc[corr_data['directory'] == dir_name].drop(columns='directory')
    corr_matrix = metrics.corr()

    # Hide the upper triangle (diagonal included): the matrix is symmetric
    upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, mask=upper_mask, annot=True, fmt='.2f',
                cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
    ax.set_title(f'Correlation Matrix for {dir_name}', fontsize=16)
    fig.tight_layout()
    plt.show()

## 8. Forecasting and Future Trends

In [None]:
# Use a simple linear extrapolation to forecast future storage needs.
# One LinearRegression (storage vs. day number) is fitted per directory and
# reused for both the 30-day forecast plot and the 1TB projection.
from sklearn.linear_model import LinearRegression

FORECAST_HORIZON_DAYS = 30  # days to extrapolate past the last observation
ONE_TB_GB = 1000            # capacity threshold used in the projection below

# Move the date index into a column. Guarded so that re-running this cell does
# not call reset_index() again and pile extra index columns onto daily_space.
if 'date' not in daily_space.columns:
    daily_space = daily_space.reset_index()

# Convert date to numeric for regression: days since the first observed date.
# (A straight-line fit gives the same slope and predictions for any origin.)
observed_dates = pd.to_datetime(daily_space['date'])
origin_date = observed_dates.min()
daily_space['date_num'] = (observed_dates - origin_date).dt.days

# Dates to forecast: the FORECAST_HORIZON_DAYS days after the last observation
last_date = observed_dates.max()
future_dates = [last_date + timedelta(days=offset) for offset in range(1, FORECAST_HORIZON_DAYS + 1)]
future_X = np.array([(day - origin_date).days for day in future_dates]).reshape(-1, 1)

fig, ax = plt.subplots(figsize=(14, 7))

# directory -> (fitted slope in GB/day, last observed size in GB)
trend_by_directory = {}

for directory in df['directory'].unique():
    # Observations for this directory (days without a value are dropped)
    dir_data = daily_space[['date', 'date_num', directory]].dropna()
    if dir_data.empty:
        # Nothing to fit; LinearRegression would raise on an empty sample
        continue

    # Fit linear regression model
    model = LinearRegression()
    X = dir_data['date_num'].values.reshape(-1, 1)
    y = dir_data[directory].values
    model.fit(X, y)
    trend_by_directory[directory] = (model.coef_[0], dir_data[directory].iloc[-1])

    # Plot actual data
    ax.plot(pd.to_datetime(dir_data['date']), dir_data[directory], label=f'{directory} (Actual)')

    # Plot predictions
    ax.plot(future_dates, model.predict(future_X), '--', label=f'{directory} (Forecast)')

ax.set_title(f'Storage Growth Forecast (Next {FORECAST_HORIZON_DAYS} Days)', fontsize=16)
ax.set_ylabel('Storage Used (GB)', fontsize=14)
ax.set_xlabel('Date', fontsize=14)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=10)
fig.tight_layout()
plt.show()

# Calculate when directories might reach capacity
print("Projected days until directories reach 1TB (1,000 GB):")
for directory, (slope, current_size) in trend_by_directory.items():
    if current_size >= ONE_TB_GB:
        # Without this branch the formula below yields negative days and a past date
        print(f"{directory}: already at or above 1TB ({current_size:,.0f} GB)")
    elif slope > 0:  # Only if growth is positive
        days_to_1tb = (ONE_TB_GB - current_size) / slope
        try:
            # A near-zero slope can overflow int(), timedelta or Timestamp arithmetic
            target_date = last_date + timedelta(days=int(days_to_1tb))
        except (OverflowError, ValueError):
            print(f"{directory}: growth too slow to project a date")
        else:
            print(f"{directory}: {int(days_to_1tb)} days (around {target_date.strftime('%Y-%m-%d')})")
    else:
        print(f"{directory}: No growth or negative growth trend detected")

## 9. Key Insights and Recommendations

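Based on the analysis above, here are the key insights and recommendations. The bracketed items (e.g. `[directory]`, `[date]`, `[time/day]`) are placeholders to be filled in from the outputs of the cells above once the notebook has been run: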

### Storage Growth Patterns
- The directories show different growth rates, with [directory] growing fastest
- We can expect [directory] to reach 1TB capacity first, around [date]

### Usage Patterns
- Peak activity occurs during [time/day], suggesting this is when most users are active
- [Directory] shows the highest activity level and might need additional resources
- Weekend usage is significantly [higher/lower] than weekday usage

### File Operations
- [Directory] has the highest file creation rate
- [Directory] has the highest file modification rate
- [Directory] has the highest deletion rate

### Recommendations
1. Consider increasing storage capacity for [directory] by [date]
2. Implement automated cleanup policies for [directory] to manage growth
3. Schedule maintenance and backups during low-activity periods ([time/day])
4. Monitor [directory] for unusual activity patterns based on the identified outliers
5. Consider implementing tiered storage solutions for [directory] to optimize costs

### Further Analysis
- Deeper investigation into file types stored in each directory
- User-specific analysis to identify heavy users
- Cost optimization analysis for storage utilization

## 10. Exporting Insights

In [None]:
# Export key metrics to CSV for reporting.
# Monthly summary by directory: operation counters and sizes are summed over
# the month, while the space/file totals take the month's last recorded value.
# Relies on the `year_month` column added in the growth-rate cell.
monthly_summary = (
    df.groupby(['year_month', 'directory'])
    .agg(
        files_added=('files_added', 'sum'),
        files_deleted=('files_deleted', 'sum'),
        files_modified=('files_modified', 'sum'),
        size_added_gb=('size_added_gb', 'sum'),
        size_deleted_gb=('size_deleted_gb', 'sum'),
        size_modified_gb=('size_modified_gb', 'sum'),
        current_space_gb=('current_space_gb', 'last'),
        total_files=('total_files', 'last'),
    )
    .reset_index()
)

# Net growth = added minus deleted, for both size and file count
monthly_summary = monthly_summary.assign(
    net_size_growth_gb=lambda s: s['size_added_gb'] - s['size_deleted_gb'],
    net_files_growth=lambda s: s['files_added'] - s['files_deleted'],
)

# Write the table next to the notebook
monthly_summary.to_csv('storage_monthly_summary.csv', index=False)
print("Monthly summary exported to 'storage_monthly_summary.csv'")

# Display the summary table
monthly_summary.head(10)