# Pandas - Aggregation, Down-sampling

1. Mean and std. dev. of a algo. group 
2. Avg. Precision, Recall metrics - across an algo
3. Understand 'groupby' object type
4. Write out summary metrics to a .csv

In [None]:
import numpy as np
import pandas as pd

results_file = 'results.csv'
# Import Data
df = pd.read_csv('V_6_2_T1__test_results.csv')
print('Test file imported. Records: ', len(df.index),'\n')
df.columns

### Grouping by Algorithm and computing Precision -- with its mean and std.dev

In [None]:
metrics_pr = df.groupby(['Algorithm']).agg({'Precision' :['mean','std']})
metrics_pr

### Group by Algorithm - compute multiple metrics and their mean, std.dev
- Additionally round off to 3 dec places

In [None]:
metrics = df.groupby(['Algorithm']).agg({'Precision': ['mean','std'], 
                                         'Recall': ['mean','std'],
                                         'F1-Beta': ['mean','std']})
metrics.round(3)

### metrics is a Pandas Data-frame!
- Columns are multi-value tuples
- e.g. (Recall, std): stands for column "std.dev of Recall metric"
- For a particular group-by element i.e. algo., index with it after specifying the column
- ```metrics[('Precision',  'std')]['DQN']```

### Levels of information - from multiple metrics
- ```metrics['Precision']```
- ```metrics['Precision']['mean']```
- ```metrics['Precision']['mean']['DQN']```

In [None]:
metrics.columns

#### Print Precision for all algos

In [None]:
metrics['Precision']

#### Print avg. of Precision for all algos

In [None]:
metrics['Precision']['mean']

#### Print mean of Precision for DQN algo. only
- Round to 3 decimals

In [None]:
round(metrics['Precision']['mean']['DQN'], 3)

## Saving as a report to a .csv

In [None]:
from pathlib import Path  
results_file = Path('pd_reports/report.csv')  
results_file.parent.mkdir(parents=True, exist_ok=True)  

In [None]:
metrics = metrics.round(3)
metrics.to_csv(results_file)

#### Read the report and visualize

In [None]:
df_results = pd.read_csv(results_file)
df_results

## Utility functions 

In [None]:
def compute_metrics(df):
    metrics = df.groupby(['Algorithm']).agg({'Wtd_Precision': ['mean','std'], 'Wtd_Recall': ['mean','std'], 'F1_Beta': ['mean','std'],
                                             'Normal_cases': ['mean'], 'Normal_error': ['mean','std'],
                                             'Replace_cases': ['mean'], 'Replace_error': ['mean','std'],
                                             'Overall_error': ['mean','std']})
    return(metrics)

In [None]:
def write_metrics_report(metrics, report_file, round_decimals=8):
    from pathlib import Path 
    report_file = Path(report_file)  
    report_file.parent.mkdir(parents=True, exist_ok=True)
    metrics = metrics.round(round_decimals)
    metrics.to_csv(results_file)


In [None]:
import numpy as np
import pandas as pd

results_file = 'pd_reports\c_results.csv'
df = pd.read_csv('V_6_2_T1__test_results.csv')
print('Test file imported. Records: ', len(df.index),'\n')

M = compute_metrics(df)
write_metrics_report(M, results_file, 3)

In [None]:
create_column_headings = write_metrics_report

## Down-sampling a large or hi-resolution data-set
- Two techniques to downsample .iloc and index arithmetic based

In [None]:
def downsample(file, sample_rate):
    import pandas as pd
    df = pd.read_csv(file)
    print('Data file imported. Records: ', ,'\n')
    print(f'- Input data records: {len(df.index)}.\n- Sampling rate: {sample_rate}\n- Expected rows {round(len(df.index)/sample_rate)}')
    df_downsampled = df.iloc[::sample_rate, :]
    print(f'- Down-sampled to {len(df_downsampled.index)} rows.')
    return(df_downsampled)


In [49]:
import pandas as pd
import matplotlib.pyplot as plt

DATA_FILE = 'pd_reports\PHM_C6_Expt_Data.csv'
SAMPLE_EVERY = 100

df = pd.read_csv(DATA_FILE)
print('Test file imported. Records: ', len(df.index),'\n')
print(f'Sampling rate: {SAMPLE_EVERY}: Expected rows {round(len(df.index)/SAMPLE_EVERY)}')

Test file imported. Records:  34674 

Sampling rate: 100: Expected rows 347


In [51]:
df_downsampled_2 =  df[df.index % SAMPLE_EVERY == 0]  # Selects every 3rd raw starting from 0

In [52]:
df_downsampled_1['index'] = [n for n in range(len(df_downsampled_1.index))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_downsampled_1['index'] = [n for n in range(len(df_downsampled_1.index))]


In [46]:
print('Technique 1 - Records: ', len(df_downsampled_1.index))
print('Technique 2 - Records: ', len(df_downsampled_2.index))

Technique 1 - Records:  347
Technique 2 - Records:  347


In [53]:
df_downsampled_1.to_csv('PHM_C6_DS.csv', index=False)