# Netflow Analysis Tool

This notebook analyses the NetFlow capture files in a specified folder and reports the top 10 talkers, i.e. the source IP addresses ranked by their total byte count (`ibyt`).

## Requirements
1. WSL 2.0 installed with a Linux Distro preconfigured.
2. `nfdump` installed on the Linux Distro.
3. `pandas` installed on the system running the notebook.
   
## Workflow
- Ensure the folder contains decompressed NetFlow files whose names start with `nfcapd` (other files are ignored).
- Update the notebook to point to the correct folder.

In [1]:
#Step 1: Set up
import pandas as pd
import os
import subprocess
import io

# Path to the directory containing NetFlow files.
# Step 3 lists this directory with os.listdir and processes every entry whose
# name starts with 'nfcapd'. A relative path such as this one is resolved
# against the current working directory of the notebook process.
netflow_directory = './Downloads/netflow/'

In [2]:
def convert_path_to_wsl(windows_path):
    """Translate a Windows-side path into its absolute WSL equivalent.

    Runs ``wslpath -a`` through the ``wsl`` launcher and returns its output
    with surrounding whitespace removed.

    Args:
        windows_path: Path as seen from Windows (relative or absolute).

    Returns:
        The path string printed by ``wslpath -a``.

    Raises:
        subprocess.CalledProcessError: If the ``wsl`` command exits with a
            non-zero status.
    """
    # Backslashes are liable to be consumed as escape characters by the Linux
    # shell that receives the command from wsl.exe, which mangles paths such
    # as C:\Users\me. Forward slashes avoid the problem and are also accepted
    # as Windows path separators.
    normalized_path = str(windows_path).replace('\\', '/')
    raw_output = subprocess.check_output(['wsl', 'wslpath', '-a', normalized_path])
    return raw_output.decode('utf-8').strip()

In [3]:
# Step 2: Function to process NetFlow files and extract traffic data
def process_netflow_file(file_path):
    """Load one NetFlow capture file into a pandas DataFrame.

    The file path is translated with ``convert_path_to_wsl`` and the file is
    dumped as CSV by running ``nfdump -r <path> -o csv`` through ``wsl``.

    Args:
        file_path: Windows-side path of the capture file.

    Returns:
        A DataFrame parsed from the CSV text, with leading and trailing
        whitespace stripped from the column names.

    Raises:
        subprocess.CalledProcessError: If ``wslpath`` or ``nfdump`` exits with
            a non-zero status.
    """
    nfdump_command = ['wsl', 'nfdump', '-r', convert_path_to_wsl(file_path), '-o', 'csv']
    raw_csv = subprocess.check_output(nfdump_command)
    csv_buffer = io.StringIO(raw_csv.decode('utf-8'))

    # The first line of the output is discarded before the header is parsed.
    flows = pd.read_csv(csv_buffer, sep=',', skiprows=1, engine='python')

    # Column names may carry padding; normalise them so lookups by name work.
    flows.columns = flows.columns.str.strip()
    return flows

In [None]:
# Step 3: Process all NetFlow files in the directory
all_data = []

for filename in os.listdir(netflow_directory):
    # Only entries named like capture files ('nfcapd...') are of interest.
    if not filename.startswith('nfcapd'):
        continue

    file_path = os.path.join(netflow_directory, filename)
    try:
        data = process_netflow_file(file_path)
    except Exception as e:
        # One unreadable file must not abort the whole run: report and move on.
        print(f"Error processing file {filename}: {e}")
    else:
        all_data.append(data)

In [7]:
# Combine all data into a single DataFrame
if all_data:
    # Keep only the columns that every file provides. The columns are taken in
    # the order of the first DataFrame rather than from a set, because set
    # iteration order for strings can differ from one interpreter run to the
    # next, which would shuffle the combined table's columns.
    common_columns = [
        column for column in all_data[0].columns
        if all(column in df.columns for df in all_data[1:])
    ]
    # Standardize all DataFrames to have the same columns
    all_data = [df[common_columns] for df in all_data]

    # Stack the per-file tables, renumbering the rows from zero
    combined_data = pd.concat(all_data, ignore_index=True)
else:
    # No file was loaded: fall back to an empty table so the name is defined
    combined_data = pd.DataFrame()

In [8]:
# Print column names to identify the exact names
# print(combined_data.columns) # debugging step

In [None]:
# Step 4: Analyze traffic data
# Assumes the nfdump CSV columns are named 'sa' (source address) and
# 'ibyt' (in bytes); check combined_data.columns if a KeyError is raised.
if combined_data.empty:
    # Nothing was loaded (combined_data may be a DataFrame without columns),
    # so looking up 'sa' would raise KeyError. Produce an empty summary with
    # the expected columns instead, so the display step still works.
    print("No NetFlow records were loaded; the traffic summary is empty.")
    traffic_summary = pd.DataFrame(columns=['Src IP Addr', 'ibyt'])
else:
    combined_data['Src IP Addr'] = combined_data['sa']

    # Sum the In Byte column grouped by Src IP Addr
    traffic_summary = combined_data.groupby('Src IP Addr')['ibyt'].sum().reset_index()

    # Sort the summary by the In Byte column in descending order
    traffic_summary = traffic_summary.sort_values(by='ibyt', ascending=False)

In [None]:
# Show the first rows of the summary, i.e. the endpoints with the most traffic
top_n = 10
traffic_summary.head(top_n)