# Simple script to group data based on given columns

## Importing libraries

In [None]:
import pandas as pd

## Setting variables

In [None]:
# Folder that contains the file you're working with
working_folder = r"folder location"
# Name of the input file inside that folder (file name only, not the full path)
filename = r"uk-new inlinks CSV.csv"

# Columns whose combined values define a group
columns_to_group_by = [
    "Type",
    "Destination",
    "Alt Text",
    "Anchor",
    "Target",
    "Path Type",
    "Link Position",
]
# Columns that are aggregated within each group
columns_to_aggregate = ["Source"]

# Grouping and aggregating both need their columns to be present,
# so the columns we keep are the two lists joined together (grouping columns first)
columns_to_keep = [*columns_to_group_by, *columns_to_aggregate]

## Importing data

In [None]:
# Read the comma-separated input file located at <working_folder>/<filename>
data = pd.read_csv("/".join([working_folder, filename]), sep=",")
data.head(n=5)  # Preview the first five rows

## Cutting data down to just important columns

In [None]:
# Work on a deep copy restricted to the columns we need, so that `data`
# always remains available as the unfiltered version
working_data = data.copy(deep=True)[columns_to_keep]

# Replace missing values with empty strings to avoid problems with grouping
working_data = working_data.fillna(value="")

working_data.head(n=5)  # Preview the first five rows

## Grouping the data by the grouping columns

In [None]:
# Group the rows by the grouping columns. For every aggregated column we compute
# "count" (the number of records in the group) and "min" (used to give an example
# record for each group). sort=False skips sorting by the group keys, since we
# sort by count below anyway.
grouped_data = working_data.groupby(by=columns_to_group_by, sort=False).agg(["count", "min"])

# Turn the group keys back into ordinary columns so we can easily work with the data
grouped_data = grouped_data.reset_index()

# The aggregated columns are nested, so a column is addressed with a tuple of
# (upper column name, lower column name). The upper name is taken from
# columns_to_aggregate instead of being hard-coded, so the sort keeps working
# when the aggregated columns in the settings are changed.
grouped_data = grouped_data.sort_values((columns_to_aggregate[0], "count"), ascending=False)

grouped_data.head(5)  # View data

## Exporting the results

In [None]:
# Write the grouped results to "grouped_data.csv" inside the working folder
grouped_data.to_csv(f"{working_folder}/grouped_data.csv")