## Reading the file in chunks of 10 million rows

#### The most powerful machine available on Rivanna can hold up to 10 million rows for data manipulation, so we decided to read the Netflow logs in chunks of 10 million rows

In [None]:
import pandas as pd
import os 
# Column names for the Netflow log; the raw file is read without a header row.
header = [
    'Time', 'Duration', 'SrcDevice', 'DstDevice', 'Protocol',
    'SrcPort', 'DstPort', 'SrcPackets', 'DstPackets',
    'SrcBytes', 'DstBytes',
]
# Number of rows per chunk yielded by the reader (10 million).
chunksize = 10 ** 7
file = "netflow_day-02.txt"

# Work from the data directory so the bare file names below resolve.
os.chdir("/home/rk9cx/LANL/")

# With chunksize set, read_csv yields DataFrames chunk by chunk instead of
# loading the whole file at once.
df_reader = pd.read_csv(file, header = None, chunksize = chunksize)

## Storing the last 1 million rows for deduplication

#### We noticed that records in the Netflow logs are cumulative in nature, so we decided to deduplicate them. We also noticed that duplicates can straddle chunk boundaries; we combat this by holding back the last rows of each chunk and merging them into the next iteration.

In [None]:
# Deduplicate the chunked Netflow reader and append the result to parsed.csv.
#
# Each chunk is merged with the rows held back from the previous iteration,
# sorted, deduplicated on (Time + five-tuple), and written out except for its
# trailing rows, which are carried into the next iteration so that duplicates
# split across a chunk boundary are still collapsed.
#
# NOTE: parsed.csv is opened in append mode; rows from an earlier run of this
# cell remain in the file.

# Number of trailing rows held back from each batch. Must be > 0, because
# final[:-0] would select nothing.
carry_rows = 1000000
# Columns that identify a record for deduplication.
key_cols = ['Time', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort']

lastrow = pd.DataFrame(columns = header)
for cidx, df in enumerate(df_reader):
    final = df
    final.columns = header
    # Cast both port columns to str so each holds a single type for sorting
    # and duplicate detection (presumably the raw ports mix numeric and text
    # values -- confirm against the data).
    final['SrcPort'] = final['SrcPort'].astype(str)
    final['DstPort'] = final['DstPort'].astype(str)
    # Prepend the carried-over rows. The empty placeholder of the first
    # iteration is skipped so its all-object columns cannot influence the
    # dtypes of the result; ignore_index gives the merged frame a fresh index.
    if not lastrow.empty:
        final = pd.concat([lastrow, final], ignore_index = True)
    # Sort ascending by the key, with DstBytes as the final tiebreaker, so the
    # row with the highest DstBytes comes last within each key ...
    final = final.sort_values(key_cols + ['DstBytes'])
    # ... and is the one retained by keep='last'.
    final = final.drop_duplicates(subset = key_cols, keep = 'last')
    # Save everything except the trailing carry_rows rows.
    final[:-carry_rows].to_csv('parsed.csv', mode ='a', header = False, index = False)
    # Hold the trailing rows back for the next iteration.
    lastrow = final.tail(carry_rows)
# Flush the rows still held back after the final chunk.
lastrow.to_csv('parsed.csv', mode ='a', header = False, index = False)

## Quality Check

#### Because duplicates (Time + five-tuple) exist in the logs, it is imperative that we check whether any duplicates remain in the parsed file

In [None]:
# Quality check: count rows in parsed.csv that share the same (Time +
# five-tuple) key, chunk by chunk, and stop at the first chunk that has any.
#
# NOTE: duplicates are counted within each chunk only; a duplicate pair split
# across two chunks of this reader is not detected.
file = 'parsed.csv'
header = [ 'Time', 'Duration','SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort','SrcPackets','DstPackets',
         'SrcBytes','DstBytes']
key_cols = ['Time', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort']
# parsed.csv is written with header=False, so header=None is required here;
# otherwise the first data row would be consumed as column names and never
# checked. Ports are read as str so each port column has a single type.
df_reader1 = pd.read_csv(file, chunksize = 10000000, header = None, names = header,
                         index_col = False, dtype = {'SrcPort': str, 'DstPort': str})
for cidx, df in enumerate(df_reader1):
    #number of rows of parsed file in this chunk
    p = df.shape[0]
    # No sort is needed to count duplicates: drop_duplicates keeps exactly one
    # row per key regardless of row order.
    c = df.drop_duplicates(subset = key_cols, keep = 'last')
    #number of rows of this chunk after removing duplicates
    q = c.shape[0]
    #condition to check if there are any duplicates in the parsed file
    print(p-q)
    if p-q!=0:
        print(cidx)
        print("Quality Check Failed!")
        break