In [3]:
import pandas as pd
import seaborn as sd
import os        #bcz os ki files pe ja rhe hai

**Extracting all entries of pretx to create a single csv file**

In [4]:
# Folder containing all pretx files.
pretx_folder = 'pre_data'

# Names of every entry in pretx_folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of the combined dataset reproducible between runs.
pretx_list = sorted(os.listdir(pretx_folder))
# pretx_list[i] is the name of the i-th entry; print(pretx_list) shows them all.

# Read the metadata that is encoded in a pretx file name.
def filename_to_info(filename:str):
    """Split an underscore-separated file name into date, time and band.

    Args:
        filename: File name whose first three "_"-separated fields are the
            date, the time and the band.

    Returns:
        dict with keys 'date' and 'time' (first and second field, unchanged)
        and 'band' (the text of the third field before its first "m").

    Raises:
        IndexError: if the name has fewer than three "_"-separated fields.
    """
    fields = filename.split("_")
    return {
        'date': fields[0],
        'time': fields[1],
        'band': fields[2].split("m")[0],
    }

# Parse one whitespace-separated entry of a pretx file.
def signal_parse(data):
    '''
    Turn one entry line into a dictionary of string fields.

    input -> '132045 -19 0.3 592 ~ CQ ON4UR JO11 *Belgium'
    output->
            {'utc': '132045',
             'freq': '592',
             'dt': '0.3',
             'db': '-19',
             'msg': ['CQ', 'ON4UR', 'JO11'],
             'tower': 'ON4UR',
             'country': 'Belgium',}

    Layout of the whitespace-separated tokens:
      0: utc, 1: db, 2: dt, 3: freq, 4: separator (ignored),
      then the message followed by the country.
      - more than 8 tokens: message is tokens 5-7, country is token 8
      - otherwise:          message is tokens 5-6, country is token 7

    A single leading marker character in front of the country (such as the
    '*' in '*Belgium') is removed, so 'Belgium' and '*Belgium' both give
    'Belgium'.

    Raises IndexError when the line has fewer than 8 tokens.
    '''
    parts = data.split()  # split on any run of whitespace
    utc, db, dt, freq = parts[0], parts[1], parts[2], parts[3]
    # parts[4] is the separator token ('~' in the example) and is not used.

    # The message is three tokens long when there are more than 8 tokens,
    # otherwise two; the country is the token right after the message.
    if len(parts) > 8:
        message = [parts[5], parts[6], parts[7]]
        raw_country = parts[8]
    else:
        message = [parts[5], parts[6]]
        raw_country = parts[7]
    # NOTE(review): only one token is taken as the country, so a multi-word
    # country name would be cut to its first word - confirm against the data.

    # Drop the leading marker character only when the token does not already
    # start with a letter. str.isalpha() is used instead of a hard-coded
    # uppercase ASCII alphabet so that names starting with a lowercase or
    # non-ASCII letter keep their first character.
    if raw_country[0].isalpha():
        country = raw_country
    else:
        country = raw_country[1:]

    return {
          "utc": utc,
          "freq": freq,
          "dt": dt,  # meaning not established in this notebook
          "db": db,  # meaning not established in this notebook
          "msg": message,
          "tower": message[1],  # second message token
          "country": country,
      }
    
def daynight(utc_time):
    """Classify a UTC time string as 'day' or 'night' by its hour.

    The first two characters of utc_time are read as the hour. Hours from
    6 (inclusive) to 20 (exclusive) count as 'day'; every other integer
    counts as 'night'.

    Returns:
        'day' or 'night', or False when the first two characters cannot be
        converted to an integer.
    """
    try:
        hour = int(utc_time[:2])
    except ValueError:
        # Not a number: signal the invalid time format with False.
        return False

    return 'day' if 6 <= hour < 20 else 'night'
    


In [5]:
# Lines that signal_parse() could not parse are collected here for inspection.
garbage_lines = []

# Column-oriented dataset: one list per output column. Every parsed entry
# appends exactly one value to each list, so all lists stay the same length.
dataset_pretx = {
    'Date':[],
    'Transmission_Time':[],
    'band':[],
    'Recieving_time':[],
    'Transmitting_Station':[],
    'db':[],
    'dt':[],
    'freq':[],
    'country':[],
    'Day/Night':[],
}

for pretx_name in pretx_list:    # pretx_list holds the file names found in pretx_folder

    # Metadata encoded in the file name; it is shared by every entry of the file.
    fileinfo = filename_to_info(pretx_name)
    date = fileinfo['date']
    transmissiontime = fileinfo['time']
    band = fileinfo['band']
    # Day/night is derived from the time in the file name, not from each entry's utc.
    dayornight = daynight(transmissiontime)

    # Read the whole file; the with statement closes it even if reading fails.
    with open(os.path.join(pretx_folder, pretx_name)) as f:
        lineslist = f.readlines()

    for line in lineslist:
        try:
            data = signal_parse(line)  # dictionary of parsed fields
        except Exception:
            # Keep going on malformed lines, but remember them. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt
            # and SystemExit stop the cell as expected.
            garbage_lines.append(line)
            continue

        # One value per column for this entry.
        dataset_pretx['Date'].append(date)
        dataset_pretx['Transmission_Time'].append(transmissiontime)
        dataset_pretx['band'].append(band)
        dataset_pretx['Recieving_time'].append(data['utc'])
        dataset_pretx['Transmitting_Station'].append(data['tower'])
        dataset_pretx['db'].append(data['db'])
        dataset_pretx['dt'].append(data['dt'])
        dataset_pretx['freq'].append(data['freq'])
        dataset_pretx['country'].append(data['country'])
        dataset_pretx['Day/Night'].append(dayornight)

In [6]:
# Build a DataFrame from the column lists collected in dataset_pretx
# (each dictionary key becomes one column).
pd_dataset_pretx = pd.DataFrame(data=dataset_pretx)

# Write the combined data to disk. The DataFrame index is written too,
# as the first column of the CSV file.
pd_dataset_pretx.to_csv(path_or_buf='cleaned_pretx_data.csv')

**Extracting file entries from the postx data folder (`pos_data`)**

In [7]:
# Folder containing all postx files.
postx_folder = 'pos_data'

# Names of every entry in postx_folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of the combined dataset reproducible between runs.
pos_list = sorted(os.listdir(postx_folder))
    
# Read the metadata that is encoded in a postx file name.
def filename_info(name):
    """Split an underscore-separated file name into date, time, band and power.

    Args:
        name: File name whose first four "_"-separated fields are the date,
            the time, the band and the power.

    Returns:
        dict with keys
          'date'  - first field, unchanged
          'time'  - second field, unchanged
          'band'  - leading digits of the third field ('20m' -> '20',
                    '160m' -> '160', '6m' -> '6')
          'power' - text of the fourth field before its first 'W'

    Raises:
        IndexError: if the name has fewer than four "_"-separated fields.
    """
    infolist = name.split("_")
    date = infolist[0]
    time = infolist[1]

    # Take every leading digit of the band field rather than a fixed
    # two-character slice, which turned '160m' into '16' and '6m' into '6m'.
    band_token = infolist[2]
    digit_count = len(band_token) - len(band_token.lstrip('0123456789'))
    # No leading digits at all: keep the previous two-character behaviour.
    band = band_token[:digit_count] if digit_count else band_token[0:2]

    power = infolist[3].split('W')[0]

    return {'date':date , 'time':time , 'band':band , 'power':power}

# Parse one whitespace-separated entry of a postx file.
def pos_signal_parse(data):
    """Turn one entry line into a dictionary of string fields.

    Layout of the whitespace-separated tokens:
      0: tower, 1: band, 2: mode, 3: distance, 4: (ignored),
      5: time written as three ':'-separated parts.

    Returns:
        dict with keys
          'tower'    - token 0
          'mode'     - token 2
          'band'     - leading digits of token 1 ('20m' -> '20', '160m' -> '160')
          'distance' - token 3
          'utc'      - the first three ':'-separated parts of token 5,
                       concatenated ('12:34:56' -> '123456')

    Raises:
        IndexError: if the line has fewer than six tokens or token 5 has
            fewer than three ':'-separated parts.
    """
    parts = data.split()  # split on any run of whitespace
    tower = parts[0]
    band_token = parts[1]
    mode = parts[2]
    distance = parts[3]
    # parts[4] is not used.
    utclist = parts[5].split(':')
    utc = utclist[0] + utclist[1] + utclist[2]

    # Take every leading digit of the band token rather than a fixed
    # two-character slice, which turned '160m' into '16' and '6m' into '6m'.
    digit_count = len(band_token) - len(band_token.lstrip('0123456789'))
    # No leading digits at all: keep the previous two-character behaviour.
    band = band_token[:digit_count] if digit_count else band_token[0:2]

    return {
          "tower": tower,
          "mode": mode,
          "band":band,
          "distance": distance,
          "utc":utc
      }


In [8]:
# Column-oriented dataset: one list per output column. Every parsed entry
# appends exactly one value to each list, so all lists stay the same length.
pos_dataset_dict = {
    'Date':[],
    'Transmission_Time':[],
    'band':[],
    'utc':[],
    'power':[],
    'Recieving_Station':[],
    'Distance':[]
}
garbage = []  # lines that pos_signal_parse() could not parse

for pos_name in pos_list:  # pos_list holds the file names found in postx_folder
    # Metadata encoded in the file name; it is shared by every entry of the file.
    info = filename_info(pos_name)
    date = info['date']
    transmissiontime = info['time']
    band = info['band']  # the band column comes from the file name, not from the entries
    power = info['power']

    # Read the whole file; the with statement closes it even if reading fails.
    with open(os.path.join(postx_folder, pos_name)) as f:
        lineslist = f.readlines()

    for line in lineslist:
        try:
            signal_data = pos_signal_parse(line)
        except Exception:
            # Keep going on malformed lines, but remember them. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt
            # and SystemExit stop the cell as expected.
            garbage.append(line)
            continue

        # One value per column for this entry.
        pos_dataset_dict['Date'].append(date)
        pos_dataset_dict['Transmission_Time'].append(transmissiontime)
        pos_dataset_dict['band'].append(band)
        pos_dataset_dict['utc'].append(signal_data['utc'])
        pos_dataset_dict['power'].append(power)
        pos_dataset_dict['Recieving_Station'].append(signal_data['tower'])
        pos_dataset_dict['Distance'].append(signal_data['distance'])

In [9]:
# Build a DataFrame from the column lists collected in pos_dataset_dict
# (each dictionary key becomes one column).
pos_dataset = pd.DataFrame(data=pos_dataset_dict)

# Write the combined data to disk. The DataFrame index is written too,
# as the first column of the CSV file.
pos_dataset.to_csv(path_or_buf='cleaned_postx_data.csv')