In [None]:
# Import needed libraries
import pandas as pd

### This is for loading the data

# The crawl files carry no header row, so the column names are supplied here
column_names = [
    'videoID',
    'uploader',
    'age',
    'category',
    'length',
    'views',
    'rate',
    'ratings',
    'comments',
    'relatedIDs',
]

# Load the YouTube data from a single depth file (each crawl has 4 depth files, numbered from 0)
def load_depth(path, crawl_date, depth):
    """Read one tab-separated depth file into a DataFrame with unnamed columns.

    Every line is stripped of surrounding whitespace and split on tabs. Lines
    that yield fewer than 2 fields are discarded; each remaining line becomes
    one row. Rows may differ in length, in which case pandas pads the shorter
    ones with missing values.

    Parameters
    ----------
    path : str
        Location of the depth file to read.
    crawl_date : str
        Currently unused by this function.
    depth : int or str
        Currently unused by this function.

    Returns
    -------
    pandas.DataFrame
        One row per kept line, with default integer column labels.
    """
    with open(path, 'r') as handle:
        # Tokenise lazily, then keep only the lines with at least 2 fields
        tokenised = (raw_line.strip().split('\t') for raw_line in handle)
        kept_rows = [fields for fields in tokenised if len(fields) >= 2]
    return pd.DataFrame(kept_rows)

# Load all depth files for a single crawl
def load_crawl(path, crawl_date):
    """Load depth files 0 to 3 of one crawl and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Directory containing ``0.txt``, ``1.txt``, ``2.txt`` and ``3.txt``.
    crawl_date : str
        Passed through unchanged to ``load_depth``.

    Returns
    -------
    pandas.DataFrame
        The rows of all four depth files, in depth order, with a fresh
        0-based row index.
    """
    # One DataFrame per depth level, read in ascending depth order
    per_depth_frames = [
        load_depth(f"{path}/{level}.txt", crawl_date, level)
        for level in range(4)
    ]
    stacked = pd.concat(per_depth_frames, axis=0)
    return stacked.reset_index(drop=True)

In [None]:
### This is for cleaning the data    
# Clean the data
def clean_data(df):
    """Convert a raw, unnamed crawl DataFrame into a named and typed one.

    The first 9 positional columns are kept as they are. Every column from the
    10th onward is treated as a related video ID, and those are collapsed into
    a single comma-separated string per row. Padding values that pandas adds
    for rows with fewer related IDs are skipped rather than written out as
    text. The input DataFrame is left unmodified, so calling this function
    twice on the same frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows, e.g. as produced by ``load_depth`` or ``load_crawl``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame whose columns are the module-level ``column_names``.
        'uploader' and 'category' are whitespace-stripped, the count/score
        columns are numeric, and missing 'rate' values are replaced by the
        column mean. An empty input yields an empty DataFrame with the same
        column names.

    Raises
    ------
    ValueError
        If a non-empty ``df`` has fewer than 9 columns (the column names
        cannot be assigned), or if a numeric column holds a value that
        ``pd.to_numeric`` cannot parse.
    """
    # Nothing to slice positionally in an empty frame; return the schema only
    if df.empty:
        return pd.DataFrame(columns=column_names)

    # Columns from the 10th onward hold the related video IDs. Shorter rows
    # are padded with missing values, which must not end up in the string.
    related_block = df.iloc[:, 9:].to_numpy(dtype=object)
    combined_related_ids = [
        ','.join(str(value) for value in row_values if pd.notna(value))
        for row_values in related_block
    ]

    # Work on a copy of the first 9 columns so the caller's frame is untouched
    depth_data = df.iloc[:, :9].copy()
    depth_data['relatedIDs'] = combined_related_ids
    # Add the column names to each column
    depth_data.columns = column_names

    # Remove leading/trailing whitespace from string columns
    depth_data['uploader'] = depth_data['uploader'].str.strip()
    depth_data['category'] = depth_data['category'].str.strip()

    # Convert these columns to numeric
    numeric_columns = ['age', 'length', 'views', 'rate', 'ratings', 'comments']
    for col in numeric_columns:
        depth_data[col] = pd.to_numeric(depth_data[col])

    # Fill missing 'rate' values with the mean of the column
    depth_data['rate'] = depth_data['rate'].fillna(depth_data['rate'].mean())
    return depth_data


In [None]:
# # Function to connect and insert into MongoDB
# def insert_to_mongo(db, collection_name, records):
#     # Insert records into MongoDB
#     db[collection_name].insert_many(records)
# 
# # Main function
# def main():
#     # For each crawl, load and process all depth files
#     crawl1 = load_crawl('data/080327', '2008-03-27')
#     crawl2 = load_crawl('data/080329', '2008-03-29')
#     crawl3 = load_crawl('data/080331', '2008-03-31')
# 
#     # Combine all crawls into a single dataframe
#     combined_data = pd.concat([crawl1, crawl2, crawl3], axis=0).reset_index(drop=True)
# 
#     # Clean the combined data
#     combined_data = clean_data(combined_data)
# 
#     # Transform the combined data
#     combined_data = transform_data(combined_data)
# 
#     print(combined_data.head())
# main()

In [None]:
# Smoke test: load a single depth file from the 2008-03-31 crawl and clean it
testing = load_depth(path="data/080331/2.txt", crawl_date='2008-03-31', depth="2")
clean = clean_data(df=testing)
print(clean)
