In [249]:
# Import needed libraries
import pandas as pd

### Data loading

# The crawl files ship without a header row, so the column names are supplied here.
# The first nine entries name the fixed per-video fields; 'relatedIDs' names the
# single column that prepare_data builds from the remaining related-video columns.
column_names = [
    'videoID', 'uploader', 'age', 'category', 'length',
    'views', 'rate', 'ratings', 'comments', 'relatedIDs',
]

# Load the YouTube data from a single depth file (each crawl has 4 depth files, numbered from 0)
def load_depth(path, crawl_date, depth):
    """Read one tab-separated depth file into a DataFrame.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    that yield fewer than two fields are skipped. The surviving rows keep
    pandas' default positional column labels (0, 1, 2, ...); rows of
    different lengths are padded with missing values by pandas.

    Parameters
    ----------
    path : str
        Location of the depth file to read.
    crawl_date : str
        Value stored in the 'crawl_date' column of every row.
    depth : int
        Value stored in the 'depth' column of every row.

    Returns
    -------
    pandas.DataFrame
        The parsed rows followed by the 'crawl_date' and 'depth' columns.
    """
    with open(path, 'r') as handle:
        # Split lazily, then keep only the lines that have at least two fields
        fields_per_line = (raw_line.strip().split('\t') for raw_line in handle)
        kept_rows = [fields for fields in fields_per_line if len(fields) >= 2]

    # Build the frame once from the collected rows and tag it with its origin
    frame = pd.DataFrame(kept_rows)
    frame['crawl_date'] = crawl_date
    frame['depth'] = depth
    return frame

# Load all depth files for a single crawl
def load_crawl(path, crawl_date):
    """Load the four depth files of one crawl and stack them into one DataFrame.

    The files are expected at '<path>/0.txt' through '<path>/3.txt'. Each one
    is read with load_depth, which tags its rows with `crawl_date` and the
    depth number taken from the file name.

    Parameters
    ----------
    path : str
        Directory that holds the crawl's depth files.
    crawl_date : str
        Value passed through to load_depth for every file.

    Returns
    -------
    pandas.DataFrame
        Rows of all four depth files, in depth order, with a fresh 0..n-1 index.
    """
    per_depth_frames = [
        load_depth(f"{path}/{level}.txt", crawl_date, level)
        for level in range(4)
    ]
    # Stack the depths on top of each other and renumber the rows
    stacked = pd.concat(per_depth_frames, axis=0)
    return stacked.reset_index(drop=True)

In [250]:
### This is for cleaning and transforming the data (Data Preparation)
def _join_related_ids(values):
    """Join the usable entries of one row of related IDs with commas.

    Missing entries (None / NaN) and entries that are empty after stripping
    whitespace are left out, so padding never shows up in the result.
    """
    stripped = (str(value).strip() for value in values if not pd.isna(value))
    return ','.join(text for text in stripped if text)


def prepare_data(df):
    """Clean and reshape raw crawl rows into the final 12-column layout.

    The input is expected to hold the raw fields under positional column
    labels plus the 'crawl_date' and 'depth' metadata columns. Data columns
    are identified by excluding the two metadata labels, so the result does
    not depend on where the metadata columns sit among the others.

    Steps:
      * the first nine data columns become the fixed fields of `column_names`;
      * up to 20 following data columns are merged into one comma-separated
        'relatedIDs' string per row, skipping missing or blank entries;
      * 'uploader' and 'category' are stripped of surrounding whitespace;
      * 'age', 'length', 'views', 'rate', 'ratings' and 'comments' are
        converted with pd.to_numeric;
      * missing 'rate' values are replaced by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are `column_names` followed by
        'crawl_date' and 'depth'.

    Raises
    ------
    ValueError
        If `df` has fewer data columns than there are fixed fields, or if a
        numeric column holds a value pd.to_numeric cannot parse.
    KeyError
        If 'crawl_date' or 'depth' is missing from `df`.
    """
    metadata_columns = ['crawl_date', 'depth']
    fixed_count = len(column_names) - 1  # every name except 'relatedIDs'
    max_related = 20                     # the dataset lists at most 20 related IDs

    # Pick data columns by excluding the metadata labels rather than by
    # position: after concatenating frames of different widths the metadata
    # columns may sit in the middle of the related-ID columns.
    data_columns = [label for label in df.columns if label not in metadata_columns]
    if len(data_columns) < fixed_count:
        raise ValueError(
            f"expected at least {fixed_count} data columns, found {len(data_columns)}"
        )
    fixed_columns = data_columns[:fixed_count]
    related_columns = data_columns[fixed_count:fixed_count + max_related]

    # Work on a copy so the caller's frame is left untouched
    depth_data = df[fixed_columns].copy()
    related_rows = df[related_columns].to_numpy(dtype=object)
    depth_data['relatedIDs'] = [_join_related_ids(row) for row in related_rows]
    for label in metadata_columns:
        depth_data[label] = df[label]

    # Add the column names to each column
    depth_data.columns = column_names + metadata_columns

    # Remove the leading/trailing whitespace from string columns
    for text_column in ('uploader', 'category'):
        depth_data[text_column] = depth_data[text_column].str.strip()

    # Convert these columns to numeric
    for numeric_column in ('age', 'length', 'views', 'rate', 'ratings', 'comments'):
        depth_data[numeric_column] = pd.to_numeric(depth_data[numeric_column])

    # Fill in the missing 'rate' values with the mean of the column
    depth_data['rate'] = depth_data['rate'].fillna(depth_data['rate'].mean())
    return depth_data

In [251]:
# Single-file smoke check, kept for reference (uncomment to run):
# sample = load_depth("data/080331/2.txt", '2008-03-31', 2)
# print(prepare_data(sample))

# Read every crawl in the dataset; each folder is paired with its crawl date
crawl1, crawl2, crawl3 = (
    load_crawl(folder, collected_on)
    for folder, collected_on in (
        ('data/080327', '2008-03-27'),
        ('data/080329', '2008-03-29'),
        ('data/080331', '2008-03-31'),
    )
)
# Stack the three crawls and renumber the rows
combined_data = pd.concat([crawl1, crawl2, crawl3], axis=0)
combined_data = combined_data.reset_index(drop=True)
# Clean and transform the stacked rows, then preview the result
combined_data = prepare_data(combined_data)
print(combined_data.head())

       videoID           uploader   age         category  length   views  \
0  gFa1YMEJFag            sxephil  1135    Entertainment     270  101384   
1  pSJ4hv28zaI  thecomputernerd01  1136           Comedy     216     458   
2  uHVEDq6RVXc    barelypolitical  1134  News & Politics      56  555203   
3  K7Om0QZy-38          SouljaBoy  1134            Music     185   91293   
4  DCAO6bZa31o    AssociatedPress  1134  News & Politics      45  108095   

   rate  ratings  comments                                         relatedIDs  \
0  4.72     3407      2887  QuRYeRnAuXM,3TYqkBJ9YRk,rSJ8QZWBegU,nRcovJn9xH...   
1  4.80      133      2183  dh6dF1XY3uI,_a0gQFOJYWM,UzPldH0vuHY,h9gRdAmGFn...   
2  4.70     3574      2117  aYHBqH_xbCw,SfaxA9Q-9AQ,1cWWE3A2mDI,exT_E9FNu8...   
3  3.19     1063      1132  UCeA4K2-wNk,BDmhe0vIFiQ,9xSvVPa41Cg,3Cc7-4OeAg...   
4  3.58      264      1069  5vLbA7n8EG0,3ZbXJp-NUZc,McYsnvAymV8,MaE1kowuCB...   

   crawl_date  depth  
0  2008-03-27      0  
1  2008-03

In [252]:
# # Function to connect and insert into MongoDB
# def insert_to_mongo(db, collection_name, records):
#     # Insert records into MongoDB
#     db[collection_name].insert_many(records)
# 
# # Main function
# def main():
#     # For each crawl, load and process all depth files
#     crawl1 = load_crawl('data/080327', '2008-03-27')
#     crawl2 = load_crawl('data/080329', '2008-03-29')
#     crawl3 = load_crawl('data/080331', '2008-03-31')
# 
#     # Combine all crawls into a single dataframe
#     combined_data = pd.concat([crawl1, crawl2, crawl3], axis=0).reset_index(drop=True)
# 
#     # Prepare the combined data (cleaning and transformation both happen in prepare_data)
#     combined_data = prepare_data(combined_data)
# 
#     print(combined_data.head())
# main()