In [18]:
# Import needed libraries
import pandas as pd

### This is for loading the data

# The crawl dataset does not ship with a header row, so the column names are
# declared here and applied once the raw data has been cleaned.
column_names = [
    'videoID',
    'uploader',
    'age',
    'category',
    'length',
    'views',
    'rate',
    'ratings',
    'comments',
    'relatedIDs',
]

# Load the YouTube data of a single depth file (we have 4 depths in total, numbered from 0)
def _max_field_count(path, sep='\t'):
    """Return the largest number of `sep`-separated fields on any line of a local file.

    Returns None when `path` is not a plain, uncompressed local file (for
    example a URL, an open buffer, a compressed file or a missing file), so the
    caller can fall back to pandas' own column inference.
    """
    import os  # local import: the notebook's import cell only brings in pandas

    if not isinstance(path, (str, os.PathLike)) or not os.path.isfile(path):
        return None
    # Counting separators in compressed bytes would be meaningless.
    if str(path).lower().endswith(('.gz', '.bz2', '.zip', '.xz', '.zst', '.tar')):
        return None
    sep_bytes = sep.encode()
    # Read as bytes so the scan does not depend on the file's text encoding.
    with open(path, 'rb') as handle:
        return max((line.count(sep_bytes) + 1 for line in handle), default=0)


def load_depth(path, crawl_date, depth):
    """Load one tab-separated depth file and tag it with its crawl date and depth.

    Parameters
    ----------
    path : path of the depth file; anything `pd.read_csv` accepts.
    crawl_date : value stored in the added 'crawl_date' column.
    depth : value stored in the added 'depth' column.

    Returns
    -------
    pandas.DataFrame with integer-labelled data columns (the file has no
    header row) followed by the 'crawl_date' and 'depth' columns.
    """
    read_options = dict(sep='\t', header=None, on_bad_lines='skip', low_memory=False)

    # Rows have a varying number of fields. Without explicit names pandas fixes
    # the column count from the first line, and on_bad_lines='skip' then silently
    # drops every later row that is wider. Naming as many columns as the widest
    # row keeps those rows; shorter rows are padded with NaN.
    max_fields = _max_field_count(path)
    if max_fields:
        read_options['names'] = list(range(max_fields))

    depth_data = pd.read_csv(path, **read_options)
    # Add the crawl date and depth number columns
    depth_data['crawl_date'] = crawl_date
    depth_data['depth'] = depth
    return depth_data

# Load all depth files for a single crawl
def load_crawl(path, crawl_date, num_depths=4):
    """Load every depth file of one crawl and stack them into a single DataFrame.

    Parameters
    ----------
    path : directory of the crawl; depth files are read from
        "<path>/0.txt", "<path>/1.txt", ...
    crawl_date : value passed to `load_depth` for the 'crawl_date' column.
    num_depths : number of depth files to load, starting at depth 0.
        Defaults to 4 (depth files 0, 1, 2 and 3).

    Returns
    -------
    pandas.DataFrame with the rows of all depth files, in depth order, and a
    fresh 0..n-1 index.

    Raises
    ------
    ValueError if `num_depths` is smaller than 1.
    """
    if num_depths < 1:
        raise ValueError("num_depths must be at least 1")

    # One frame per depth file; the depth number doubles as the file name.
    depth_frames = [
        load_depth(f"{path}/{depth}.txt", crawl_date, depth)
        for depth in range(num_depths)
    ]
    # Combine all depths of this crawl into a single dataframe
    return pd.concat(depth_frames, axis=0).reset_index(drop=True)

In [19]:
# Testing: load a single depth file and inspect the raw frame.
# The depth is passed as an int (not "0") so the 'depth' column has the same
# type as the one load_crawl produces.
df = load_depth("data/080327/0.txt", '2008-03-27', 0)
print(df)
# df = load_crawl("data/080327", "2008-03-27")
# print(df.head())

               0                  1       2                 3      4  \
0    gFa1YMEJFag            sxephil  1135.0     Entertainment  270.0   
1    pSJ4hv28zaI  thecomputernerd01  1136.0            Comedy  216.0   
2    uHVEDq6RVXc    barelypolitical  1134.0   News & Politics   56.0   
3    K7Om0QZy-38          SouljaBoy  1134.0             Music  185.0   
4    DCAO6bZa31o    AssociatedPress  1134.0   News & Politics   45.0   
..           ...                ...     ...               ...    ...   
249  TLxiK5K2A7w       bosnaqiirlx3  1136.0            Comedy  252.0   
250  hyswDZpNar8           beki0555  1136.0    Pets & Animals   24.0   
251  fAffvIg2w1U       wavesg00dbye  1136.0             Music   34.0   
252  sUYz2--i_Hw             ice500  1136.0     Entertainment   78.0   
253  91gEbvB9U_M         corentin51  1136.0  Autos & Vehicles    4.0   

            5     6       7       8            9  ...           21  \
0    101384.0  4.72  3407.0  2887.0  QuRYeRnAuXM  ...  I4yKEK9o8g

In [21]:
### This is for cleaning the data
# def clean_data(df):
#     # We remove any leading/trailing whitespace from string columns
#     df['uploader'] = df['uploader'].str.strip()
#     df['category'] = df['category'].str.strip()
    # Convert 'age' to integer
    
# Clean the data
def clean_data(df):
    """Turn a raw crawl frame into a tidy frame with named columns.

    The first 9 columns of `df` are kept and every remaining data column is
    treated as a related video ID; those are merged into one comma-separated
    'relatedIDs' string per row. The module-level `column_names` list supplies
    the resulting column labels.

    Parameters
    ----------
    df : raw DataFrame as produced by `load_depth` / `load_crawl`.
        It is not modified.

    Returns
    -------
    A new pandas.DataFrame with the columns listed in `column_names`.
    NOTE(review): the 'crawl_date' and 'depth' columns are not carried over
    into the result (same as before) - confirm whether they should be.
    """
    # Everything from the 10th column onward holds related IDs, except the
    # 'crawl_date' and 'depth' columns that load_depth appends at the end.
    # These must not leak into the related-ID list.
    metadata_columns = ('crawl_date', 'depth')
    related = df.iloc[:, 9:]
    related = related.drop(columns=[col for col in metadata_columns if col in related.columns])

    # Rows with fewer related IDs are padded with NaN by the reader; skip the
    # padding so it does not show up as the literal string 'nan'.
    combined_related_ids = [
        ','.join(str(value) for value in values if pd.notna(value))
        for values in related.to_numpy(dtype=object)
    ]

    # Keep only the first 9 columns plus the new combined 'relatedIDs' column.
    # Working on a copy leaves the caller's frame untouched.
    depth_data = df.iloc[:, :9].copy()
    depth_data['relatedIDs'] = combined_related_ids
    # Add the column names to each column
    depth_data.columns = column_names

    # Remove leading/trailing whitespace from string columns
    depth_data['uploader'] = depth_data['uploader'].str.strip()
    depth_data['category'] = depth_data['category'].str.strip()

    # Make 'age' numeric; values that cannot be parsed become NaN
    depth_data['age'] = pd.to_numeric(depth_data['age'], errors='coerce')

    # Fill missing 'rate' values with the mean of the column
    depth_data['rate'] = depth_data['rate'].fillna(depth_data['rate'].mean())

    return depth_data


# # Transform the data
# def transform_data(df):
#     # 1. Normalize the 'length' column to ensure it's integer
#     df['length'] = df['length'].astype(int)
# 
#     # 2. Create a new column 'views_per_rating' to avoid division by zero
#     df['views_per_rating'] = df['views'] / (df['ratings'] + 1)
# 
#     # 3. Convert 'category' to categorical type for efficiency
#     df['category'] = df['category'].astype('category')
# 
#     return df

In [None]:
# # Function to connect and insert into MongoDB
# def insert_to_mongo(db, collection_name, records):
#     # Insert records into MongoDB
#     db[collection_name].insert_many(records)
# 
# # Main function
# def main():
#     # For each crawl, load and process all depth files
#     crawl1 = load_crawl('data/080327', '2008-03-27')
#     crawl2 = load_crawl('data/080329', '2008-03-29')
#     crawl3 = load_crawl('data/080331', '2008-03-31')
# 
#     # Combine all crawls into a single dataframe
#     combined_data = pd.concat([crawl1, crawl2, crawl3], axis=0).reset_index(drop=True)
# 
#     # Clean the combined data
#     combined_data = clean_data(combined_data)
# 
#     # Transform the combined data
#     combined_data = transform_data(combined_data)
# 
#     print(combined_data.head())
# main()

In [23]:
# Smoke test: clean a single depth file and preview the result.
# The depth is passed as an int (not "0"), matching what load_crawl passes.
testing = clean_data(load_depth("data/080327/0.txt", '2008-03-27', 0))
print(testing.head())

       videoID           uploader     age         category  length     views  \
0  gFa1YMEJFag            sxephil  1135.0    Entertainment   270.0  101384.0   
1  pSJ4hv28zaI  thecomputernerd01  1136.0           Comedy   216.0     458.0   
2  uHVEDq6RVXc    barelypolitical  1134.0  News & Politics    56.0  555203.0   
3  K7Om0QZy-38          SouljaBoy  1134.0            Music   185.0   91293.0   
4  DCAO6bZa31o    AssociatedPress  1134.0  News & Politics    45.0  108095.0   

   rate  ratings  comments                                         relatedIDs  
0  4.72   3407.0    2887.0  QuRYeRnAuXM,3TYqkBJ9YRk,rSJ8QZWBegU,nRcovJn9xH...  
1  4.80    133.0    2183.0  dh6dF1XY3uI,_a0gQFOJYWM,UzPldH0vuHY,h9gRdAmGFn...  
2  4.70   3574.0    2117.0  aYHBqH_xbCw,SfaxA9Q-9AQ,1cWWE3A2mDI,exT_E9FNu8...  
3  3.19   1063.0    1132.0  UCeA4K2-wNk,BDmhe0vIFiQ,9xSvVPa41Cg,3Cc7-4OeAg...  
4  3.58    264.0    1069.0  5vLbA7n8EG0,3ZbXJp-NUZc,McYsnvAymV8,MaE1kowuCB...  
