In [1]:
import pandas as pd
import os
import hashlib

In [16]:
# Read the three scraped CSV exports into separate DataFrames.
local_csv, colab_csv, shager_csv = [
    pd.read_csv(csv_name)
    for csv_name in ('local_data.csv', 'colab_data.csv', 'shager_data.csv')
]

# Stack the frames row-wise; ignore_index=True discards the per-file
# indices and renumbers the combined rows.
merged_csv = pd.concat(
    [local_csv, colab_csv, shager_csv],
    ignore_index=True,
)


In [19]:
# A row is identified by its (channel, message ID) pair: keep the first
# occurrence of each pair and write the result to disk without the index.
dedup_keys = ['Channel Username', 'ID']
merged_csv = merged_csv.drop_duplicates(subset=dedup_keys, keep='first')
merged_csv.to_csv('merged_data.csv', index=False)

In [20]:
# Count the missing (NaN) values in each column of the merged data.
merged_csv.isna().sum()

Channel Title          0
Channel Username       0
ID                     0
Message             5457
Date                   0
Media Path          1593
dtype: int64

In [21]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
merged_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12393 entries, 0 to 12392
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Channel Title     12393 non-null  object
 1   Channel Username  12393 non-null  object
 2   ID                12393 non-null  int64 
 3   Message           6936 non-null   object
 4   Date              12393 non-null  object
 5   Media Path        10800 non-null  object
dtypes: int64(1), object(5)
memory usage: 677.7+ KB


In [23]:
def calculate_hash(file_path, chunk_size=1024 * 1024):
    """Return the MD5 hex digest of a file's contents.

    The file is read in fixed-size chunks so that large files are never
    loaded into memory in one piece. MD5 is used here only as a content
    fingerprint for duplicate detection, not for any security purpose.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel keeps calling f.read until it returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Directories
photo_dirs = ['shager_photos', 'local_photos', 'colab_photos']
duplicates_dir = 'duplicates'
os.makedirs(duplicates_dir, exist_ok=True)

# Collect hashes and handle duplicates
photo_hashes = {}  # content hash -> path of the first file seen with that content
duplicates = []    # (original path, path inside duplicates_dir) for each moved file

for photo_dir in photo_dirs:
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of which copy is kept (the first one seen) reproducible between runs.
    for file_name in sorted(os.listdir(photo_dir)):
        file_path = os.path.join(photo_dir, file_name)
        if not os.path.isfile(file_path):
            continue

        file_hash = calculate_hash(file_path)
        if file_hash not in photo_hashes:
            # First file with this content: keep it where it is.
            photo_hashes[file_hash] = file_path
            continue

        # Same content as an earlier file: move it into duplicates_dir,
        # appending _1, _2, ... so a file already there is never overwritten.
        # NOTE(review): duplicates are detected by content only, so a file
        # moved here may still be referenced by name elsewhere — confirm.
        name, ext = os.path.splitext(file_name)
        duplicate_path = os.path.join(duplicates_dir, file_name)
        counter = 1
        while os.path.exists(duplicate_path):
            duplicate_path = os.path.join(duplicates_dir, f"{name}_{counter}{ext}")
            counter += 1

        os.rename(file_path, duplicate_path)
        duplicates.append((file_path, duplicate_path))

In [25]:
consolidated_dir = 'consolidated_photos'
os.makedirs(consolidated_dir, exist_ok=True)

# Move every unique photo recorded in photo_hashes into consolidated_dir.
moved_count = 0
already_moved = []    # sources that no longer exist (e.g. this cell was run before)
name_collisions = []  # (source, target) pairs skipped because the target name is taken

for photo in photo_hashes.values():
    target = os.path.join(consolidated_dir, os.path.basename(photo))
    if not os.path.exists(photo):
        # photo_hashes still holds the pre-move path after a first run, so a
        # missing source is skipped instead of raising FileNotFoundError.
        already_moved.append(photo)
        continue
    if os.path.exists(target):
        # Files from different source directories can share a base name.
        # os.rename would silently replace the existing file on POSIX and
        # raise on Windows, so leave the source in place and report it.
        name_collisions.append((photo, target))
        continue
    os.rename(photo, target)
    moved_count += 1

print(
    f"Moved {moved_count} photo(s); "
    f"{len(already_moved)} source(s) not found; "
    f"{len(name_collisions)} skipped due to name collisions"
)

In [13]:
# Re-load the merged table from disk.
# NOTE(review): this file name differs from the 'merged_data.csv' written
# earlier in the notebook — presumably an externally exported copy; confirm
# that it is the intended input.
merged_csv_path = 'merged_data - merged_data.csv'
merged_csv = pd.read_csv(merged_csv_path)

In [14]:
consolidated_dir = 'consolidated_photos'
media_paths = merged_csv['Media Path'].dropna()
csv_photos = set(media_paths)
existing_photos = set(os.listdir(consolidated_dir))

# Identify missing photos
# os.listdir yields bare file names, while 'Media Path' values may carry a
# directory prefix written with either separator ('photos\\x.jpg' or
# 'photos/x.jpg'). Strip everything up to the last separator before
# comparing, so that prefixed paths are not all reported as missing.
media_names = media_paths.astype(str).str.replace(r'^.*[\\/]', '', regex=True)

# Keep the ORIGINAL column values in missing_photos so that later
# merged_csv['Media Path'].isin(missing_photos) filters still match.
missing_photos = set(media_paths[~media_names.isin(existing_photos)])
print(f"Missing photos: {missing_photos}")

Missing photos: {'@Shageronlinestore_2532.jpg', '@Shageronlinestore_1640.jpg', '@Shageronlinestore_4506.jpg', '@Shageronlinestore_295.jpg', '@Shageronlinestore_2584.jpg', '@helloomarketethiopia_4261.jpg', '@Shageronlinestore_2778.jpg', '@sinayelj_14128.jpg', '@sinayelj_13419.jpg', '@Shageronlinestore_3063.jpg', '@helloomarketethiopia_4411.jpg', '@helloomarketethiopia_4243.jpg', '@sinayelj_14402.jpg', '@Shageronlinestore_1471.jpg', '@Shageronlinestore_182.jpg', '@sinayelj_14505.jpg', '@sinayelj_14260.jpg', '@Shageronlinestore_4450.jpg', '@Shewabrand_1850.jpg', '@Shageronlinestore_1317.jpg', '@Shageronlinestore_3936.jpg', '@sinayelj_15212.jpg', '@sinayelj_14442.jpg', '@Shageronlinestore_822.jpg', '@Shageronlinestore_3382.jpg', '@helloomarketethiopia_4337.jpg', '@Shageronlinestore_3674.jpg', '@sinayelj_15136.jpg', '@sinayelj_14874.jpg', '@Shageronlinestore_4691.jpg', '@sinayelj_14129.jpg', '@Shageronlinestore_3560.jpg', '@helloomarketethiopia_4417.jpg', '@Shageronlinestore_3461.jpg', '@Sh

In [8]:
# Number of distinct non-null 'Media Path' values taken from the CSV.
len(csv_photos)

10800

In [9]:
# Number of directory entries found in consolidated_dir.
len(existing_photos)

21535

In [15]:
# Number of CSV media paths with no matching entry in consolidated_dir.
len(missing_photos)

1485

In [11]:
# Display the entry names collected from consolidated_dir (shows the whole set).
existing_photos

{'@nevacomputer_6701.jpg',
 '@helloomarketethiopia_4039.jpg',
 '@ethio_brand_collection_2981.jpg',
 '@nevacomputer_2592.jpg',
 '@sinayelj_10486.jpg',
 '@ethio_brand_collection_4739.jpg',
 '@nevacomputer_1021.jpg',
 '@ethio_brand_collection_1231.jpg',
 '@Shageronlinestore_1719.jpg',
 '@Shewabrand_1170.jpg',
 '@meneshayeofficial_660.jpg',
 '@nevacomputer_1504.jpg',
 '@Shageronlinestore_2630.jpg',
 '@nevacomputer_5749.jpg',
 '@sinayelj_11674.jpg',
 '@Shageronlinestore_5323.jpg',
 '@sinayelj_10378.jpg',
 '@Shageronlinestore_2830.jpg',
 '@ethio_brand_collection_4412.jpg',
 '@meneshayeofficial_384.jpg',
 '@nevacomputer_5622.jpg',
 '@Shageronlinestore_2445.jpg',
 '@Leyueqa_5526.jpg',
 '@sinayelj_13689.jpg',
 '@Shewabrand_1285.jpg',
 '@nevacomputer_1422.jpg',
 '@ethio_brand_collection_4663.jpg',
 '@ethio_brand_collection_5064.jpg',
 '@nevacomputer_2456.jpg',
 '@nevacomputer_6221.jpg',
 '@nevacomputer_6916.jpg',
 '@ethio_brand_collection_4647.jpg',
 '@sinayelj_10384.jpg',
 '@Shageronlinestore_3

In [12]:
# Display the distinct 'Media Path' values from the CSV (shows the whole set).
csv_photos

{'photos\\@sinayelj_6397.jpg',
 'photos\\@sinayelj_9762.jpg',
 'photos\\@Shewabrand_1318.jpg',
 'photos/@Shageronlinestore_2831.jpg',
 'photos\\@sinayelj_8169.jpg',
 'photos\\@Shewabrand_3085.jpg',
 'photos\\@sinayelj_9376.jpg',
 'photos\\@sinayelj_14178.jpg',
 'photos\\@Shewabrand_3214.jpg',
 'photos\\@sinayelj_3489.jpg',
 'photos/@Shageronlinestore_4315.jpg',
 'photos/@Shageronlinestore_775.jpg',
 'photos\\@Shewabrand_1156.jpg',
 'photos/@Shageronlinestore_5323.jpg',
 'photos\\@Shewabrand_1633.jpg',
 'photos\\@sinayelj_15144.jpg',
 'photos\\@sinayelj_14466.jpg',
 'photos\\@sinayelj_12017.jpg',
 'photos/@Shageronlinestore_957.jpg',
 'photos\\@sinayelj_10019.jpg',
 'photos\\@sinayelj_12034.jpg',
 'photos\\@Shewabrand_1436.jpg',
 'photos\\@sinayelj_13683.jpg',
 'photos\\@Shewabrand_2562.jpg',
 'photos/@Shageronlinestore_331.jpg',
 'photos/@helloomarketethiopia_4264.jpg',
 'photos/@Shageronlinestore_762.jpg',
 'photos\\@Shewabrand_1666.jpg',
 'photos\\@sinayelj_12385.jpg',
 'photos\\@sin

In [16]:
# Select the rows whose media file was reported missing, then list the
# distinct channels those rows belong to.
missing_mask = merged_csv['Media Path'].isin(missing_photos)
missing_rows = merged_csv.loc[missing_mask]
channels_to_rescrape = missing_rows['Channel Username'].unique()
print(channels_to_rescrape)

['@sinayelj' '@Shewabrand' '@helloomarketethiopia' '@Shageronlinestore']


In [17]:
# Keep every row whose 'Media Path' is not in missing_photos and write the
# result to disk without the index.
has_missing_photo = merged_csv['Media Path'].isin(missing_photos)
final_csv = merged_csv.loc[~has_missing_photo]
final_csv.to_csv('final_data.csv', index=False)