In [76]:
import pandas as pd

# Base name reused later when building the cleaned-up CSV's output path.
file_name = "Filtered_Matching_IDs.csv"
# Load the filtered posts. The literal has no placeholders, so it is a plain
# string rather than an f-string.
# NOTE(review): this is an absolute, machine-specific path and it does not use
# `file_name` (the on-disk name is lower-case) — confirm which file is intended
# and consider a configurable data directory.
df = pd.read_csv('/home/wfd/Desktop/York/notebooks/Outdated/Data_preprocessing/filtered_matching_ids.csv')

# Show the available columns before narrowing the frame down.
print(df.columns)

Index(['Post Link', 'Title', 'CreationDate', 'Score', 'ViewCount',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'Tags', 'Body',
       'ClosedDate', 'CodeText', 'ImageURLs'],
      dtype='object')


In [77]:
# Restrict the frame to the four fields used from here on and, in the same
# step, expose the "Post Link" column under the shorter name "Id".
needed_columns = ["Post Link", "Title", "Body", "ImageURLs"]
df = df[needed_columns].rename(columns={"Post Link": "Id"})

In [78]:
import re

def extract_image_url(img_list):
    """Parse a stringified list of image URLs into a real list of URLs.

    Args:
        img_list: The raw cell value. Expected to be a string such as
            ``"['https://a/b.png', 'https://c/d.png']"``. A missing value
            (NaN/None) or an already-parsed list/tuple is also accepted.

    Returns:
        A list with one ``http(s)://...`` URL per comma-separated piece that
        contains one. Pieces without a URL are dropped. Missing input yields
        an empty list; list/tuple input is returned as a new list unchanged.
    """
    # Already parsed (e.g. the cell was re-run on the converted column).
    if isinstance(img_list, (list, tuple)):
        return list(img_list)
    if pd.isna(img_list):
        return []
    # Strip the list punctuation, then treat each comma-separated piece as a
    # candidate URL.
    pieces = img_list.replace("[", "").replace("]", "").replace("'", "").split(',')
    matches = (re.search(r'(https?://\S+)', piece) for piece in pieces)
    # re.search returns None for a piece with no URL (e.g. the single empty
    # piece produced by "[]"); skip those instead of calling .group on None.
    return [match.group(0) for match in matches if match]


# Remove \n from the body. The vectorised .str accessor is used so that a
# missing body (NaN) is passed through unchanged instead of raising
# AttributeError, which calling .replace on a float inside a lambda would do.
# regex=False keeps this a literal newline-to-space substitution.
df['Body'] = df['Body'].str.replace('\n', ' ', regex=False)
# Convert each raw ImageURLs cell into a list of URLs via extract_image_url.
df['ImageURLs'] = df['ImageURLs'].apply(extract_image_url)

In [79]:

# Preview the first five rows (DataFrame.head defaults to n=5).
print(df.head())

         Id                                              Title  \
0  78689138       Hosting A native PHP MySQL Project on vercel   
1  78678692  Reading UTF-8 texts in PowerPoint via VBA, for...   
2  78674751  Typescript interface disjunction gives incorre...   
3  78673647  Http-only cookie in response header from backe...   
4  78672889  Qt Creator design tab is greyed out in QML pro...   

                                                Body  \
0  I have tried many things and even tried moving...   
1  I want to read all text in a PowerPoint file u...   
2  I have this function export async function get...   
3  I have a fullstack app made up of a React fron...   
4  I don't know if this is the expected behavior....   

                                           ImageURLs  
0  [https://i.sstatic.net/UmhZ9zHE.png, https://i...  
1               [https://i.sstatic.net/9nF6vUFK.png]  
2               [https://i.sstatic.net/9QE26tpK.png]  
3  [https://i.sstatic.net/lXkm8i9F.png, https:

In [80]:
# Persist the cleaned frame as CSV, leaving out the row index.
# NOTE(review): `file_name` already ends in ".csv", so the written file name
# contains ".csv" twice — confirm that is intended before renaming anything.
df.to_csv(
    path_or_buf=f'../../data_stackoverflow/{file_name}_CleanedUp.csv',
    index=False,
)

In [81]:
# Download the images
import requests
import os

def download_image(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file to create; it is opened in binary write
            mode and receives the raw response content.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        True if the server answered with status 200 and the content was
        written; False otherwise. Every failure is reported with ``print``
        rather than raised, so a caller looping over many URLs keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, etc. follow the same
        # "report and return False" contract as a non-200 status.
        print(f'Failed to download {url}: {exc}')
        return False
    if response.status_code != 200:
        print(f'Failed to download {url}, status code {response.status_code}')
        return False
    with open(filename, 'wb') as f:
        f.write(response.content)
    return True

# Create a directory to store the images
os.makedirs('../images', exist_ok=True)

amount = len(df)
# Sampling len(df) rows (without replacement, the pandas default) shuffles the
# frame, so posts are visited in random order.
# NOTE(review): no random_state is set, so the order differs on every run.
df = df.sample(amount)
count = 0

for index, row in df.iterrows():
    # Stop once `amount` images have been downloaded. The check only happens
    # between rows, so a post's images are never split by the limit.
    if count == amount:
        break
    image_url = row['ImageURLs']
    for url in image_url:
        if url:
            # Last path segment of the URL, cut at its first ".<word>" part.
            # The pattern is a raw string: in a normal string "\." and "\w"
            # are invalid escape sequences and trigger a SyntaxWarning.
            cleaned_url = re.split(r"\.\w+", url.split('/')[-1])[0]
            if url.endswith('.gif'):
                print(f'{url} is a gif, skipping')
                continue
            # NOTE(review): the target always gets a .png suffix, even when the
            # source URL ends in another extension such as .jpg.
            filename = f'../images/{row["Id"]}_{cleaned_url}.png'
            # If the image exists, skip
            if os.path.exists(filename):
                print(f'{filename} already exists, skipping')
                continue
            print(f'Downloading {url} to {filename}')
            if download_image(url, filename):
                print(f'Downloaded {url} to {filename}')
                count += 1
            else:
                print(f'Failed to download {url}')

Downloading https://i.sstatic.net/Okjpu.png to ../images/76803209_Okjpu.png
Downloaded https://i.sstatic.net/Okjpu.png to ../images/76803209_Okjpu.png
../images/78673647_lXkm8i9F.png already exists, skipping
../images/78673647_lGcu2ov9.png already exists, skipping
../images/77424574_okivq.png already exists, skipping
Downloading https://i.sstatic.net/mUmrx.png to ../images/77243382_mUmrx.png


  cleaned_url = re.split("\.\w+", url.split('/')[-1])[0]


Downloaded https://i.sstatic.net/mUmrx.png to ../images/77243382_mUmrx.png
../images/78261167_TQjP4.png already exists, skipping
../images/76866681_cdiLJ.png already exists, skipping
../images/78062372_BgpwP.png already exists, skipping
../images/78062372_47Kea.png already exists, skipping
Downloading https://i.sstatic.net/obXa0.png to ../images/78313521_obXa0.png
Downloaded https://i.sstatic.net/obXa0.png to ../images/78313521_obXa0.png
../images/78072195_PCaW8.png already exists, skipping
Downloading https://i.sstatic.net/h73CO.jpg to ../images/76989815_h73CO.png
Downloaded https://i.sstatic.net/h73CO.jpg to ../images/76989815_h73CO.png
Downloading https://i.sstatic.net/PjBIt.jpg to ../images/76989815_PjBIt.png
Downloaded https://i.sstatic.net/PjBIt.jpg to ../images/76989815_PjBIt.png
Downloading https://i.sstatic.net/b8udQ.jpg to ../images/76989815_b8udQ.png
Downloaded https://i.sstatic.net/b8udQ.jpg to ../images/76989815_b8udQ.png
../images/78340160_ZPBKR.png already exists, skippi