In [51]:
import pandas as pd

file_name = "June2023-June2024"
df = pd.read_csv(f'../data_stackoverflow/{file_name}.csv')

print(df.columns)

Index(['Post Link', 'Title', 'CreationDate', 'Score', 'ViewCount',
       'AnswerCount', 'CommentCount', 'FavoriteCount', 'Tags', 'Body',
       'ClosedDate', 'CodeText', 'ImageURLs'],
      dtype='object')


In [52]:
# Only keep the columns we need
# needed_columns = ["Id", "Score", "ViewCount", "Body"]
needed_columns = ["Post Link", "Score", "ViewCount", "Body", "ImageURLs"]
df = df[needed_columns]
# Change column Post Link to Id
df = df.rename(columns={"Post Link": "Id"})
df = df.rename(columns={"ImageURLs": "ImageUrl"})

# Sort by view count
# df = df.sort_values(by='Score', ascending=False)

print(df.head(5))

         Id  Score  ViewCount  \
0  78689590     -1         29   
1  78689138     -2         35   
2  78687535      0         48   
3  78687311      2         78   
4  78687130      0         62   

                                                Body  \
0  My code:\nimport pyodbc \n\nserver = 'DESKTOP-...   
1  I have tried many things and even tried moving...   
2  .output-container {\n            background: #...   
3  So, i'm doing a little project with rust and i...   
4  I'm going to create next layout and convert it...   

                                            ImageUrl  
0             ['https://i.sstatic.net/z7vbOP5n.png']  
1  ['https://i.sstatic.net/UmhZ9zHE.png', 'https:...  
2             ['https://i.sstatic.net/V0RSm2ft.png']  
3  ['https://i.sstatic.net/pz53bNkf.png', 'https:...  
4  ['https://i.sstatic.net/mBPkBxDs.jpg', 'https:...  


In [53]:
# Extract image urls from the body
import re

def extract_image_url(body):
    # Find all image tags
    image_tags = re.findall(r'<img.*?src="(.*?)"', body)
    if len(image_tags) > 0:
        # Only keep the ones with https inside
        image_tags = [tag for tag in image_tags if "https" in tag]
        return image_tags
    return None


def extract_image_url_list(image_url):
    if pd.isnull(image_url):
        return None
    # Split the string by the comma
    image_tags = image_url.replace("'", "")[1:-1].split(',')
    if len(image_tags) > 0:
        return image_tags
    return None

def extract_alt_text_image_url(body):
    # Find all image tags
    image_tags = re.findall(r'<img.*?alt="(.*?)"', body)
    # Go through the list and remove empty strings, "alt text" or "enter image description here"
    image_tags = list(map(str.strip, image_tags))
    # If the tag is empty or is "alt text" or "enter image description here", set an empty string
    image_tags = [tag if tag != "" and tag.lower() != "alt text" and tag.lower() != "nan" and tag.lower() != "enter image description here" else "" for tag in image_tags]
    if len(image_tags) > 0:
        return image_tags
    return None


# Go in the body, and replace the images with "IMAGE_{i}"
def replace_images(body):
    # Find all image tags and replace the whole tag by "<p>IMAGE_{i}</p>"
    image_tags = re.findall(r'<img.*?>', body)
    for i, tag in enumerate(image_tags):
        body = body.replace(tag, f'<p>IMAGE_{i}</p>')
    return body


def cleanup_body(body):
    # Only what is inside the contents of p tags
    body = re.findall(r'<p>(.*?)</p>', body)
    # Remove any other tags and its content
    body = [re.sub(r'<.*?>', '', x) for x in body]
    # Remove empty strings
    body = list(filter(lambda x: x != "", body))
    # Join the list of strings
    body = " ".join(body).replace('"', "'")
    return body

# Remove \n from the body
df['Body'] = df['Body'].apply(lambda x: x.replace('\n', ' '))
# df['ImageUrl'] = df['Body'].apply(extract_image_url)
df['ImageUrl'] = df['ImageUrl'].apply(extract_image_url_list)
df['AltText'] = df['Body'].apply(extract_alt_text_image_url)
df['NewBody'] = df['Body'].apply(replace_images).apply(cleanup_body)

In [54]:
# Remove the body column
df = df.drop(columns=['Body'])
df = df.dropna(subset=['ImageUrl'])

print(df.head(5))

         Id  Score  ViewCount  \
0  78689590     -1         29   
1  78689138     -2         35   
2  78687535      0         48   
3  78687311      2         78   
4  78687130      0         62   

                                            ImageUrl AltText  \
0               [https://i.sstatic.net/z7vbOP5n.png]    None   
1  [https://i.sstatic.net/UmhZ9zHE.png,  https://...    None   
2               [https://i.sstatic.net/V0RSm2ft.png]    None   
3  [https://i.sstatic.net/pz53bNkf.png,  https://...    None   
4  [https://i.sstatic.net/mBPkBxDs.jpg,  https://...    None   

                         NewBody  
0                                 
1                                 
2                                 
3                                 
4  10 20 30 40 10 20 30 40 50 60  


In [55]:
# Dump the data to a new csv
df.to_csv(f'../data_stackoverflow/{file_name}_CleanedUp.csv', index=False)

In [56]:
# Download the images
import requests
import os

def download_image(url, filename):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(response.content)
                return True
    except Exception as e:
        print(f'Failed to download {url}: {e}')
    return False

# Create a directory to store the images
# os.makedirs('../images', exist_ok=True)

amount = len(df);
# Take a random sample of images
count = 0

for index, row in df.iterrows():
    if (count == amount):
        break
    image_url = row['ImageUrl']
    for url in image_url:
        if url:
            cleaned_url = re.split("\.\w+", url.split('/')[-1])[0]
            if url.endswith('.gif'):
                print(f'{url} is a gif, skipping')
                continue
            filename = f'./images/{file_name}/{row["Id"]}_{cleaned_url}.png'
            # If the image exists, skip
            if os.path.exists(filename):
                print(f'{filename} already exists, skipping')
                continue
            if download_image(url, filename):
                print(f'Downloaded {url} to {filename}')
                count += 1
            else:
                print(f'Failed to download {url}')

./images/June2023-June2024/78689590_z7vbOP5n.png already exists, skipping
./images/June2023-June2024/78689138_UmhZ9zHE.png already exists, skipping
./images/June2023-June2024/78689138_tCbdO86y.png already exists, skipping
./images/June2023-June2024/78687535_V0RSm2ft.png already exists, skipping
./images/June2023-June2024/78687311_pz53bNkf.png already exists, skipping
./images/June2023-June2024/78687311_YjPWCwgx.png already exists, skipping
./images/June2023-June2024/78687311_gwBrQwEI.png already exists, skipping
./images/June2023-June2024/78687311_tCwuE71y.png already exists, skipping
./images/June2023-June2024/78687130_mBPkBxDs.png already exists, skipping
./images/June2023-June2024/78687130_Z4iey9vm.png already exists, skipping
./images/June2023-June2024/78686959_4h7JKXeL.png already exists, skipping
./images/June2023-June2024/78686193_ZEXBaSmS.png already exists, skipping
./images/June2023-June2024/78685634_65KblAeB.png already exists, skipping
./images/June2023-June2024/78685623_AJ

  cleaned_url = re.split("\.\w+", url.split('/')[-1])[0]


Failed to download https://i.imgur.com/kLi0VVk.png
Failed to download  https://i.imgur.com/rxykjoL.png
./images/June2023-June2024/78493161_c3DIWcgY.png already exists, skipping
./images/June2023-June2024/78492098_e8PVHbpv.png already exists, skipping
./images/June2023-June2024/78491523_itkVYDDj.png already exists, skipping
./images/June2023-June2024/78491299_3fPwoulD.png already exists, skipping
./images/June2023-June2024/78490376_AJ99ORr8.png already exists, skipping
./images/June2023-June2024/78489434_kZTyRcSb.png already exists, skipping
./images/June2023-June2024/78488998_tWAZXQyf.png already exists, skipping
./images/June2023-June2024/78488998_polhLOfg.png already exists, skipping
./images/June2023-June2024/78488998_MYkOMtpB.png already exists, skipping
./images/June2023-June2024/78488413_WxK8Jt5w.png already exists, skipping
./images/June2023-June2024/78487793_TMkrsNhJ.png already exists, skipping
./images/June2023-June2024/78487752_82V3DVcT.png already exists, skipping
./images/