# Testing APIs

#### Config

In [60]:
import configparser

# Load configuration
config = configparser.RawConfigParser()
config.read('../config/config.ini')

# Twitter API credentials
API_KEY = config['TWITTER_API']['API_KEY']
API_SECRET = config['TWITTER_API']['API_SECRET']
ACCESS_TOKEN = config['TWITTER_API']['ACCESS_TOKEN']
ACCESS_SECRET = config['TWITTER_API']['ACCESS_SECRET']
BEARER_TOKEN = config['TWITTER_API']['BEARER_TOKEN']

# Azure Event Hub Kafka settings
KAFKA_BROKER = config['AZURE_EVENTHUB']['BROKER']
EVENT_HUB_NAME = config['AZURE_EVENTHUB']['EVENT_HUB_NAME']
KAFKA_SASL_USERNAME = config['AZURE_EVENTHUB']['SASL_USERNAME']
KAFKA_SASL_PASSWORD = config['AZURE_EVENTHUB']['SASL_PASSWORD']

# Reddit API Settings
REDDIT_CLIENT_ID = config['REDDIT_API']['CLIENT_ID']
REDDIT_CLIENT_SECRET = config['REDDIT_API']['CLIENT_SECRET']
REDDIT_USER_AGENT = config['REDDIT_API']['USER_AGENT']
REDDIT_USERNAME = config['REDDIT_API']['USERNAME']
REDDIT_PASSWORD = config['REDDIT_API']['PASSWORD']

print("Config loaded successfully!")

Config loaded successfully!


#### Testing Tweepy

In [61]:
import tweepy
import pandas as pd
import time

In [47]:
auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Test authentication
try:
    api.verify_credentials()
    print("Authentication successful!")
except Exception as e:
    print(f"Error: {e}")


Authentication successful!


In [48]:
client = tweepy.Client(bearer_token=BEARER_TOKEN, consumer_key=API_KEY, consumer_secret=API_SECRET, access_token=ACCESS_TOKEN, access_token_secret=ACCESS_SECRET)
auth = tweepy.OAuth2BearerHandler(BEARER_TOKEN)
api = tweepy.API(auth)
# Search for recent tweets
query = "fashion -is:retweet lang:en"  # Keywords + filter for retweets
response = client.search_recent_tweets(query=query, max_results=10, tweet_fields=["created_at", "text"])

# Parse and store data
data = []
if response.data:
    for tweet in response.data:
        data.append({
            "id": tweet.id,
            "created_at": tweet.created_at,
            "text": tweet.text
        })

# Convert to DataFrame
df = pd.DataFrame(data)
print(df)


                    id                created_at  \
0  1869087265127125142 2024-12-17 18:28:24+00:00   
1  1869087235427172369 2024-12-17 18:28:17+00:00   
2  1869087207174328518 2024-12-17 18:28:10+00:00   
3  1869087182184689829 2024-12-17 18:28:04+00:00   
4  1869087157920600371 2024-12-17 18:27:59+00:00   
5  1869087150886867395 2024-12-17 18:27:57+00:00   
6  1869087145358712856 2024-12-17 18:27:56+00:00   
7  1869087143500648723 2024-12-17 18:27:55+00:00   
8  1869087122965381609 2024-12-17 18:27:50+00:00   
9  1869087119135895917 2024-12-17 18:27:49+00:00   

                                                text  
0  Checkout on this shop on Shopee! Fashion  http...  
1  @Spread_Star @voguebusiness @Khulood_Almani @S...  
2  I paid for express shipping, and Fashion Nova ...  
3  @Ashy_slashee Hmmm... 🤔\n\nIt's a tough one th...  
4  🌟HOT HOT earn more with our crypto pump! $BTC ...  
5  they're stronger than me because if i had the ...  
6  i keep a copy of K&amp;R C in russian a

In [49]:
df['text'][0]

'Checkout on this shop on Shopee! Fashion  https://t.co/XLTOcf7ygW: https://t.co/sygAl8rFQD'

#### Testing Reddit API

In [87]:
import praw
import pandas as pd
import re
# Reddit API credentials
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT,
    redirect_uri="http://localhost",
)

In [71]:
# Subreddit to analyze
subreddit = reddit.subreddit("malefashionadvice")

# Fetch posts from 'hot', 'new', or 'top'
posts = []
for post in subreddit.hot(limit=100):  # Change limit as needed
    posts.append({
        "title": post.title,
        "score": post.score,
        "num_comments": post.num_comments,
        "created_utc": post.created_utc,
        "selftext": post.selftext
    })

# Convert to DataFrame
df = pd.DataFrame(posts)
df.head()

Unnamed: 0,title,score,num_comments,created_utc,selftext
0,WAYWT (What Are You Wearing This Week?) - 23 O...,6,23,1729688000.0,WAYWT = What Are You Wearing This Week (or a d...
1,➡️ Daily Simple Questions ⬅️- Style feedback a...,1,4,1734455000.0,Welcome to the Daily Questions thread for all ...
2,I've held my tongue as long as I can.,405,125,1734420000.0,Some of the questions that get asked here are ...
3,Does anyone know what this suit jacket style i...,2,0,1734460000.0,[https://imgur.com/a/9SDo8Jw](https://imgur.co...
4,Tom Ford Oud Wood - 1.7oz,2,0,1734460000.0,Tom Ford Oud Wood - 1.7oz for $110 from Costco...


In [77]:
df['selftext'][99]

'Looking to get my first proper (ie. not “smart”) watch for nicer outfits. Being both a Swiss design and rail travel aficionado, a Mondaine Swiss Rail clock watch seems perfect for my taste. Any opinions on them? Good quality for the price? Long lasting?'

#### Images in Reddit

In [93]:
def fetch_reddit_posts(subreddit_name, num_posts=20, num_comments=3):
    """
    Fetch posts from a subreddit, including images, galleries, subreddit section, and top comments.

    :param subreddit_name: Name of the subreddit to fetch data from.
    :param num_posts: Number of posts to fetch.
    :param num_comments: Number of top comments to fetch per post.
    :return: DataFrame with Reddit post details (title, content, images, gallery, URL, section, top comments).
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts = []

    # Updated regex to include jpeg, jpg, png, gif, imgur links
    image_regex = r"(https?://(?:i\.)?imgur\.com/[a-zA-Z0-9]+(?:\.jpg|\.jpeg|\.png|\.gif)?|" \
                  r"https?://.*\.(jpg|jpeg|png|gif))"

    # Fetch posts from the 'hot' section
    for submission in subreddit.hot(limit=num_posts):
        post_images = []

        # Extract image URLs from the post URL
        if submission.url:
            if re.search(image_regex, submission.url):
                post_images.append(submission.url)

        # Extract image links from the post's selftext
        if submission.selftext:
            images_in_text = re.findall(image_regex, submission.selftext)
            post_images.extend([img[0] for img in images_in_text])

        # Extract gallery images if the post contains a gallery
        if hasattr(submission, "gallery_data") and submission.gallery_data:
            media_metadata = submission.media_metadata
            for item in submission.gallery_data["items"]:
                media_id = item["media_id"]
                if media_id in media_metadata:
                    img_url = media_metadata[media_id]["s"]["u"]
                    post_images.append(img_url)

        # Remove duplicates in image links
        post_images = list(set(post_images))

        # Fetch top comments
        top_comments = []
        submission.comment_sort = "best"
        submission.comments.replace_more(limit=0)  # Remove "more comments" placeholders
        for comment in submission.comments[:num_comments]:
            if comment.body:
                top_comments.append(comment.body.strip())

        # Store post details
        post_data = {
            "title": submission.title,
            "content": submission.selftext,
            "images": ", ".join(post_images) if post_images else None,
            "url": submission.url,
            "section": "hot",
            "top_comments": " | ".join(top_comments) if top_comments else None
        }
        posts.append(post_data)

    # Convert to a DataFrame
    df = pd.DataFrame(posts)
    return df


In [120]:
subreddit_name = "femalefashionadvice"
num_posts = 20
num_comments = 3

# Call the updated function
reddit_df = fetch_reddit_posts(subreddit_name, num_posts, num_comments)
print(f"Fetched {len(reddit_df)} posts from r/{subreddit_name} (hot section).")

# Display the DataFrame
reddit_df.head()


  submission.comment_sort = "best"


Fetched 20 posts from r/femalefashionadvice (hot section).


Unnamed: 0,title,content,images,url,section,top_comments
0,"Fashion news - December 23, 2024",Here you can share all the fashion related new...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,
1,Weird Little Outfits: An Inspo Album,,https://preview.redd.it/9fdl3z112j9e1.jpg?widt...,https://www.reddit.com/gallery/1hnz49h,hot,Oh this is so so fun. I think I need that yell...
2,Are there any fashion brands you love right no...,sometimes a brand just feels right. i’m curiou...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,[Sea](https://sea-ny.com/pages/pre-fall?_sc=1)...
3,"Daily Questions Thread December 28, 2024",This thread is for individual style questions...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,Hello! I’m looking into a Max Mara coat (weeke...
4,Wearing the clothes vs the clothes wearing you,Hi guys! \n\nWhat are your thoughts on styling...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,I wear a lot of stuff that could be described ...


In [110]:
reddit_df['top_comments'][1]

'https://preview.redd.it/35gpak26go9e1.png?width=1170&format=png&auto=webp&s=0a539a6d8d60246c0636e97e607a982e82ea4f6f\n\nFound a Harris Tweed Balmacaan for $150. This is the tag attached. Seems to be a vintage(?) Sears tag. Does anyone have any idea if this line of clothes are good quality or not?\n\nIt comes with the Harris tweed seal. | who makes the best elevated mens basics nowadays? I\'m trying to buy higher quality garments and looking for a slightly elevated uniqlo. \n\nIt seems to me that many of the brands in this category start specializing but don\'t have the breadth that uniqlo has... (ie, colorful standard for crewnecks, A&F for jeans, ec) | Hello! I am looking for something that has the oversized sweater feel, but issue is that I am a tall (6\'3") broad-shouldered person already. So in the past when I buy sweaters hoping they\'ll fit me in this way they end up usually too short in the body when I\'m actively looking for it to be too long in the body.\n\n\nAny brands you k

Add the image processing

In [123]:
import os
import requests
from PIL import Image
import torch.nn.functional as F
from torchvision import models, transforms
import torch
import pandas as pd

# Define the transform for preprocessing

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize image to match model input size

    transforms.ToTensor(),          # Convert image to tensor

    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # Normalization mean (ImageNet values)

        std=[0.229, 0.224, 0.225]    # Normalization std (ImageNet values)

    )
])


# Assume the model is already loaded
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet18(pretrained=False)
model.fc = torch.nn.Linear(model.fc.in_features, 45)  # Adjust for the number of categories
model.load_state_dict(torch.load("../ImageProcessingTool/notebooks/resnet18_fashion.pth", map_location=device))
model = model.to(device)
model.eval()  # Set to evaluation mode


# Preprocessing function (assume it's defined)
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")  # Ensure 3-channel RGB
    image = transform(image)
    return image.unsqueeze(0)  # Add batch dimension

# Prediction function
def predict_with_confidence(image_path, model, categories):
    # Preprocess the image
    image = preprocess_image(image_path).to(device)
    
    # Perform inference
    with torch.no_grad():
        outputs = model(image)
        probabilities = F.softmax(outputs, dim=1)  # Convert logits to probabilities
        confidence, predicted = torch.max(probabilities, 1)  # Get max confidence and class index
    
    return categories[predicted.item()], confidence.item()

# Helper function to download an image from a URL
def download_image(url, save_dir="images"):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        filename = os.path.join(save_dir, url.split("/")[-1])
        with open(filename, "wb") as file:
            file.write(response.content)
        return filename
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None

# Categories for prediction
categories = ['Topwear', 'Flip Flops', 'Water Bottle', 'Bath and Body', 'Shoes', 'Watches', 'Lips', 'Scarves', 
              'Innerwear', 'Wristbands', 'Skin Care', 'Ties', 'Umbrellas', 'Headwear', 'Loungewear and Nightwear', 
              'Beauty Accessories', 'Eyewear', 'Mufflers', 'Free Gifts', 'Nails', 'Bottomwear', 'Wallets', 'Saree', 
              'Dress', 'Cufflinks', 'Home Furnishing', 'Vouchers', 'Jewellery', 'Apparel Set', 'Perfumes', 'Makeup', 
              'Belts', 'Fragrance', 'Skin', 'Sports Accessories', 'Socks', 'Bags', 'Eyes', 'Shoe Accessories', 
              'Sandal', 'Gloves', 'Stoles', 'Accessories', 'Hair', 'Sports Equipment']

# Process images in the DataFrame
def process_images_in_dataframe(df, image_column="images"):
    processed_results = []

    for index, row in df.iterrows():
        image_urls = row[image_column]
        if pd.isna(image_urls):
            processed_results.append(None)
            continue

        predictions = []
        for url in image_urls.split(", "):  # Handle multiple image URLs
            image_path = download_image(url)
            if image_path:
                try:
                    predicted_category, confidence = predict_with_confidence(image_path, model, categories)
                    predictions.append(f"{predicted_category} ({confidence:.2f})")
                except Exception as e:
                    print(f"Error processing image {image_path}: {e}")
            else:
                predictions.append(None)

        # Combine predictions for all images in the row
        processed_results.append(", ".join(filter(None, predictions)))

    # Add the predictions to a new column in the DataFrame
    df["images_processed"] = processed_results
    return df


  model.load_state_dict(torch.load("../ImageProcessingTool/notebooks/resnet18_fashion.pth", map_location=device))


In [124]:
# Assuming `reddit_df` is your DataFrame with the 'images' column
reddit_df_with_predictions = process_images_in_dataframe(reddit_df, image_column="images")

# Display the updated DataFrame
reddit_df_with_predictions.head()

Unnamed: 0,title,content,images,url,section,top_comments,images_processed
0,"Fashion news - December 23, 2024",Here you can share all the fashion related new...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,,
1,Weird Little Outfits: An Inspo Album,,https://preview.redd.it/9fdl3z112j9e1.jpg?widt...,https://www.reddit.com/gallery/1hnz49h,hot,Oh this is so so fun. I think I need that yell...,"Loungewear and Nightwear (0.59), Loungewear an..."
2,Are there any fashion brands you love right no...,sometimes a brand just feels right. i’m curiou...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,[Sea](https://sea-ny.com/pages/pre-fall?_sc=1)...,
3,"Daily Questions Thread December 28, 2024",This thread is for individual style questions...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,Hello! I’m looking into a Max Mara coat (weeke...,
4,Wearing the clothes vs the clothes wearing you,Hi guys! \n\nWhat are your thoughts on styling...,,https://www.reddit.com/r/femalefashionadvice/c...,hot,I wear a lot of stuff that could be described ...,


In [125]:
reddit_df.to_csv('reddit_data.csv', index=False)
reddit_df_with_predictions.to_csv('reddit_data_with_predictions.csv', index=False)