In [10]:
import json

In [19]:
import boto3

In [67]:
from dataclasses import dataclass
from datetime import datetime

In [20]:
import boto3
import logging
from io import BytesIO

def read_and_parse_s3_file(bucket_name, document_id):
    """
    Read and parse a text file directly from an S3 bucket.

    Parameters:
        bucket_name (str): The name of the S3 bucket.
        document_id (str): The document ID used to construct the object key.

    Returns:
        list: A list of lines from the S3 object, or None if the read or parsing fails.
    """
    
    # Initialize a session using Amazon S3
    s3 = boto3.client('s3')
    
    # Construct the object key (this will depend on your specific use-case)
    object_key = f"{document_id}/cleaned_data.json"  # assuming it's a text file

    try:
        # Read the object into a bytes buffer
        s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
        s3_data = s3_object['Body'].read()

        # Decode bytes buffer to string
        text_content = s3_data.decode('utf-8')

        # Parse the text content into lines
        lines = text_content.split('\n')

        return lines
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        return None

In [30]:
bucket_name = 'insight-articles-content'

In [31]:
document_id = '650ad82bdd0346575c9ab68e'
object_key = f"{document_id}/cleaned_data.json"

In [37]:
s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)

In [38]:
k = json.loads(s3_object['Body'].read())

In [62]:
k.keys()

dict_keys(['meta_data', 'cleaned_text', 'article_images'])

In [63]:
k['meta_data'].keys()

dict_keys(['is_premium_article', 'title', 'short_description', 'published_time', 'last_updated_time', 'tags', 'image_url', 'author'])

In [45]:
k['cleaned_text'].split(' ')[-10:]

['can', 'last', 'for', 'many', 'years,', 'often', 'a', 'decade', 'or', 'more.']

In [68]:
@dataclass
class MetaData:
    is_premium_article: bool
    title: str
    short_description: str
    published_time: datetime

@dataclass
class Article:
    meta_data: MetaData
    cleaned_text: str

In [73]:
filtered_dict = {k1: v for k1, v in k['meta_data'].items() if k1 in MetaData.__annotations__}

In [74]:
meta_data = MetaData(**filtered_dict)

In [78]:
from dataclasses import dataclass, InitVar, field

In [100]:
k['meta_data'].keys()

dict_keys(['is_premium_article', 'title', 'short_description', 'published_time', 'last_updated_time', 'tags', 'image_url', 'author'])

In [111]:
filtered_dict = {k1: v for k1, v in k.items() if k1 in Article.__annotations__}

In [112]:
s = Article(**filtered_dict)

In [115]:
import re

In [117]:
a = None

In [119]:
if 'abc':
    print('d')

d


In [118]:
if not a:
    print('k')

k


In [103]:
@dataclass
class Article:
    cleaned_text: str
    meta_data: InitVar[dict]
    is_premium_article: bool | None = None
    title: str | None = None

    def __post_init__(self, meta_data):
        self.is_premium_article = meta_data.get('is_premium_article')
        self.title = meta_data.get('title')

In [50]:
[x for x in txt.split(' ') if x == '1949']

['1949']

In [52]:
[x for x in k['cleaned_text'].split(' ') if x == 1949]

[]

In [49]:
len(txt.split(' '))

2162

In [47]:
len(k['cleaned_text'].split(' '))

19591

In [33]:
s3_object

{'ResponseMetadata': {'RequestId': 'SY28RSPT1X0VE6KK',
  'HostId': 'NzGZqt5IhCLckaS8D03BnuVexw1QH+Dzgi2MInijHqdnpmgcjYzBMynCbqqn5+Go1f8BV2ikKsheGAz0q0ct1w==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'NzGZqt5IhCLckaS8D03BnuVexw1QH+Dzgi2MInijHqdnpmgcjYzBMynCbqqn5+Go1f8BV2ikKsheGAz0q0ct1w==',
   'x-amz-request-id': 'SY28RSPT1X0VE6KK',
   'date': 'Wed, 20 Sep 2023 12:33:35 GMT',
   'last-modified': 'Wed, 20 Sep 2023 11:32:06 GMT',
   'etag': '"db4e79626a4625cf3a5ed233ea216258"',
   'x-amz-server-side-encryption': 'AES256',
   'accept-ranges': 'bytes',
   'content-type': 'binary/octet-stream',
   'server': 'AmazonS3',
   'content-length': '139478'},
  'RetryAttempts': 1},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2023, 9, 20, 11, 32, 6, tzinfo=tzutc()),
 'ContentLength': 139478,
 'ETag': '"db4e79626a4625cf3a5ed233ea216258"',
 'ContentType': 'binary/octet-stream',
 'ServerSideEncryption': 'AES256',
 'Metadata': {},
 'Body': <botocore.response.StreamingBody

In [None]:
read_and_parse_s3_file(bucket_name='insight-articles-content')

In [11]:
with open('cleaned_data.json', 'rb') as f:
    k = json.load(f)

In [14]:
k['meta_data']['title']

'Ola Electric eyes IPO filing by end-October in expedited listing plan: Report | Mint'

In [18]:
k['cleaned_text'].replace('\'', ''')

SyntaxError: incomplete input (2210104687.py, line 1)

In [132]:
def get_article_data_from_s3(article_id: str):
    # TODO: - move credentials to conf/env variable
    # TODO: - move this to a class perhaps?
    bucket_name = 'insight-articles-content'
    object_key = f"{article_id}/cleaned_data.json"
    s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
    article_json = json.loads(s3_object['Body'].read())
    if validate_article_json(article_json=article_json, article_id=article_id):
        return article_json
    else:
        return None

In [124]:
article_id = '650d15327946741de369232c'

In [125]:
bucket_name = 'insight-articles-content'
object_key = f"{article_id}/cleaned_data.json"
s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)

In [128]:
k = s3_object['Body'].read()

In [135]:
k1 = json.loads(k)

In [2]:
data = {"meta_data": {"is_premium_article": true, "title": "Mint Explainer: Why Trai wants to cut the entry fees for various licences | Mint", "short_description": "The telecom regulator has proposed reducing the entry fees for a number of licences and rationalising bank guarantees that service providers give the government", "published_time": "2023-09-20T16:03:19+05:30", "last_updated_time": "2023-09-20T16:03:19+05:30", "tags": ["trai", "telecom regulator", "telecom licence fees", "india telecom sector", "bank guarantees", "entry fee", "reliance jio", "bharti airtel", "vodafone idea", "bsnl", "department of telecommunications", "telecom licence"], "image_url": "https://www.livemint.com/lm-img/img/2023/09/20/600x338/APHL7BUU-koBE--621x414LiveMint_1695205738297_1695205738486.jpg", "author": "Gulveen Aulakh"}}

NameError: name 'true' is not defined

In [137]:
import requests

In [138]:
k = requests.get('http://insight-user-app-beta-env.eba-rnrpvmin.ap-south-1.elasticbeanstalk.com/article/650d15327946741de369232c')

In [139]:
k2 = json.loads(k.text)

In [141]:
k1.update(k2)

In [142]:
k1

{'meta_data': {'is_premium_article': False,
  'title': 'Coal India share price Today Live Updates : Coal India Stocks Plummet in the Market | Mint',
  'short_description': 'Coal India stock price went down today, 22 Sep 2023, by -1.37 %. The stock closed at 284.25 per share. The stock is currently trading at 280.35 per share. Investors should monitor Coal India stock price closely in the coming days and weeks to see how it reacts to the news.',
  'published_time': '2023-09-22T08:15:27+05:30',
  'last_updated_time': '2023-09-22T09:32:46+05:30',
  'tags': ['Coal India Share Price Live',
   'Coal India Stock Price Live',
   'Coal India Stock',
   'Coal India Share'],
  'image_url': 'https://www.livemint.com/lm-img/img/2023/05/04/600x338/The-Bombay-Stock-Exchange-in-Mumbai---Bloomberg-Ph_1683200468247_1683200494396.jpg',
  'author': 'Livemint'},
 'cleaned_text': "On the last day, Coal India's stock opened at₹285 and closed at₹284.25. The stock had a high of₹286.7 and a low of₹278.75. The m

In [140]:
k2

{'articleId': '650d15327946741de369232c',
 'url': 'https://www.livemint.com/market/live-blog/coal-india-share-price-live-blog-for-22-sep-2023-11695350727833.html',
 'title': 'Coal India share price Today Live Updates : Coal India Stocks Plummet in the Market | Mint',
 'shortDescription': 'Coal India stock price went down today, 22 Sep 2023, by -1.37 %. The stock closed at 284.25 per share. The stock is currently trading at 280.35 per share. Investors should monitor Coal India stock price closely in the coming days and weeks to see how it reacts to the news.',
 'publishedTime': '2023-09-22T08:15:27+05:30',
 'lastUpdatedTime': '2023-09-22T09:32:46+05:30',
 'tags': ['Coal India Share Price Live',
  'Coal India Stock Price Live',
  'Coal India Stock',
  'Coal India Share'],
 'articleImageUrl': 'https://www.livemint.com/lm-img/img/2023/05/04/600x338/The-Bombay-Stock-Exchange-in-Mumbai---Bloomberg-Ph_1683200468247_1683200494396.jpg',
 'category': None,
 'author': 'Livemint',
 'isPremiumArtic

In [143]:
@dataclass
class Article:
    article_id: str
    cleaned_text: str
    is_premium_article: bool
    title: str

    @classmethod
    def from_dict(cls, data: dict):
        key_mapping = {
            'articleId': 'article_id',
            'isPremiumArticle': 'is_premium_article',
            'title': 'title',
            'cleaned_text': 'cleaned_text'
        }
        processed_data = {key_mapping[k]: v for k, v in data.items() if k in key_mapping}

        return cls(**processed_data)

In [145]:
import numpy as np

In [153]:
ss = [np.float32(np.random.rand()) for i in range(1024)]

In [156]:
%timeit [float(x) for x in ss]

19.8 µs ± 13.1 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [144]:
Article.from_dict(k1)

Article(article_id='650d15327946741de369232c', cleaned_text="On the last day, Coal India's stock opened at₹285 and closed at₹284.25. The stock had a high of₹286.7 and a low of₹278.75. The market capitalization of Coal India is currently at₹172,772.09 crores. The 52-week high for the stock is₹288, while the 52-week low is₹207.7. The BSE volume for the stock was 369,943 shares.\nDisclaimer: This is an AI-generated live blog and has not been edited by LiveMint staff.\nThe current data of Coal India stock shows that the stock price is₹280.35. There has been a percent change of -1.37, indicating a decrease in value. The net change is -3.9, suggesting a decrease of₹3.9 in the stock price.\nThe current data of Coal India stock shows that the stock price is₹280.35. There has been a percent change of -1.37, indicating a decrease in value. The net change is -3.9, suggesting a decrease of₹3.9 in the stock price.\nOn the last day of trading forCoal Indiaon the Bombay Stock Exchange (BSE), the volu