In [145]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [2]:
!pip install langdetect
from langdetect import detect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=6bbb9b7f4a23b9e15cf8505d1e46419928c9916e43ccd5afdb9f498d5df161bb
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [127]:
!pip install openai
from openai import OpenAI
import openai



In [149]:
class Dataset:
    """Turn a CSV of timestamped texts into one concatenated text per hour.

    The methods below read the columns ``time``, ``Text`` and ``Link`` from the
    loaded frame. Typical use::

        Dataset(path).group_hour_data().fill_missing_hours()

    None of the methods modify ``self.df`` or frames passed in as arguments;
    results are returned or stored in ``self.processed_data`` /
    ``self.before_group``.
    """

    def __init__(self,df_path,verbose = False):
        """Load the raw CSV.

        Args:
            df_path: Path (or buffer) handed to ``pandas.read_csv``.
            verbose: When true, progress messages are printed by the methods.
        """
        self.df = pd.read_csv(df_path)
        self.verbose = verbose

    def group_three_words(self):
        """Drop empty texts and texts sharing their first three words.

        Line breaks inside each text are replaced by single spaces first.
        Missing texts (NaN/None) are treated as empty and therefore dropped.
        Of several rows whose first three whitespace-separated words are
        equal, only the first one is kept.

        Returns:
            A new DataFrame (independent copy) with the surviving rows and an
            extra ``first_three_words`` column holding a list of up to three
            words.
        """
        # fillna/astype guard against NaN or non-string cells, which would
        # otherwise raise AttributeError on ``splitlines``.
        flat_text = (
            self.df['Text']
            .fillna('')
            .astype(str)
            .map(lambda raw: ' '.join(raw.splitlines()))
        )
        # ``assign`` builds a new frame, so ``self.df`` stays as loaded.
        frame = self.df.assign(Text=flat_text)
        if(self.verbose):
            print("Grouping the 1st three words in each text.......")
        frame['first_three_words'] = flat_text.map(lambda text: text.split()[0:3])
        non_empty = frame.loc[(frame['Text'] != '').to_numpy()]
        # Lists are unhashable, so duplicates are detected on a tuple key
        # rather than on the list column itself.
        dedup_key = non_empty['first_three_words'].map(tuple)
        keep_mask = ~dedup_key.duplicated(keep='first')
        first_three_df = non_empty.loc[keep_mask.to_numpy()].copy()
        if(self.verbose):
            print("Done with Grouping the 1st three words in each text!!!")
        return first_three_df

    def extract_eng_text(self,df):
        """Keep only the rows whose ``Text`` is detected as English.

        Args:
            df: DataFrame with at least ``time``, ``Text`` and ``Link``
                columns. It is not modified.

        Returns:
            A new DataFrame restricted to the columns ``time``, ``Text`` and
            ``Link`` for the rows where ``detect`` reports ``'en'``.
        """
        def detectLang(text):
            # Any failure of the detector counts as "not English"; catching
            # Exception (not a bare except) lets KeyboardInterrupt through.
            try:
                return detect(text) == 'en'
            except Exception:
                return False

        if(self.verbose):
            print("Checking for english text........")
        # astype(bool) keeps the mask boolean even for an empty frame.
        is_english = df['Text'].apply(detectLang).astype(bool)
        english_rows = df.loc[is_english.to_numpy(), ['time','Text','Link']].copy()
        if(self.verbose):
            print("None English text removed!!!!")
        return english_rows

    def group_hour_data(self):
        """Run the whole cleaning pipeline and aggregate the texts per hour.

        Steps: de-duplicate (``group_three_words``), keep English rows
        (``extract_eng_text``), strip URLs, parse ``time``, sort by time and
        join all texts that fall on the same date and hour with a space.

        Side effects:
            ``self.before_group`` is set to the cleaned, sorted, un-aggregated
            frame; ``self.processed_data`` is set to the aggregated frame with
            the columns ``time`` (date), ``hour_data`` and ``Text``.

        Returns:
            ``self``, so calls can be chained.
        """
        url_pattern = re.compile(r'https?://\S+|www\.\S+')

        three_words = self.group_three_words()
        cleaned = self.extract_eng_text(three_words)
        cleaned['Text'] = cleaned['Text'].map(lambda text: url_pattern.sub('', text))
        cleaned['time'] = pd.to_datetime(cleaned['time'])
        cleaned = cleaned.sort_values(by="time",na_position="first")
        if(self.verbose):
            print("Strating the process of grouping the text by hour.......")
        cleaned['hour_data'] = cleaned['time'].dt.hour
        self.before_group = cleaned
        self.processed_data = (
            cleaned
            .groupby([cleaned['time'].dt.date, cleaned['hour_data']])['Text']
            .agg(' '.join)
            .reset_index()
        )
        if(self.verbose):
            print("Done with preprocessing!!!")

        return self

    def fill_missing_hours(self, start='2023-01-01', end='2023-01-31 23:00:00'):
        """Expand the hourly table to every hour between ``start`` and ``end``.

        Hours without any text inherit the text of the closest earlier hour
        (forward fill). Hours before the first available text have nothing to
        inherit and keep a missing ``Text``.

        Args:
            start: First hour of the range (anything ``pandas.date_range``
                accepts). Defaults to the beginning of January 2023.
            end: Last hour of the range, inclusive. Defaults to the last hour
                of January 2023.

        Returns:
            DataFrame with the columns ``full_time`` and ``Text``, one row per
            hour of the range. ``self.processed_data`` is left unchanged.
        """
        # Work on a copy so repeated calls always start from the same state.
        grouped_data = self.processed_data.copy()
        grouped_data['time'] = pd.to_datetime(grouped_data['time'])
        grouped_data['hour_data'] = grouped_data['hour_data'].astype(int)
        grouped_data['full_time'] = grouped_data['time'] + pd.to_timedelta(grouped_data['hour_data'], unit='h')
        date_range = pd.date_range(start=start, end=end, freq='h')
        all_hours_df = pd.DataFrame(date_range, columns=['full_time'])
        merged_df = pd.merge(all_hours_df[['full_time']], grouped_data[['full_time', 'Text']], on='full_time', how='left')
        merged_df['Text'] = merged_df['Text'].ffill()
        return merged_df

In [150]:
# Load the raw January-2023 CSV, aggregate its texts per hour, then expand
# the result to a gap-free hourly table.
df_path = "/kaggle/input/jan-2023-raw/jan_2023.csv"
df = Dataset(df_path, verbose=True)
df.group_hour_data()  # returns the same object; called for its side effects
final_data = df.fill_missing_hours()

Grouping the 1st three words in each text.......
Done with Grouping the 1st three words in each text!!!
Checking for english text........
None English text removed!!!!
Strating the process of grouping the text by hour.......
Done with preprocessing!!!


In [148]:
# Persist the hourly table (row index included) to the Kaggle working directory.
final_data.to_csv(path_or_buf="/kaggle/working/2023_jan(3)")

In [None]:
class Openai:
    """Summarise the ``Text`` column of a DataFrame with an OpenAI chat model."""

    def __init__(self,dataset,api_key=None):
        """Store the data to summarise.

        Args:
            dataset: DataFrame with a ``Text`` column.
            api_key: Optional OpenAI API key. When omitted, the OpenAI client
                falls back to the ``OPENAI_API_KEY`` environment variable, so
                no secret has to be written into the notebook.
        """
        self.data = dataset
        self.api_key = api_key

    def get_text_summary(self,start,end):
        """Summarise the rows ``start:end`` (positional, end exclusive).

        Args:
            start: Position of the first row to summarise.
            end: Position one past the last row to summarise.

        Returns:
            A new DataFrame holding the selected rows (the previous index is
            kept as an ``index`` column) plus a ``Summarized_text`` column.
            Rows whose API request failed get ``None`` there instead of a
            summary, so they can be found and retried afterwards.
        """
        processed_data = self.data.iloc[start:end].reset_index()
        client = OpenAI(api_key=self.api_key)
        model = "gpt-3.5-turbo"
        content = "You are a very good text summarizer for bitcoin related news helping in accurate price prediction in 150 words"

        summaries = []
        for i, text in enumerate(tqdm(processed_data['Text'])):
            try:
                completion = client.chat.completions.create(
                    model=model,
                    messages=[
                        {"role": "system", "content": content},
                        {"role": "user", "content": str(text)},
                    ],
                )
            except Exception as exc:
                # Record the failure explicitly. Falling through would either
                # raise NameError (first row) or store the previous row's
                # summary for this row.
                print(f"Row {i}: summarisation failed ({exc!r})")
                summaries.append(None)
                continue
            summaries.append(str(completion.choices[0].message.content))
        processed_data['Summarized_text'] = summaries
        return processed_data

In [156]:
# Summarise rows 100..199 of the hourly table (end position is exclusive).
oa = Openai(final_data)
processed_data = oa.get_text_summary(start=100, end=200)

100%|██████████| 100/100 [32:36<00:00, 19.56s/it]


In [158]:
# Save the summarised slice (row index included) to the Kaggle working directory.
processed_data.to_csv(path_or_buf="/kaggle/working/2023_jan_summarized_100.csv")

In [None]:
from openai import OpenAI
import openai
# Shared OpenAI client used by the embedding helper below.
# No key is written into the notebook: when `api_key` is not passed, the
# OpenAI client reads it from the OPENAI_API_KEY environment variable and
# raises a descriptive error if that variable is not set.
client = OpenAI()

In [None]:
# import pandas as pd
# train_df = pd.read_csv("/kaggle/input/train-test-2021-jul/text_train.csv")
# test_df = pd.read_csv("/kaggle/input/train-test-2021-jul/text_val.csv")

In [166]:
import time
def get_embedding(text, model="text-embedding-3-small", delay=20):
    """Return the embedding vector of ``text`` from the OpenAI embeddings API.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request.
        delay: Seconds to sleep before the request, as a crude throttle
            between consecutive calls. Defaults to the previous fixed 20
            seconds; pass 0 to disable.

    Returns:
        The ``embedding`` of the first (and only) item in the API response.
    """
    if delay:
        time.sleep(delay)
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [162]:
# Embed every summary of the 100-row slice, then save the frame without its index.
processed_data['ada_embedding'] = processed_data['Summarized_text'].map(
    lambda summary: get_embedding(summary, model='text-embedding-3-small')
)
processed_data.to_csv(path_or_buf='/kaggle/working/jan_2023_embedded_100.csv', index=False)

In [167]:
# Reload the summaries CSV from the Kaggle working directory.
processed_data2 = pd.read_csv(filepath_or_buffer="/kaggle/working/2023_jan_summarized.csv")

In [168]:
# Embed every reloaded summary, then save the frame without its index.
processed_data2['ada_embedding'] = processed_data2['Summarized_text'].map(
    lambda summary: get_embedding(summary, model='text-embedding-3-small')
)
processed_data2.to_csv(path_or_buf='/kaggle/working/jan_2023_embedded.csv', index=False)