In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to the local project packages are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Make the project checkout importable so the ClusterAndPredict package (and
# the other project packages imported below) can be resolved.
module_path = os.path.abspath(
    os.path.join('/home/ec2-user/SageMaker', 'factcheck-model-v2')
)

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to sys.path.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
from ClusterAndPredict.ClusterAndPredict import ClusterAndPredict
from Testing.DataLoader import DataLoader
from Testing.ParameterCreator import ParameterCreator
from Clustering.Helpers.Visualizer import Visualizer

In [None]:
# DEPRECATED
# chroma_client = chromadb.PersistentClient(path="./../../Clustering/Clustering/Chroma")
# # Count number of collections
# print(chroma_client.count_collections())
# 
# # Get all collection names
# collection_names = chroma_client.list_collections()
# 
# # Loop through each collection and drop it
# for collection_name in collection_names:
#     if collection_name.name != 'climate_claims_embeddings_unchanged':
#         chroma_client.delete_collection(collection_name.name)

In [None]:
# Run one ClusterAndPredict experiment per parameter set. For every run this
# collects a flat result record in `results` and the run's cluster DataFrame
# in `cluster_dfs`.
params = ParameterCreator().get_parameters()
results = []
cluster_dfs = []
print("Number of experiments to run: ", len(params))
for param in params:
    percentage = 0.75
    data_loader = DataLoader(percentage, True, param['random_seed'])

    # These two entries only steer data loading; they are removed from `param`
    # so the remaining entries can be forwarded to ClusterAndPredict as kwargs.
    use_only_card = param['use_only_CARD']
    size_of_dataset = param.pop('size_of_dataset')
    param.pop('use_only_CARD')

    # NOTE(review): the default split is always built first and then replaced
    # when `use_only_card` is set — presumably redundant work, but it is kept
    # because DataLoader may carry state between calls; confirm before removing.
    train_df, test_df = data_loader.create_train_test_df(
        False, False, False, True, size_of_dataset
    )
    if use_only_card:
        print("using card")
        train_df, test_df = data_loader.create_train_test_df(
            True, True, True, True, size_of_dataset
        )

    clf = ClusterAndPredict(**param, train_df=train_df)
    texts = test_df['Text'].tolist()
    ratings = test_df['Numerical Rating'].tolist()
    clf.fit(texts, ratings)

    # `best_estimator` is read by later cells, so the alias is kept.
    best_estimator = clf
    score = best_estimator.score([], [])
    # NOTE(review): get_all_performance_metrics() is called three times per
    # run (print, cluster_df lookup, result record); kept as-is in case the
    # call is not a pure getter — confirm.
    print(best_estimator.get_all_performance_metrics())
    object_output = best_estimator.get_all_performance_metrics()
    cluster_df = object_output['cluster_df']
    cluster_dfs.append(cluster_df)

    output = {
        'percentage': percentage,
        'score': score,
        'accuracy': best_estimator.get_accuracy(),
        'was_supervised_umap_used': best_estimator.get_was_supervised(),
        'metrics': best_estimator.get_all_performance_metrics(),
        'size_of_dataset': size_of_dataset,
        'use_only_CARD': use_only_card
    }
    # Record the remaining hyper-parameters under a 'params.' key prefix.
    output.update({'params.' + key: value for key, value in param.items()})

    results.append(output)

In [None]:
# Run the project's Visualizer over the cluster DataFrame of the first
# experiment and display the returned frame as the cell output.
# NOTE(review): 'embeddings' is presumably the name of the column to transform,
# and the variable name suggests a two-dimensional projection — confirm against
# Visualizer.fit_transform.
viz = Visualizer()
df_with_two_dimens = viz.fit_transform(cluster_dfs[0], 'embeddings')
df_with_two_dimens

In [None]:
# File input/output template: download a CSV from S3, write the experiment
# results to a timestamped local CSV, and upload that file back to S3.

from datetime import datetime

import boto3

# Create the S3 client (credentials come from the default boto3 chain)
s3 = boto3.client('s3')

bucket_name = 'sagemaker-us-east-1-390403859474'

s3_file_key = 'processed_file.csv'  # object key of the file inside the bucket

local_file_path = 'downloaded_file.csv'  # local name for the downloaded copy

# Download the file
s3.download_file(bucket_name, s3_file_key, local_file_path)

print(f"File {s3_file_key} has been downloaded to {local_file_path}.")

# `now` and `results_df` were referenced here without ever being defined, so
# this cell raised NameError on a fresh kernel.
# NOTE(review): results_df is built from the `results` records collected by the
# experiment loop — presumably what was intended; confirm.
now = datetime.now()
results_df = pd.DataFrame(results)

# The original f-string contained a stray '$' (JavaScript template syntax) that
# ended up in the file name; a compact timestamp also avoids spaces and colons.
processed_file_path = f"results_{now.strftime('%Y%m%d_%H%M%S')}.csv"
results_df.to_csv(processed_file_path, index=False)

# Destination object key in S3.
# NOTE(review): this is the same key that was downloaded above, so the upload
# overwrites that object — confirm this is intended.
upload_file_key = 'processed_file.csv'

# Upload the file
s3.upload_file(processed_file_path, bucket_name, upload_file_key)

print(f"File {processed_file_path} has been uploaded to s3://{bucket_name}/{upload_file_key}.")

Zapier

In [None]:
# Write the first 100 rows of the last fitted estimator's clusters_df to
# test1.csv (without the index column); the upload cell below reads this file.
(
    best_estimator.clusters_df
    .head(100)
    .to_csv("test1.csv", index=False)
)

In [None]:
import pandas as pd
import os
import boto3
from datetime import datetime
import io
import re
import time

# Upload every row of CSV_FILE_PATH to S3 as its own single-row CSV under the
# "groundtruth/" prefix. A row is skipped when the copy already stored in S3
# has identical content; after every `batch_size` uploads the loop pauses.

# Get access, secret key, and bucket name from environment variables
AWS_ACCESS_KEY = os.environ['AWS_ACCESS_KEY']
AWS_SECRET_KEY = os.environ['AWS_SECRET_KEY']
BUCKET_NAME = os.environ['BUCKET_NAME']

# Data File
CSV_FILE_PATH = 'test1.csv'

# Connect to S3
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY
)

# Create local temp directory that holds the per-claim files before upload
TEMP_DIR = 'temp_files'
os.makedirs(TEMP_DIR, exist_ok=True)

# Read CSV file and get column names (identical for every batch slice)
data = pd.read_csv(CSV_FILE_PATH)
column_names = data.columns

batch_size = 10
total_batches = (len(data) + batch_size - 1) // batch_size  # ceiling division
upload_count = 0

try:
    for i in range(total_batches):

        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(data))
        df = data.iloc[start_idx:end_idx]

        # Create one file per claim
        for index, row in df.iterrows():
            # File name is derived from the first 30 alphanumeric characters
            # of the claim text.
            # NOTE(review): claims sharing that prefix map to the same S3 key
            # and overwrite each other — confirm this is acceptable.
            text = str(df.loc[index, 'text'])
            filtered_text = ''.join(re.findall(r'[A-Za-z0-9]', text))
            sub_name = filtered_text[:30]
            file_name = f"claim_{sub_name}.csv"
            file_path = os.path.join(TEMP_DIR, file_name)

            single_row_df = pd.DataFrame([row], columns=column_names)
            single_row_df.to_csv(file_path, index=False)

            s3_key = f"groundtruth/{file_name}"

            # Check if the file exists in S3
            try:
                s3_client.head_object(Bucket=BUCKET_NAME, Key=s3_key)
                # File exists, download it
                s3_object = s3_client.get_object(Bucket=BUCKET_NAME, Key=s3_key)
                s3_data = pd.read_csv(io.BytesIO(s3_object['Body'].read()))

                # Compare against the local file as re-read from disk, not the
                # in-memory row: the in-memory frame keeps the row's original
                # index label while the S3 frame always has index 0, so
                # DataFrame.equals() reported a difference for almost every
                # row. Reading both sides through read_csv makes index and
                # dtypes comparable.
                local_data = pd.read_csv(file_path)
                if local_data.equals(s3_data):
                    print(f"{file_name} is up-to-date. Skipping upload.")
                    continue
                else:
                    print(f"{file_name} is outdated. Uploading new version.")
            except s3_client.exceptions.ClientError as e:
                if e.response['Error']['Code'] == '404':
                    # File does not exist
                    print(f"{file_name} does not exist in S3. Uploading new file.")
                else:
                    raise

            # Upload to S3
            s3_client.upload_file(
                Filename=file_path,
                Bucket=BUCKET_NAME,
                Key=s3_key  # S3 path
            )
            print(f"Uploaded {file_name} to S3.")
            upload_count += 1

            # Pause after every `batch_size` uploads.
            # NOTE(review): presumably throttling for a downstream consumer of
            # the bucket — confirm the 180 s interval is still required.
            if upload_count == batch_size:
                print("Rest for 180s...")
                time.sleep(180)
                upload_count = 0
finally:
    # Delete temp files even when an upload fails part-way through
    for file in os.listdir(TEMP_DIR):
        os.remove(os.path.join(TEMP_DIR, file))
    os.rmdir(TEMP_DIR)

print("All files uploaded successfully!")

In [None]:
import boto3
import pandas as pd
import gzip
import json

# Download a gzip-compressed JSON-lines object from S3, flatten it into a
# DataFrame and save it as GoogleFactCheckData.csv.

# Get S3 bucket and object key from environment variables
bucket = os.environ['BUCKET_S3']
bucket_key = os.environ['BUCKET_S3_KEY']


# Use the boto3 client
s3 = boto3.client('s3')

# Download the file locally. The object key is held in `bucket_key`; the
# original call passed an undefined name `key`, which raised NameError.
s3.download_file(bucket, bucket_key, 'data.json.gz')

# Decompress and read the data line by line; each line is parsed as one JSON
# document
data = []
with gzip.open('data.json.gz', 'rt') as f:
    for line in f:
        data.append(json.loads(line))

# Flatten the parsed records into a pandas DataFrame
df = pd.json_normalize(data)

df.to_csv("GoogleFactCheckData.csv")

# Inspect the data
print(df.head())
