In [16]:
from datetime import datetime

# Chuyển đổi Unix timestamp sang định dạng ngày/giờ
def convert_unix_to_readable(unix_timestamp):
    return datetime.utcfromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M:%S')

# Ví dụ với dấu thời gian 1696896002.0 và 1696897202.0
timestamp1 = 1701301802.0
timestamp2 = 1690848002.0

readable_time1 = convert_unix_to_readable(timestamp1)
readable_time2 = convert_unix_to_readable(timestamp2)

print(readable_time1)  # In thời gian dễ hiểu của timestamp1
print(readable_time2)  # In thời gian dễ hiểu của timestamp2


2023-11-29 23:50:02
2023-08-01 00:00:02


Chuyển đổi file JSON sang CSV

In [1]:
# Đọc và phân tích cấu trúc file JSON:

import json

# Đọc file JSON
file_path = '../dataset/final_merged_data.json'
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Hiển thị cấu trúc của file JSON
json_data.keys(), type(json_data), {key: type(json_data[key]) for key in json_data.keys()}


(dict_keys(['schedule', 'oa_temp', 'room_temp', 'slab_temp', 'dew_temp', 'fan_stat', 'room_sp_value', 'room_sp_range', 'slab_sp_value', 'slab_sp_range', 'thermal_comfort_data', 'issues_count', 'potential_savings']),
 dict,
 {'schedule': dict,
  'oa_temp': dict,
  'room_temp': dict,
  'slab_temp': dict,
  'dew_temp': dict,
  'fan_stat': dict,
  'room_sp_value': str,
  'room_sp_range': str,
  'slab_sp_value': str,
  'slab_sp_range': str,
  'thermal_comfort_data': dict,
  'issues_count': dict,
  'potential_savings': dict})

In [2]:
# Kiểm tra cấu trúc của một trong các dictionary con:

example_key = 'schedule'
json_data[example_key]

{'Monday': ['08:00', '17:00'],
 'Tuesday': ['08:00', '17:00'],
 'Wednesday': ['08:00', '17:00'],
 'Thursday': ['08:00', '17:00'],
 'Friday': ['08:00', '17:00'],
 'Saturday': ['08:00', '17:00'],
 'Sunday': ['08:00', '17:00'],
 'Public Holiday': ['08:00', '17:00']}

In [3]:
# Chuẩn bị hàm để làm phẳng dữ liệu JSON:

def flatten_json(json_obj):
    flattened_data = {}
    for key, value in json_obj.items():
        if isinstance(value, dict):
            for sub_key, sub_value in value.items():
                new_key = f'{key}_{sub_key}'
                flattened_data[new_key] = sub_value
        else:
            flattened_data[key] = value
    return flattened_data

flattened_json_data = flatten_json(json_data)
list(flattened_json_data.items())[:5]


[('schedule_Monday', ['08:00', '17:00']),
 ('schedule_Tuesday', ['08:00', '17:00']),
 ('schedule_Wednesday', ['08:00', '17:00']),
 ('schedule_Thursday', ['08:00', '17:00']),
 ('schedule_Friday', ['08:00', '17:00'])]

In [4]:
# Chuyển đổi dữ liệu JSON làm phẳng sang định dạng DataFrame và chuẩn bị cho CSV:

import pandas as pd

rows = []
for key, value in flattened_json_data.items():
    if isinstance(value, dict):
        for timestamp, timestamp_value in value.items():
            row = next((row for row in rows if row.get('timestamp') == timestamp), None)
            if row is None:
                row = {'timestamp': timestamp}
                rows.append(row)
            row[key] = timestamp_value
    else:
        for row in rows:
            row[key] = value

df = pd.DataFrame(rows)
df[200:220]


KeyboardInterrupt: 

In [19]:
parquet_file_path = '../dataset/converted_data.parquet'
df.to_parquet(parquet_file_path, index=False)

In [3]:
import json
import pandas as pd

def process_json(file_path):
    # Đọc file JSON
    with open(file_path, 'r') as file:
        json_data = json.load(file)

    # Danh sách các keys không cần xử lý
    ignored_keys = ['schedule', 'thermal_comfort_data', 'issues_count', 'potential_savings']

    # Tạo danh sách DataFrame tạm thời
    dfs = []

    # Duyệt qua từng key trong dữ liệu JSON
    for key, value in json_data.items():
        if key in ignored_keys or not isinstance(value, dict):
            continue

        # Tạo DataFrame từ từng phần của JSON
        for subkey, subvalue in value.items():
            if isinstance(subvalue, dict):
                temp_df = pd.DataFrame.from_dict(subvalue, orient='index')
                temp_df.index.name = 'timestamp'
                temp_df.reset_index(inplace=True)
                temp_df.columns = ['timestamp', f'{key}_{subkey}']
                dfs.append(temp_df)

    # Hợp nhất các DataFrame tạm thời dựa trên cột timestamp
    final_df = dfs[0]
    for df in dfs[1:]:
        final_df = pd.merge(final_df, df, on='timestamp', how='outer')

    return final_df

# Áp dụng cho file data_prepare_1.json
file_path_1 = '../dataset/data_prepare_2.json'
df1 = process_json(file_path_1)

# Áp dụng cho file data_prepare_8.json
file_path_8 = '../dataset/data_prepare_8.json'
df8 = process_json(file_path_8)

# Hiển thị DataFrame đầu tiên để kiểm tra
df1.head()


Unnamed: 0,timestamp,room_temp_Mf-1-1,room_temp_Mf-1-2,room_temp_Mf-1-3,room_temp_Mf-2-1-1,room_temp_Mf-2-1-2,room_temp_Mf-2-1-3,room_temp_Mf-2-1-4,room_temp_Ac-2-1,slab_temp_Plant,...,dew_temp_Mf-2-1,dew_temp_Ac-2-2,dew_temp_Ac-2-3,dew_temp_Ac-2-4,dew_temp_Ac-2-5,fan_stat_Ac-2-1,fan_stat_Ac-2-2,fan_stat_Ac-2-3,fan_stat_Ac-2-4,fan_stat_Ac-2-5
0,1682899448.0,16.1,16.2,16.8,17.9,18.2,17.1,16.4,16.0,16.1,...,7.6,7.6,7.6,7.6,7.6,0.0,0.0,0.0,0.0,0.0
1,1682900648.0,16.1,16.2,16.8,17.9,18.3,17.0,16.4,16.0,16.1,...,7.6,7.6,7.6,7.6,7.6,0.0,0.0,0.0,0.0,0.0
2,1682901848.0,16.1,16.1,16.7,17.9,18.1,17.0,16.3,15.9,16.0,...,7.5,7.5,7.5,7.5,7.5,0.0,0.0,1.0,1.0,0.0
3,1682903048.0,16.0,16.1,16.7,17.8,18.1,16.9,16.2,15.8,16.0,...,7.5,7.5,7.5,7.5,7.5,0.0,0.0,0.0,1.0,0.0
4,1682904248.0,16.0,16.1,16.6,17.8,18.1,16.9,16.1,15.8,16.0,...,7.5,7.5,7.5,7.5,7.5,0.0,0.0,0.0,0.0,0.0


In [4]:
df8.head()

Unnamed: 0,timestamp,room_temp_Ac-2-1,room_temp_Mf-2-1-4,room_temp_Mf-2-1-3,room_temp_Mf-1-2,room_temp_Mf-2-1-2,room_temp_Mf-1-1,room_temp_Mf-2-1-1,room_temp_Mf-1-3,slab_temp_Zone-1,...,dew_temp_Mf-2-1,dew_temp_Ac-2-2,dew_temp_Ac-2-3,dew_temp_Ac-2-4,dew_temp_Ac-2-5,fan_stat_Ac-2-1,fan_stat_Ac-2-2,fan_stat_Ac-2-3,fan_stat_Ac-2-4,fan_stat_Ac-2-5
0,1701389348.0,19.8,19.5,20.4,20.0,21.5,19.3,21.3,19.8,20.1,...,12.5,12.5,12.5,12.5,12.5,0.0,0.0,0.0,0.0,0.0
1,1701390548.0,19.8,19.4,20.4,19.9,21.5,19.3,21.2,19.7,20.0,...,12.5,12.5,12.5,12.5,12.5,0.0,0.0,0.0,0.0,0.0
2,1701391748.0,19.7,19.4,20.4,19.8,21.4,19.2,21.2,19.7,20.0,...,12.5,12.5,12.5,12.5,12.5,0.0,0.0,0.0,0.0,0.0
3,1701392948.0,19.6,19.3,20.3,19.8,21.4,19.2,21.2,19.6,20.0,...,12.5,12.5,12.5,12.5,12.5,0.0,0.0,0.0,0.0,0.0
4,1701394148.0,19.6,19.3,20.2,19.7,21.4,19.2,21.1,19.6,19.9,...,12.7,12.7,12.7,12.7,12.7,0.0,0.0,0.0,0.0,0.0


In [5]:
parquet_file_path = '../dataset/data_prepare_2.parquet'
df1.to_parquet(parquet_file_path, index=False)

In [6]:
parquet_file_path = '../dataset/data_prepare_8.parquet'
df8.to_parquet(parquet_file_path, index=False)

Code merge file JSON

In [None]:
def merge_dicts_simple(d1, d2):
    """
    Merge two dictionaries without expanding values into lists. Merge only at the first level of each key.
    """
    merged = dict(d1)  # Start with the keys and values of the first dictionary
    for key, value in d2.items():
        if key in merged and isinstance(value, dict) and isinstance(merged[key], dict):
            # Merge dictionaries at the next level
            merged[key] = merge_dicts_simple(merged[key], value)
        else:
            # Overwrite or add the key-value pair
            merged[key] = value
    return merged



In [None]:
# Load the additional files
file_paths = [
    '/mnt/data/data_prepare_2.json',
    '/mnt/data/data_prepare_3.json',
    '/mnt/data/data_prepare_4.json',
    '/mnt/data/data_prepare_5.json',
    '/mnt/data/data_prepare_6.json',
    '/mnt/data/data_prepare_7.json',
    '/mnt/data/data_prepare_8.json'
]

additional_data = []

for path in file_paths:
    with open(path, 'r') as file:
        additional_data.append(json.load(file))

# Merge each of these datasets into the already merged data
for data in additional_data:
    merged_data_simple = merge_dicts_simple(merged_data_simple, data)

# Save the newly merged data to a file
final_merged_file_path = '/mnt/data/final_merged_data.json'
with open(final_merged_file_path, 'w') as file:
    json.dump(merged_data_simple, file, indent=2)

final_merged_file_path

Summarizer Test

In [8]:
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """ Once upon a time, there was a small, curious field mouse named Pip. Pip lived in a burrow in a vast, open field. Every night, Pip would scurry out and look up at the twinkling stars and the radiant moon. One day, Pip decided that he was going to find a way to reach the moon. He knew he was small, but he believed he could make it if he found the right help.

The next day, Pip came across Charlie, the rabbit, hopping around in the field. Charlie was certainly taller than Pip, so he asked, "Charlie, you're so tall! How far are you from the moon?"

Charlie laughed, "Well, Pip, I may be taller than you, but I'm nowhere near tall enough to reach the moon!"

Unfazed, Pip asked Charlie if he could try standing on his head to gauge the height. Chuckling, Charlie bent down, allowing Pip to scamper onto his fluffy ears. Yet, even standing on top of Charlie's head, the moon still seemed so far away.

Feeling slightly disappointed but not defeated, Pip thanked Charlie and continued his journey. Next, he met Oliver, the deer. With his large antlers reaching high into the sky, Oliver was certainly taller than Charlie.

"Oliver," Pip asked, "how far is your reach to the moon?"

"Although my antlers are tall," Oliver replied, "they're not nearly tall enough to touch the moon."

Eager to try, Pip asked if he could stand on Oliver's antlers. Carefully, Oliver bent down, and Pip, with a bit of effort, climbed atop his antlers. But still, the moon remained out of Pip's reach.

Determined, Pip moved on and stumbled upon Gary, the giraffe. With his long neck and high reach, Gary was the tallest animal Pip had ever seen.

"Gary," said Pip, panting slightly, "how far are you from the moon?"

Gary smiled down at Pip, "I may be the tallest here, Pip, but even I cannot reach the moon."

Despite this, Pip was eager to try. With Gary's help, Pip climbed up and sat on top of Gary's head. The view was breathtaking, but yet again, the moon was far from his reach.

Pip was starting to feel disheartened, but he was not ready to give up. That night, he sat in the field, looking at the moon, wondering how he could get there.

Just then, an old wise owl named Otto flew down beside him. "Pip, why the long face?" he asked.

"I want to touch the moon," Pip confessed. "But no matter how high I go, it's still so far away."

Otto chuckled softly. "Dear Pip, the moon is not something you can reach by height. It's far, far away, higher than any animal can go. But you know, Pip, each night when you look at the moon and marvel at its beauty, you're touching it with your eyes and your heart."

Pip was silent for a while, looking thoughtfully at the moon. Then, he smiled, realizing that he had been closer to the moon than he thought.

From that night forward, Pip knew that even though he couldn't physically touch the moon, he could feel its magic. His dream of reaching the moon had taken him on a great adventure, where he'd met new friends and seen things from heights he'd never imagined.

Pip may have been a small field mouse, but his dreams were as big and bright as the moon itself.

"""
print(summarizer(ARTICLE, max_length=200, min_length=30, do_sample=False))


[{'summary_text': 'A small, curious field mouse named Pip wanted to reach the moon. He decided that he could if he found the right help. Pip met Charlie, the rabbit, and Oliver, the deer. Gary, the giraffe, was the tallest animal Pip had ever seen. Otto, the owl, told Pip that the moon is higher than any animal can go.'}]


In [52]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/long-t5-tglobal-base")
model = LongT5ForConditionalGeneration.from_pretrained("google/long-t5-tglobal-base")

# Đoạn văn bản cần tóm tắt
text = """Describe the following data: At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:29:07 the room temperature was 19.8 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room temperature was 19.8 At time 2023/03/28-00:09:07 the room temperature was 19.8 At time 2023/03/28-00:29:08 the room temperature was 19.7 At time 2023/03/28-00:49:07 the room temperature was 19.7 At time 2023/03/28-01:09:07 the room temperature was 19.6 At time 2023/03/28-01:29:07 the room temperature was 19.6 At time 2023/03/28-01:49:07 the room temperature was 19.6 At time 2023/03/28-02:09:07 the room temperature was 19.6 At time 2023/03/28-02:29:08 the room temperature was 19.5 At time 2023/03/28-02:49:07 the room temperature was 19.5 At time 2023/03/28-03:09:07 the room temperature was 19.5 At time 2023/03/28-03:29:07 the room temperature was 19.5.
"""

# Mã hóa đoạn văn bản
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

# Tạo ra tóm tắt
summary_ids = model.generate(inputs.input_ids, max_length=350, min_length=100, length_penalty=2.0, num_beams=4, early_stopping=True)

# Giải mã kết quả để nhận được văn bản tóm tắt
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


Describe the following data: At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room temperature was 19.8 At time 2023/03/28-00:09:07 the room temperature was 19.8 At time 2023/03/28-00:29:08 the room temperature was 19.7 At time 2023/03/28-00:49:07 the room temperature was 19.7 At time 2023/03/28-01:09:07 the room temperature was 19.6 At time 2023/03/28-01:49:07 the room temperature was 19.6 At time 2023/03/28-02:09:07 the room temperature was 19.6 At time 2023/03/28-02:29:08 the room temperature was 19.5 At time 2023/03/28-02:49:07 the room temperature was 19.5 At time 2023/03/28-03:09:07 the room temper

In [53]:
from transformers import pipeline

# Khởi tạo pipeline tóm tắt với mô hình BART
# Test các model: FLAN-T5, CNN_Falcon_7b
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Đoạn văn bản cần tóm tắt
article = """ Describe the following data: At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:29:07 the room temperature was 19.8 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room temperature was 19.8 At time 2023/03/28-00:09:07 the room temperature was 19.8 At time 2023/03/28-00:29:08 the room temperature was 19.7 At time 2023/03/28-00:49:07 the room temperature was 19.7 At time 2023/03/28-01:09:07 the room temperature was 19.6 At time 2023/03/28-01:29:07 the room temperature was 19.6 At time 2023/03/28-01:49:07 the room temperature was 19.6 At time 2023/03/28-02:09:07 the room temperature was 19.6 At time 2023/03/28-02:29:08 the room temperature was 19.5 At time 2023/03/28-02:49:07 the room temperature was 19.5 At time 2023/03/28-03:09:07 the room temperature was 19.5 At time 2023/03/28-03:29:07 the room temperature was 19.5.
"""

# Tóm tắt văn bản
summary = summarizer(article, max_length=250, min_length=100, do_sample=False)

# In ra tóm tắt
print(summary[0]['summary_text'])


At time 2023/03/27-21:34:59 the room temperature was 20.0. At time 2024/02/08/09/10/11/12/13/14/15/16/17/18/19/20/21/22/23/24/25/26/27/28/29/30/31/2/3/4/5/6/7/8/9/10, 11/12, 12/13, 13/14, 14/15, 15/16, 16/17, 17/18, 18/19, 19/20, 19.3, 19, 20, 21/22, 22/23, 23/24, 24/25, 25/26, 26/27, 27/28, 28/29, 29/30, 30/31, 31/32, 32/33, 33/34, 34/36, 36/37, 37/38, 38/39, 39/40, 40/41, 41/42, 42/43, 43/44, 44/45, 45/46, 46/47, 47/48, 48/49,


In [None]:
import pandas as pd
from orion import Orion
from transformers import GPT3Tokenizer, GPT3Model

# Bước 1: Tải dữ liệu
data = pd.read_csv("data.csv")

# Bước 2: Phát hiện bất thường
# Lưu ý: Orion cần được cấu hình chính xác theo nhu cầu của dữ liệu
orion = Orion(pipeline='lstm_dynamic_threshold')
anomalies = orion.fit_detect(data)

# Bước 3: Chuẩn bị dữ liệu cho mô hình GPT-3
# Chuyển DataFrame thành numpy array
data_np = data.to_numpy()

# Tokenize dữ liệu (đối với mô hình LLM như GPT-3)
tokenizer = GPT3Tokenizer.from_pretrained("gpt3")
input_ids = tokenizer(data_np.tolist(), return_tensors='pt', padding=True, truncation=True)['input_ids']

# Bước 4: Sử dụng mô hình GPT-3 để tạo báo cáo
# Lưu ý: GPT-3 thường được sử dụng qua API, ở đây chỉ là ví dụ về cách load mô hình
model = GPT3Model.from_pretrained("gpt3")
outputs = model.generate(input_ids)
report = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(report)


In [51]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

inputs = tokenizer("""Describe the following data of room temperature with a summary of the trends, patterns, and any notable observations: Describe the following data: Timestamp|2023/03/27-21:34:59 room_temperature|20.0 Timestamp|2023/03/27-21:43:07 room_temperature|19.9 Timestamp|2023/03/27-21:49:07 room_temperature|19.9 Timestamp|2023/03/27-22:09:07 room_temperature|19.9 Timestamp|2023/03/27-22:29:07 room_temperature|19.8 Timestamp|2023/03/27-22:49:07 room_temperature|19.8 Timestamp|2023/03/27-23:09:07 room_temperature|19.8 Timestamp|2023/03/27-23:29:07 room_temperature|19.8 Timestamp|2023/03/27-23:49:07 room_temperature|19.8 Timestamp|2023/03/28-00:09:07 room_temperature|19.8 Timestamp|2023/03/28-00:29:08 room_temperature|19.7 Timestamp|2023/03/28-00:49:07 room_temperature|19.7 Timestamp|2023/03/28-01:09:07 room_temperature|19.6 Timestamp|2023/03/28-01:29:07 room_temperature|19.6 Timestamp|2023/03/28-01:49:07 room_temperature|19.6 Timestamp|2023/03/28-02:09:07 room_temperature|19.6 Timestamp|2023/03/28-02:29:08 room_temperature|19.5 Timestamp|2023/03/28-02:49:07 room_temperature|19.5 Timestamp|2023/03/28-03:09:07 room_temperature|19.5 Timestamp|2023/03/28-03:29:07 room_temperature|19.5.
""", return_tensors="pt")

# Sinh đầu ra với độ dài tối đa được chỉ định
max_length = 512  # Ví dụ, tăng giá trị này để có đầu ra dài hơn
min_length = 50   # Đặt một giới hạn dài tối thiểu
length_penalty = 2.0 # Tăng giá trị này để khuyến khích đầu ra dài hơn

outputs = model.generate(**inputs, max_length=max_length, min_length=min_length, length_penalty=length_penalty)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['The room temperature was 20.0 degrees Fahrenheit. The time stamps were 20:34:59 and 21:34:59 respectively. The time stamps were 20:43:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:49:07 respectively. The time stamps were 20:43:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21:43:07 respectively. The time stamps were 20:49:07 and 21

In [55]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

input_text = """ 
Describe the following data: At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:29:07 the room temperature was 19.8 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room temperature was 19.8 At time 2023/03/28-00:09:07 the room temperature was 19.8 At time 2023/03/28-00:29:08 the room temperature was 19.7 At time 2023/03/28-00:49:07 the room temperature was 19.7 At time 2023/03/28-01:09:07 the room temperature was 19.6 At time 2023/03/28-01:29:07 the room temperature was 19.6 At time 2023/03/28-01:49:07 the room temperature was 19.6 At time 2023/03/28-02:09:07 the room temperature was 19.6 At time 2023/03/28-02:29:08 the room temperature was 19.5 At time 2023/03/28-02:49:07 the room temperature was 19.5 At time 2023/03/28-03:09:07 the room temperature was 19.5 At time 2023/03/28-03:29:07 the room temperature was 19.5.
"""
inputs = tokenizer(input_text, return_tensors="pt")

# Sinh đầu ra với độ dài tối đa được chỉ định
max_length = 512  # Ví dụ, tăng giá trị này để có đầu ra dài hơn
min_length = 100   # Đặt một giới hạn dài tối thiểu
length_penalty = 4.0 # Tăng giá trị này để khuyến khích đầu ra dài hơn

outputs = model.generate(**inputs, max_length=max_length, min_length=min_length)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['At time 2023/03/28-03:29:07 the room temperature was 19.5, at time 2023/03/28-00:09:07 the room temperature was 19.8, at time 2023/03/28-00:29:08 the room temperature was 19.7, at time 2023/03/28-00:49:07 the room temperature was 19.7, at time 2023/03/28-01:09:07 the room temperature was 19.6, at time 2023/03/28-01:29:07 the room temperature was 19.6, at time 2023/03/28-01:49:07 the room temperature was 19.6, at time 2023/03/28-02:09:07 the room temperature was 19.6, at time 2023/03/28-02:29:08 the room temperature was 19.6, at time 2023/03/28-03:09:07 the room temperature was 19.5, at time 2023/03/28-03:29:07 the room temperature was 19.5,']


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Prepare your input text
input_text = """
Generate a report analyzing the following time-series data. Each entry consists of a timestamp and a corresponding measurement value. 
The data is as follows: At timestamp 1690910402.0, the value was 20.5. At timestamp 1690911602.0, the value was 20.0. At timestamp 1690912802.0, the value was 20.0. At timestamp 1690914002.0, the value was 20.0. At timestamp 1690915203.0, the value was 19.5. At timestamp 1690916403.0, the value was 19.5. At timestamp 1690917602.0, the value was 19.5. At timestamp 1690918802.0, the value was 19.0. At timestamp 1690920002.0, the value was 19.0. At timestamp 1690921202.0, the value was 19.0. At timestamp 1690922402.0, the value was 19.0. At timestamp 1690923602.0, the value was 19.0. At timestamp 1690924802.0, the value was 18.5. At timestamp 1690926002.0, the value was 18.5. At timestamp 1690927202.0, the value was 18.0. At timestamp 1690928402.0, the value was 18.0. At timestamp 1690929602.0, the value was 18.0. At timestamp 1690930802.0, the value was 18.0. At timestamp 1690932002.0, the value was 18.0. At timestamp 1690933202.0, the value was 17.5. At timestamp 1690934402.0, the value was 17.5. At timestamp 1690935602.0, the value was 17.5. At timestamp 1690936802.0, the value was 17.5. At timestamp 1690938002.0, the value was 17.0. At timestamp 1690939203.0, the value was 17.0. At timestamp 1690940402.0, the value was 17.0. At timestamp 1690941602.0, the value was 17.0. 
Conclude the report with a summary of the trends, patterns, and any notable observations.
"""

# Encode the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate a response
outputs = model.generate(input_ids)

# Decode and print the response
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:   0%|          | 0/2 [01:01<?, ?it/s]


KeyboardInterrupt: 

In [32]:
from transformers import pipeline

summarizer = pipeline("summarization", model="Falconsai/text_summarization")

ARTICLE = """ 
At time 2023/03/27-21:34:59 the room temperature was 19.4. At time 2023/03/27-21:43:07 the room temperature was 19.4. At time 2023/03/27-21:49:07 the room temperature was 19.4. At time 2023/03/27-22:09:07 the room temperature was 19.3. At time 2023/03/27-22:29:07 the room temperature was 19.3. At time 2023/03/27-22:49:07 the room temperature was 19.3. At time 2023/03/27-23:09:07 the room temperature was 19.4. At time 2023/03/27-23:29:07 the room temperature was 19.4. At time 2023/03/27-23:49:07 the room temperature was 19.4. At time 2023/03/28-00:09:07 the room temperature was 19.3. At time 2023/03/28-00:29:08 the room temperature was 19.3. At time 2023/03/28-00:49:07 the room temperature was 19.3. At time 2023/03/28-01:09:07 the room temperature was 19.3. At time 2023/03/28-01:29:07 the room temperature was 19.3. At time 2023/03/28-01:49:07 the room temperature was 19.2. At time 2023/03/28-02:09:07 the room temperature was 19.2. At time 2023/03/28-02:29:08 the room temperature was 19.2. At time 2023/03/28-02:49:07 the room temperature was 19.2. At time 2023/03/28-03:09:07 the room temperature was 19.2. At time 2023/03/28-03:29:07 the room temperature was 19.2.
"""
print(summarizer(ARTICLE, max_length=512, min_length=30, do_sample=False))


Your max_length is set to 512, but your input_length is only 354. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=177)


[{'summary_text': 'At time 2023/03/28-02:09:07 the room temperature was 19.2 . At time 1923/03/27-03:29:07 at time 192/03/28-04:09 . The room temperature is 19.2 at time 2022/03/28-01:29.08 the room was 19.2. At time . 2023/28-03 :09,07 the . room temperature had 19.2 by 2023 . By 2023/05/28-05:29 .'}]


In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("allenai/PRIMERA")
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/PRIMERA")

# Đoạn văn bản cần tóm tắt
text = """At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:29:07 the room temperature was 19.8 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room temperature was 19.8 At time 2023/03/28-00:09:07 the room temperature was 19.8 At time 2023/03/28-00:29:08 the room temperature was 19.7 At time 2023/03/28-00:49:07 the room temperature was 19.7 At time 2023/03/28-01:09:07 the room temperature was 19.6 At time 2023/03/28-01:29:07 the room temperature was 19.6 At time 2023/03/28-01:49:07 the room temperature was 19.6 At time 2023/03/28-02:09:07 the room temperature was 19.6 At time 2023/03/28-02:29:08 the room temperature was 19.5 At time 2023/03/28-02:49:07 the room temperature was 19.5 At time 2023/03/28-03:09:07 the room temperature was 19.5 At time 2023/03/28-03:29:07 the room temperature was 19.5.
"""

# Mã hóa đoạn văn bản
inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

# Tạo ra tóm tắt
summary_ids = model.generate(inputs.input_ids, max_length=500, min_length=100, length_penalty=2.0, num_beams=4, early_stopping=True)

# Giải mã kết quả để nhận được văn bản tóm tắt
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 20th/March/27 -21:43:07 the room temp was 19.9 At time 2017-03-27-22:09:09The room temperature is 19.5% of the world’s population, according to the World Health Organization (WHO) 2017 World Population Survey (WPS) 2017-02-27The world population is expected to grow by 1.4 billion people over the next 10 years, with the majority of the population living in urbanised areas.The U.S. is projected to grow at a rate of 1.6% per year for the next decade, with urbanization accounting for more than half of the global population by 2050.According to the WHO, the world population will grow by 2.5 billion people by 2050, and urbanization will be the largest contributor to global population growth.The WPS estimates that the average annual global temperature will increase to 20.5 degrees Celsius by 2023, and the average global greenhouse gas emissions (GHG) will fall to 0.8g per year.The global GHG emissions are projected to fall to 1

In [43]:
from transformers import MvpTokenizer, MvpForConditionalGeneration

tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mtl-data-to-text")

inputs = tokenizer(
    """ Describe the following data: Timestamp|2023/03/27-21:34:59 room_temperature|20.0 Timestamp|2023/03/27-21:43:07 room_temperature|19.9 Timestamp|2023/03/27-21:49:07 room_temperature|19.9 Timestamp|2023/03/27-22:09:07 room_temperature|19.9 Timestamp|2023/03/27-22:29:07 room_temperature|19.8 Timestamp|2023/03/27-22:49:07 room_temperature|19.8 Timestamp|2023/03/27-23:09:07 room_temperature|19.8 Timestamp|2023/03/27-23:29:07 room_temperature|19.8 Timestamp|2023/03/27-23:49:07 room_temperature|19.8 Timestamp|2023/03/28-00:09:07 room_temperature|19.8 Timestamp|2023/03/28-00:29:08 room_temperature|19.7 Timestamp|2023/03/28-00:49:07 room_temperature|19.7 Timestamp|2023/03/28-01:09:07 room_temperature|19.6 Timestamp|2023/03/28-01:29:07 room_temperature|19.6 Timestamp|2023/03/28-01:49:07 room_temperature|19.6 Timestamp|2023/03/28-02:09:07 room_temperature|19.6 Timestamp|2023/03/28-02:29:08 room_temperature|19.5 Timestamp|2023/03/28-02:49:07 room_temperature|19.5 Timestamp|2023/03/28-03:09:07 room_temperature|19.5 Timestamp|2023/03/28-03:29:07 room_temperature|19.5""",
    return_tensors="pt",
)
generated_ids = model.generate(**inputs, max_length=512, min_length=100)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

['Room Temperature range from 20.0 °C to 20.9 °C Timestamp from 2023/03/27-21:34:59 Room temperature range from 19.9 to 19.5 °C timestamped from 20 to 21:43:07 Room Temperature range between 19.8 °C and 19.6 °C TIMESTAMP from 20 23 March to 21 March Room Temperature ranges from 19°C to 19°F Timestamped between 20.8°C and 20.6°C']

In [7]:
import pandas as pd
from datetime import datetime

# Bước 1: Đọc dữ liệu từ file Excel
file_path = '../dataset/Book1.xlsx'
df = pd.read_excel(file_path)
df = df.fillna(-1)

# Bước 2: Chuyển đổi timestamp thành dạng ngày giờ có thể đọc được
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s').dt.strftime('%Y/%m/%d-%H:%M:%S')

# Bước 3: Định dạng mỗi dòng dữ liệu
formatted_data_new = df.apply(lambda x: f"Timestamp|{x['timestamp']}|x|line_chart room_temperature_Ac-2-1|{x['room_temp_Ac-2-1']}|y|line_chart", axis=1)

# Bước 4: Gộp tất cả dữ liệu thành một dòng duy nhất, cách nhau bởi dấu cách
single_line_format = ' '.join(formatted_data_new)

# In ra kết quả (chỉ một phần để kiểm tra)
print(single_line_format[:500])  # In ra 500 ký tự đầu tiên để kiểm tra


Timestamp|2023/03/27-21:34:59|x|line_chart room_temperature_Ac-2-1|20.0|y|line_chart Timestamp|2023/03/27-21:43:07|x|line_chart room_temperature_Ac-2-1|19.9|y|line_chart Timestamp|2023/03/27-21:49:07|x|line_chart room_temperature_Ac-2-1|19.9|y|line_chart Timestamp|2023/03/27-22:09:07|x|line_chart room_temperature_Ac-2-1|19.9|y|line_chart Timestamp|2023/03/27-22:29:07|x|line_chart room_temperature_Ac-2-1|19.8|y|line_chart Timestamp|2023/03/27-22:49:07|x|line_chart room_temperature_Ac-2-1|19.8|y|l


In [8]:
# File path for the output text file
output_file_path = '../dataset/formatted_room_temperature_data.txt'

# Writing the single line formatted data to a text file
with open(output_file_path, 'w') as file:
    file.write(single_line_format)

output_file_path

'../dataset/formatted_room_temperature_data.txt'

In [40]:
import pandas as pd
from datetime import datetime

# Bước 1: Đọc dữ liệu từ file Excel
file_path = '../dataset/Book2.xlsx'
df = pd.read_excel(file_path)
df = df.fillna(-1)

# Bước 2: Chuyển đổi timestamp thành dạng ngày giờ có thể đọc được
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s').dt.strftime('%Y/%m/%d-%H:%M:%S')

# Bước 3: Định dạng mỗi dòng dữ liệu
formatted_data_new = df.apply(lambda x: f"At time {x['timestamp']} the room temperature was {x['room_temp_Ac-2-1']}", axis=1)

# Bước 4: Gộp tất cả dữ liệu thành một dòng duy nhất, cách nhau bởi dấu cách
single_line_format = ' '.join(formatted_data_new)

# In ra kết quả (chỉ một phần để kiểm tra)
print(single_line_format[:500])  # In ra 500 ký tự đầu tiên để kiểm tra


At time 2023/03/27-21:34:59 the room temperature was 20.0 At time 2023/03/27-21:43:07 the room temperature was 19.9 At time 2023/03/27-21:49:07 the room temperature was 19.9 At time 2023/03/27-22:09:07 the room temperature was 19.9 At time 2023/03/27-22:29:07 the room temperature was 19.8 At time 2023/03/27-22:49:07 the room temperature was 19.8 At time 2023/03/27-23:09:07 the room temperature was 19.8 At time 2023/03/27-23:29:07 the room temperature was 19.8 At time 2023/03/27-23:49:07 the room


In [41]:
# File path for the output text file
output_file_path = '../dataset/formatted_data.txt'

# Writing the single line formatted data to a text file
with open(output_file_path, 'w') as file:
    file.write(single_line_format)

output_file_path

'../dataset/formatted_data.txt'

Code chuyển đổi file sang CSV cho Bart và T5 Train, Validation, Test

In [84]:
import pandas as pd

# File paths
title_file = '../Chart2Text/data/train/trainTitle.txt'
data_file = '../Chart2Text/data/train/trainData.txt'
summary_file = '../Chart2Text/data/train/trainOriginalSummary.txt'

# Load the first 50 lines of each file
def read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    lines = [line.strip() for line in lines]
    return pd.DataFrame(lines, columns=['data'])

# Đọc dữ liệu từ các file
title_df = read_txt_file(title_file)
data_df = read_txt_file(data_file)
summary_df = read_txt_file(summary_file)

# Ghép cột title và data
combined_data = title_df['data'] + ": " + data_df['data']

# Tạo DataFrame mới từ cột data ghép và summary
combined_df = pd.DataFrame({
    'data': combined_data,
    'summary': summary_df['data']
})


combined_df.head()


Unnamed: 0,data,summary
0,Global spending on motorsports sponsorships 20...,This statistic shows the worldwide spending fo...
1,Fixed broadband internet subscription rate 201...,This statistic illustrates the fixed broadband...
2,Audi - operating profit 2002 to 2018: Year|201...,This statistic represents Audi 's operating pr...
3,Amazon Web Services : TTM revenue 2014 to 2019...,The statistic illustrates the TTM revenue of A...
4,Food retail sales growth in the United Kingdom...,"In 2016 , data forecast expected retail sales ..."


In [90]:
import pandas as pd

# File paths
title_file = '../Chart2Text/data/valid/validTitle.txt'
data_file = '../Chart2Text/data/valid/validData.txt'
summary_file = '../Chart2Text/data/valid/validOriginalSummary.txt'

# Load the first 50 lines of each file
def read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    lines = [line.strip() for line in lines]
    return pd.DataFrame(lines, columns=['data'])

# Đọc dữ liệu từ các file
title_df = read_txt_file(title_file)
data_df = read_txt_file(data_file)
summary_df = read_txt_file(summary_file)

# Ghép cột title và data
combined_data = title_df['data'] + ": " + data_df['data']

# Tạo DataFrame mới từ cột data ghép và summary
combined_df = pd.DataFrame({
    'data': combined_data,
    'summary': summary_df['data']
})


combined_df.head()


Unnamed: 0,data,summary
0,Light truck sales in the United States 1980 to...,Light truck retail sales in the United States ...
1,Global TV audience/viewership of Olympic Winte...,The statistic shows the global audience of the...
2,Netflix : net income 2000 to 2019: Year|2019|x...,Video streaming giant Netflix had a total net ...
3,Google : quarterly net income 2003 to 2015: Fi...,This timeline shows Google 's quarterly net in...
4,Consumer spending in the United Kingdom ( UK )...,This statistic shows total domestic consumptio...


In [92]:
import pandas as pd

# File paths
title_file = '../Chart2Text/data/test/testTitle.txt'
data_file = '../Chart2Text/data/test/testData.txt'
summary_file = '../Chart2Text/data/test/testOriginalSummary.txt'

def read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    lines = [line.strip() for line in lines]
    return pd.DataFrame(lines, columns=['data'])

# Đọc dữ liệu từ các file
title_df = read_txt_file(title_file)
data_df = read_txt_file(data_file)
# summary_df = read_txt_file(summary_file)

# Ghép cột title và data
combined_data = title_df['data'] + ": " + data_df['data']

# Tạo DataFrame mới từ cột data ghép và summary
combined_df = pd.DataFrame({
    'data': combined_data,
    'summary': summary_df['data']
})


combined_df.head()


Unnamed: 0,data,summary
0,Countries with the lowest fertility rate globa...,Light truck retail sales in the United States ...
1,Expenditure of affluent U.S. households on fee...,The statistic shows the global audience of the...
2,Quarterly average daily rate in hotels in New ...,Video streaming giant Netflix had a total net ...
3,Population age structure in metropolitan areas...,This timeline shows Google 's quarterly net in...
4,Average retail price for white sugar in Canada...,This statistic shows total domestic consumptio...


In [93]:
csv_file_path = '../dataset/bart-t5/combined_data_test_bart_t5.csv'
combined_df.to_csv(csv_file_path, index=False)

csv_file_path

'../dataset/bart-t5/combined_data_test_bart_t5.csv'